Prometheus告警媒介 钉钉
创建钉钉群
添加钉钉群机器人
secret: SECc95134129e043e4be06df4d5aa2afdef066a6d361ac73da97bc7220618cfa9da (加签密钥,此处为示例值,请替换为自己机器人的密钥,真实密钥不要公开)
Webhook: https://oapi.dingtalk.com/robot/send?access_token=e7f8aac8fc2705064b28ae1b6d1a6d0dfc53974e0dc98423384b637c9ebe4498 (此处为示例值,请替换为自己机器人的Webhook地址,access_token不要公开)
安装钉钉告警插件
[root@alertmanager ~]# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@alertmanager ~]# ls
prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@alertmanager ~]# tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@alertmanager ~]# ls
prometheus-webhook-dingtalk-2.1.0.linux-amd64
[root@alertmanager ~]# mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 /usr/local/src/prometheus-webhook-dingtalk
[root@alertmanager ~]# ls /usr/local/src/
alertmanager prometheus-webhook-dingtalk
[root@alertmanager ~]# mv /usr/local/src/prometheus-webhook-dingtalk/config.example.yml /usr/local/src/prometheus-webhook-dingtalk/config.yml
[root@alertmanager ~]# ls /usr/local/src/prometheus-webhook-dingtalk/
config.yml contrib LICENSE prometheus-webhook-dingtalk
注册为系统服务
[root@alertmanager prometheus-webhook-dingtalk]# vim /usr/lib/systemd/system/prometheus-webhook-dingtalk.service
[root@alertmanager prometheus-webhook-dingtalk]# cat > /usr/lib/systemd/system/prometheus-webhook-dingtalk.service << EOF
[Service]
ExecStart=/usr/local/src/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/src/prometheus-webhook-dingtalk/config.yml
[Install]
WantedBy=multi-user.target
[Unit]
Description=prometheus-webhook-dingtalk
After=network.target
EOF
重载/开机自启/启动/查看状态
systemctl daemon-reload
systemctl enable prometheus-webhook-dingtalk
systemctl start prometheus-webhook-dingtalk
systemctl status prometheus-webhook-dingtalk
配置钉钉告警插件与钉钉机器人集成
[root@alertmanager ~]# cd /usr/local/src/prometheus-webhook-dingtalk/
[root@alertmanager prometheus-webhook-dingtalk]# pwd
/usr/local/src/prometheus-webhook-dingtalk
[root@alertmanager prometheus-webhook-dingtalk]# ls
config.yml contrib LICENSE prometheus-webhook-dingtalk
[root@alertmanager prometheus-webhook-dingtalk]# vim config.yml
[root@alertmanager prometheus-webhook-dingtalk]# cat config.yml
## Request timeout
# timeout: 5s
## Uncomment following line in order to write template from scratch (be careful!)
#no_builtin_template: true
## Customizable templates path
#templates:
# - contrib/templates/legacy/template.tmpl
templates:
- /usr/local/src/alertmanager/dingtalk.tmpl
## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
#default_message:
# title: '{{ template "legacy.title" . }}'
# text: '{{ template "legacy.content" . }}'
## Targets, previously was known as "profiles"
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=e7f8aac8fc2705064b28ae1b6d1a6d0dfc53974e0dc98423384b637c9ebe4498
# secret for signature
secret: SECc95134129e043e4be06df4d5aa2afdef066a6d361ac73da97bc7220618cfa9da
# Customize template content
message:
# Use legacy template
title: '{{ template "ops.title" . }}'
text: '{{ template "ops.content" . }}'
说明(行尾的中文为注释说明,不属于配置文件内容):
## Request timeout
# timeout: 5s
## Uncomment following line in order to write template from scratch (be careful!)
#no_builtin_template: true
## Customizable templates path
#templates:
# - contrib/templates/legacy/template.tmpl
templates:
- /usr/local/src/alertmanager/dingtalk.tmpl 自定义告警模板文件及位置
## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
#default_message:
# title: '{{ template "legacy.title" . }}'
# text: '{{ template "legacy.content" . }}'
## Targets, previously was known as "profiles"
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=e7f8aac8fc2705064b28ae1b6d1a6d0dfc53974e0dc98423384b637c9ebe4498 配置钉钉机器人webhook_url
# secret for signature
secret: SECc95134129e043e4be06df4d5aa2afdef066a6d361ac73da97bc7220618cfa9da 配置加签SECRET
# Customize template content
message:
# Use legacy template
title: '{{ template "ops.title" . }}' 添加模板标题,在下面的模板文件title位置
text: '{{ template "ops.content" . }}' 添加模板内容,在下面的模板文件content位置
为alertmanager配置告警模板文件
[root@alertmanager ~]# vim /usr/local/src/alertmanager/dingtalk.tmpl
[root@alertmanager ~]# cat /usr/local/src/alertmanager/dingtalk.tmpl
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{ define "__alert_list" }}{{ range . }}
---
**zqf 告警类型**: {{ .Labels.alertname }}
**zqf 告警级别**: {{ .Labels.level }}
**zqf 故障主机**: {{ .Labels.instance }}
**zqf 告警信息**: {{ .Annotations.description }}
**zqf 触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "__resolved_list" }}{{ range . }}
---
**zqf 告警类型**: {{ .Labels.alertname }}
**zqf 告警级别**: {{ .Labels.level }}
**zqf 故障主机**: {{ .Labels.instance }}
**zqf 触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**zqf 恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====zqf 侦测到{{ .Alerts.Firing | len }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**====zqf 恢复{{ .Alerts.Resolved | len }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}
修改Alertmanager配置文件添加钉钉告警渠道
[root@alertmanager ~]# ss -anput | grep ":8060"
tcp LISTEN 0 4096 [::]:8060 [::]:* users:(("prometheus-webh",pid=9313,fd=3))
[root@alertmanager ~]# vim /usr/local/src/alertmanager/alertmanager.yml
[root@alertmanager ~]# cat /usr/local/src/alertmanager/alertmanager.yml
global:
resolve_timeout: 3m
templates:
- '/usr/local/src/alertmanager/dingtalk.tmpl'
route:
group_by: ['env','instance','type','group','job','alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: dingtalk_webhook1
routes:
- receiver: dingtalk_webhook1
group_wait: 30s
match_re:
severity: "critical|warning"
receivers:
- name: 'dingtalk_webhook1'
webhook_configs:
- url: 'http://192.168.10.173:8060/dingtalk/webhook1/send'
send_resolved: true
说明如下(行尾的中文为注释说明,不属于配置文件内容;此处省略了 routes 子路由部分):
global:
resolve_timeout: 3m
templates:
- '/usr/local/src/alertmanager/dingtalk.tmpl' 告警模板位置
route:
group_by: ['env','instance','type','group','job','alertname'] 按这些标签对告警进行分组,标签值都相同的告警会合并为一组
group_wait: 30s 分组内第一个告警等待时间,30s内如有第二个告警会合并一个告警
group_interval: 5m 同一分组已发送过通知后,有新告警加入该分组时再次发送通知的间隔时间
repeat_interval: 1h 重复告警间隔发送时间,如果没处理过1h再次发送
receiver: dingtalk_webhook1 默认告警接收器,需与下方receivers中的name一致
receivers:
- name: 'dingtalk_webhook1' 告警接收器名称
webhook_configs:
- url: 'http://192.168.10.173:8060/dingtalk/webhook1/send' 访问webhook1 url
send_resolved: true 在恢复后是否发送恢复消息给接收人
邮件告警的配置文件:
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.126.com:25'
smtp_from: 'nextgo@126.com'
smtp_auth_username: 'nextgo@126.com'
smtp_auth_password: 'RXGFEHFQCLXAMFTP'
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 5m
repeat_interval: 1m
receiver: 'mail'
receivers:
- name: 'mail'
email_configs:
- to: 'nextgo@126.com'
关于alertmanager.yml文件的解释如下:
global: #全局设置,配置解决告警时间间隔和邮件发送服务
resolve_timeout: 5m # 定义持续多长时间未接收到告警标记后,就将告警状态标记为resolved
smtp_smarthost: 'smtp.126.com:25' # 邮件服务器
smtp_from: 'nextgo@126.com' # 告警发送邮箱
smtp_auth_username: 'nextgo@126.com' # 邮箱名
smtp_auth_password: 'RXGFEHFQCLXAMFTP' # 邮箱认证使用授权码
smtp_require_tls: false # 是否启动tls
route: # 路由树,每个告警都会在配置的顶级路由中进入路由树,路由树匹配所有报警规则
group_by: ['alertname'] # 按这些标签对告警进行分组,标签值相同的告警合并为一组
group_wait: 10s # 新分组首次发送通知前的等待时间,用于把同组告警合并到一条通知中
group_interval: 5m # 同一分组已发送过通知后,有新告警加入该分组时再次发送通知的间隔时间
repeat_interval: 1m # 重复发送告警的间隔时间,默认为4h,现修改为1分钟
receiver: 'mail' # 默认接收器名称,需与下方 receivers 中的 name 一致
receivers: # 告警接收器,这里配置接收邮箱地址。
- name: 'mail' # 接收器自定义名称,供 route 中的 receiver 引用
email_configs:
- to: 'nextgo@126.com' # 指定接收端email
重启prometheus-webhook-dingtalk及alertmanager
[root@alertmanager ~]# systemctl restart prometheus-webhook-dingtalk
[root@alertmanager ~]# systemctl restart alertmanager
或使用 curl -v -X POST http://localhost:9093/-/reload 重新加载配置文件(该接口只重新加载 alertmanager 的配置)
告警测试(用 dd 写入约10GB的文件提高根分区使用率,以触发磁盘使用率告警;测试结束后请删除 /test1)
[root@prometheus-server ~]# df -h
文件系统 容量 已用 可用 已用% 挂载点
devtmpfs 1.9G 0 1.9G 0% /dev
tmpfs 2.0G 0 2.0G 0% /dev/shm
tmpfs 2.0G 9.8M 1.9G 1% /run
tmpfs 2.0G 0 2.0G 0% /sys/fs/cgroup
/dev/mapper/centos-root 50G 7.5G 43G 15% /
/dev/sda1 1014M 293M 722M 29% /boot
/dev/mapper/centos-home 969G 33M 969G 1% /home
tmpfs 391M 12K 391M 1% /run/user/42
tmpfs 391M 0 391M 0% /run/user/0
[root@prometheus-server ~]# dd if=/dev/zero of=/test1 bs=1M count=10000
记录了10000+0 的读入
记录了10000+0 的写出
10485760000字节(10 GB)已复制,14.1928 秒,739 MB/秒
[root@prometheus-server ~]# df -h
文件系统 容量 已用 可用 已用% 挂载点
devtmpfs 1.9G 0 1.9G 0% /dev
tmpfs 2.0G 0 2.0G 0% /dev/shm
tmpfs 2.0G 9.8M 1.9G 1% /run
tmpfs 2.0G 0 2.0G 0% /sys/fs/cgroup
/dev/mapper/centos-root 50G 18G 33G 35% /
/dev/sda1 1014M 293M 722M 29% /boot
/dev/mapper/centos-home 969G 33M 969G 1% /home
tmpfs 391M 12K 391M 1% /run/user/42
tmpfs 391M 0 391M 0% /run/user/0