Alertmanager is a standalone alerting component that receives alerts sent by clients such as Prometheus: Prometheus evaluates the alerting rules, and Alertmanager manages and delivers the resulting alerts.
1. Prometheus sends alerts to Alertmanager according to its alerting rules (the rule_files block), i.e. the alerting rules themselves are defined on the Prometheus side.
2. Alertmanager manages these alerts, including deduplicating, grouping, silencing, inhibition and aggregation, and finally routes the notifications that need to go out to the right contacts via email, webhook and other channels (see the test example after the feature list below).
1. Grouping
Grouping batches alerts of a similar nature into a single notification. This is especially useful when many systems fail at once and hundreds or thousands of alerts may fire at the same time.
2. Silencing
A silence is a simple mechanism to mute alerts for a given period of time. A silence is configured with matchers, just like the routing tree. Incoming alerts are checked against these matchers; if they match, no notification is sent for the alert. Silences are configured in the Alertmanager web UI.
3. Inhibition
Inhibition is a mechanism that suppresses notifications for alerts that are caused by another alert that is already firing.
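A quick way to see this pipeline end to end is to push a hand-crafted test alert into Alertmanager through its HTTP API and watch it get grouped and routed. This is only a sketch: the alert name, labels and the address 127.0.0.1:9093 are assumptions for illustration, not part of the real rule set.

# Push a fake alert into Alertmanager's v2 API; adjust the address if it runs elsewhere.
curl -XPOST http://127.0.0.1:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{
        "labels":      { "alertname": "TestAlert", "severity": "warning", "nodename": "test-node" },
        "annotations": { "summary": "manual test alert" }
      }]'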
I. Configure Alertmanager for alert notifications
1. Install the alertmanager service
tar -xvf alertmanager-0.20.0.linux-amd64.tar.gz
mv alertmanager-0.20.0.linux-amd64 /data/alertmanager
2. cd /data/alertmanager/
Edit alertmanager.yml
Make a backup first: cp alertmanager.yml alertmanager.yml.bak
global: global settings, mainly the notification channels such as email, webhook, etc.
route: the alert routing policy; it is a tree structure that is matched depth-first, from left to right.
receivers: the alert receivers, e.g. the commonly used email, wechat, slack and webhook notification channels.
inhibit_rules: inhibition rules; when alerts matching one set of matchers (the source) are firing, the rule mutes alerts matching another set (the target).
vim alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: 'kangwei_k@163.com'
  smtp_auth_username: 'kangwei_k@163.com'
  smtp_auth_password: 'VSFGFMZVREDBZHNQ'
  smtp_require_tls: false
templates:
  - '/data/alertmanager/template/default.tmpl'   # email template
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 20m
  repeat_interval: 10m
  receiver: 'email'
  routes:
  - match_re:
      resource: ^(memory|cpu|disk)$
    receiver: system-pager
    continue: false
    routes:
    - match:
        severity: critical
      receiver: critical-pager
  - match:
      resource: net
    receiver: net-pager
  - match:
      resource: host
    receiver: system-pager
    continue: true
    routes:
    - match:
        severity: critical
      receiver: critical-pager
receivers:
- name: 'email'
  email_configs:
  - to: 'kangwei_k@163.com'
    html: '{{ template "default.html" . }}'
    headers: { Subject: "{{ .GroupLabels.SortedPairs.Values }} [{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]" }
- name: 'system-pager'
  email_configs:
  - to: 'kangwei_k@163.com'
    html: '{{ template "default.html" . }}'
    headers: { Subject: "{{ .GroupLabels.SortedPairs.Values }} [{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]" }
- name: 'net-pager'
  email_configs:
  - to: 'kangwei_k@163.com'
    html: '{{ template "default.html" . }}'
    headers: { Subject: "{{ .GroupLabels.SortedPairs.Values }} [{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]" }
- name: 'critical-pager'
  email_configs:
  - to: 'kangwei_k@163.com'
    html: '{{ template "default.html" . }}'
    headers: { Subject: "{{ .GroupLabels.SortedPairs.Values }} [{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]" }
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
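Before starting the service it is worth validating the file with amtool, which ships in the same tarball as alertmanager. The commands below assume the /data/alertmanager layout from the install step; adjust the paths if yours differ.

cd /data/alertmanager
# Syntax-check the configuration (and the referenced template file).
./amtool check-config alertmanager.yml
# Start Alertmanager in the background; notification state is kept under ./data.
nohup ./alertmanager --config.file=alertmanager.yml --storage.path=./data &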
Custom email template
{{ define "default.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
[{{ .Status | toUpper }}:{{ .Alerts.Firing | len }}]
{{ range $i, $alert := .Alerts.Firing }}
<pre>
Alerting node: {{ index $alert.Labels "nodename" }}
Alert name:    {{ index $alert.Labels "alertname" }}
Details:       {{ index $alert.Annotations "summary" }}
Started at:    {{ $alert.StartsAt.Local }}
</pre>
{{ end }}
{{ end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
[{{ .Status | toUpper }}:{{ .Alerts.Resolved | len }}]
{{ range $i, $alert := .Alerts.Resolved }}
<pre>
Resolved node: {{ index $alert.Labels "nodename" }}
Alert name:    {{ index $alert.Labels "alertname" }}
Status:        {{ $alert.Status }}
Started at:    {{ $alert.StartsAt.Local }}
Resolved at:   {{ $alert.EndsAt.Local }}
</pre>
{{ end }}
{{ end }}
{{- end }}
define declares a named template; the file above defines default.html, which is referenced directly from the html field of each receiver in alertmanager.yml.
default.html is the body of the email that gets sent; both HTML and plain-text formats are supported, and HTML is used here so the information displays more cleanly.
{{ range ... }} is a loop that iterates over the matched alerts; the per-alert lines carry the same information as the default email, just reduced to the core values (labels, annotations and timestamps).
Configure alerting rules
This is configured on the Prometheus side.
vim prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  #scrape_timeout: 20s # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 121.36.252.79:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*_rules.yml"
  - "rules/*_alerts.yml"
The *_rules.yml files define recording rules.
The *_alerts.yml files define the alerting rules that trigger notifications.
[root@huawei rules]# cat node_rules.yml
groups:
- name: node_rules
  rules:
  - record: node_cpu_usage
    expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (nodename) * 100
    labels:
      metric_type: cpu_usage
  - record: node_1m_load
    expr: node_load1
    labels:
      metric_type: node_1m_load
  - record: node_mem_usage
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
    labels:
      metric_type: mem_usage
  #- record: node_root_disk
  #  expr: predict_linear(node_filesystem_free_bytes{device="/dev/vda1",mountpoint="/"}[2h], 24*3600) / (1024*1024*1024)
  #  labels:
  #    metric_type: root_disk
  - record: node_cpu_usage_rate1m
    expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
    labels:
      metric_type: cpu_usage_rate1m
  - record: node_mem_usage_rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
    labels:
      metric_type: mem_usage_rate1m
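Once Prometheus has loaded this file, each record name becomes a normal time series that can be queried directly. As a quick sanity check (assuming Prometheus listens on the default 127.0.0.1:9090):

# Query the recorded series via the HTTP API; the same expression works in the web UI's Graph page.
curl -s 'http://127.0.0.1:9090/api/v1/query?query=node_cpu_usage'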
[root@huawei rules]# cat node_alerts.yml
groups:
- name: node_alerts
  rules:
  - alert: cpu_usage_over_threshold
    expr: node_cpu_usage > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: CPU usage on host {{ $labels.nodename }} has stayed above the threshold for 1 minute; current value is {{ humanize $value }}%
  - alert: system_1m_load
    expr: node_1m_load > 20
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 1-minute load on host {{ $labels.nodename }} has exceeded the threshold; current value is {{ humanize $value }}
  - alert: mem_usage_over
    expr: node_mem_usage > 90
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Memory usage on host {{ $labels.nodename }} has stayed above the threshold for 1 minute; current value is {{ humanize $value }}%
  #- alert: root_disk_over
  #  expr: node_root_disk < 60
  #  for: 1m
  #  labels:
  #    severity: warning
  #  annotations:
  #    summary: Disk on host {{ $labels.nodename }} is predicted to reach {{ humanize1024 $value }}GB within 1 day; please expand capacity in time!
  - alert: port_status
    expr: probe_success == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Port probe on host {{ $labels.nodename }} failed
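Before reloading Prometheus, the rule files can be checked for syntax errors with promtool, which is shipped alongside the prometheus binary. The reload below uses a SIGHUP; file names match the rule_files globs above.

# Validate the rule files and the main configuration.
promtool check rules rules/node_rules.yml rules/node_alerts.yml
promtool check config prometheus.yml
# Ask the running Prometheus to reload its configuration and rules.
kill -HUP $(pidof prometheus)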
Visit 127.0.0.1:9093 to reach the Alertmanager web UI.
Filtering alerts: enabling a silence
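Silences can be created in the web UI as described above, or from the command line with the bundled amtool. The matcher, duration and comment below are just an example; any label matcher from the alerts defined earlier will work.

# Mute the CPU alert for two hours on the local Alertmanager.
amtool silence add alertname=cpu_usage_over_threshold \
  --alertmanager.url=http://127.0.0.1:9093 \
  --duration=2h --author=ops --comment="planned maintenance"
# List active silences, or expire one by its ID.
amtool silence query --alertmanager.url=http://127.0.0.1:9093
amtool silence expire <silence-id> --alertmanager.url=http://127.0.0.1:9093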