Skip to content

alertManager

install

docker

  # --------------------------------------------------
  # 4. Alertmanager (alert management)
  # --------------------------------------------------
  alertmanager:
    container_name: alertmanager
    image: "192.168.3.12:5005/prometheus/alertmanager:v0.28.0"
    restart: unless-stopped
    # Start with the mounted config file and keep data under /alertmanager.
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
    # Publish container port 9093 on host port 9093.
    ports:
      - "9093:9093"
    volumes:
      # Host-side config file, mounted read-only.
      - "./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
      # Named volume mounted at the storage path.
      - "alertmanager_data:/alertmanager"
# Named volumes used for data persistence
volumes:
  alertmanager_data:
# Alertmanager endpoint that alerts are sent to.
# NOTE(review): presumably this stanza belongs in prometheus.yml — confirm.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

Note

注意 repeat_interval 的间隔时间:测试时,同一条告警发送过通知后,需要等到 repeat_interval 设置的时间过去,才会再次发送通知。

Silence 的状态

| 状态 | 含义 |
| --- | --- |
| Active | Silence 当前正在生效,匹配的告警会被静默 |
| Pending | Silence 设置了未来的开始时间,还没生效 |
| Expired | Silence 已经过了 endsAt 时间,不再生效 |

Alertmanager 指标注册到 Prometheus

# Scrape job that registers Alertmanager's own metrics with Prometheus.
# NOTE(review): 'localhost:9093' only resolves if Prometheus shares a network
# namespace with Alertmanager; the alerting stanza uses 'alertmanager:9093' —
# confirm which address applies to this deployment.
- job_name: 'alertmanager'
  static_configs:
    - targets: ['localhost:9093']

http://192.168.3.204:9093/api/v2/alerts
http://192.168.3.204:9093/api/v2/status

配置

group_wait: 30s       # 首次通知的等待时间:新的告警分组出现后,Alertmanager 会先等待 30 秒再发送第一条通知,以便收集同组的其他告警。
group_interval: 5m    # 同一分组再次发送通知的最小间隔:分组已经发送过通知后,如果组内出现新告警或有告警恢复,至少间隔 5 分钟才会发送更新通知。
repeat_interval: 1h   # 重复通知的间隔:如果告警一直存在且分组内容没有变化,Alertmanager 每隔 1 小时会再次发送整条告警,确保长期存在的告警不会被忽略。

发送多个

# Every alert is routed to one receiver that lists several webhook URLs.
route:
  receiver: "all_webhooks"
  group_by:
    - "alertname"
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h

receivers:
  - name: "all_webhooks"
    webhook_configs:
      - url: "http://192.168.3.51:8000/"
      - url: "http://localhost:8060/dingtalk/webhook1/send"
        send_resolved: true
global:
  resolve_timeout: 5m

# Routing tree. Child routes are tried in order and, unless a route sets
# `continue: true`, evaluation stops at the first child that matches. A lone
# catch-all child would therefore capture every alert and the root receiver
# would never be used, so 'http-webhook' gets its own child route that
# continues on to the DingTalk route.
route:
  group_by: ['alertname', 'instance']
  group_wait: 10s
  group_interval: 30s
  repeat_interval: 1m
  receiver: 'http-webhook'
  routes:
    # No matchers: matches every alert, then falls through to the next route.
    - receiver: 'http-webhook'
      continue: true
    # `matchers` replaces the deprecated `match_re`; the regex matches all alerts.
    - matchers:
        - 'alertname =~ ".*"'
      receiver: 'dingtalk_webhook'

receivers:
  - name: 'http-webhook'
    webhook_configs:
      - url: 'http://192.168.3.51:8001/alter'
        send_resolved: true   # also notify when the alert resolves
  - name: 'dingtalk_webhook'
    webhook_configs:
      # webhook1 URL of prometheus-webhook-dingtalk
      - url: 'http://localhost:8060/dingtalk/webhook1/send'
        send_resolved: true   # also send a recovery message to the recipients

alertmanager

config

telegram_config

rocketchat

测试

# Post one HighCPU test alert (no endsAt) to the local Alertmanager v2 API.
curl --request POST \
  --header "Content-Type: application/json" \
  --data '[{
    "labels": {
      "alertname": "HighCPU",
      "instance": "server1:9100",
      "severity": "critical"
    },
    "annotations": {
      "description": "CPU usage超过95%"
    },
    "startsAt": "2023-07-20T12:00:00Z"
  }]' \
  http://localhost:9093/api/v2/alerts
  • 告警修复
# Post the same HighCPU alert again, this time with an endsAt timestamp.
curl --request POST \
  --header "Content-Type: application/json" \
  --data '[{
    "labels": {
      "alertname": "HighCPU",
      "instance": "server1:9100",
      "severity": "critical"
    },
    "annotations": {
      "description": "CPU usage超过95%"
    },
    "startsAt": "2023-07-20T12:00:00Z",
    "endsAt": "2024-07-20T12:00:00Z"
  }]' \
  http://localhost:9093/api/v2/alerts
  • 触发中(firing) : endsAt 为空/不填,或者填一个未来时间(且还没到)。
  • 已恢复(resolved) : endsAt 填一个“现在或过去”的时间(到期后 Alertmanager 就认为这条告警结束了)。

注意两点:

  • 要让“同一条告警”从 firing 变成 resolved, labels 必须完全一致 (alertname/instance/severity 等都一样),Alertmanager 才会把它当作同一个告警的生命周期。

  • 是否会把“恢复通知”推到下游(钉钉/Grafana/webhook),还取决于接收端是否配置了 send_resolved: true 。
route:
  receiver: "webhook"

receivers:
  - name: "webhook"
    webhook_configs:
      # Custom webhook receiver address.
      - url: "http://localhost:5000/alert"
        # Also send a notification when the alert is resolved.
        send_resolved: true
        # Optional authentication:
        # http_config:
        #   bearer_token: 'your-auth-token'
        # Maximum number of alerts in a single request.
        max_alerts: 10

Alertmanager 会按 group_by 指定的标签把告警合并为一个通知组,默认 group_wait: 30s(首次通知前的等待时间)和 group_interval: 5m(同一分组再次发送通知的间隔)

route:
  receiver: "webhook"
  # Group alerts by alert name.
  group_by:
    - "alertname"
  # Send the first notification for a group right away.
  group_wait: 1s
  # Interval between notifications for the same group.
  group_interval: 1s
  # Interval before the same alert is sent again.
  repeat_interval: 1s

receivers:
  - name: "webhook"
    webhook_configs:
      - url: "http://localhost:5000/alert"
        send_resolved: true
        max_alerts: 10

email 告警通知

{{/* Minimal HTML e-mail body, defined under the name "custom.email.html". */}}
{{ define "custom.email.html" }}
<!DOCTYPE html>
<html>
  <body>
    <h1>hello world</h1>
  </body>
</html>

{{ end }}
global:
  resolve_timeout: 5m
  smtp_smarthost: "smtp.qq.com:465"
  smtp_hello: "163.com"
  smtp_from: ""
  smtp_auth_username: ""
  # Mailbox authorization code (not the login password).
  smtp_auth_password: ""
  # The connection uses SSL, so this is set to false.
  smtp_require_tls: false

# Template file(s) that provide the named templates used below.
templates:
  - "./hello.html"

route:
  receiver: "email-notice"

receivers:
  - name: "email-notice"
    email_configs:
      - to: "wangjn@shingi.cn"
        # Also send a notification when the problem is resolved.
        send_resolved: true
        headers:
          Subject: '【报警】{{ .CommonLabels.alertname }}'
        html: '{{ template "custom.email.html" . }}'

prometheus-webhook-dingtalk

global:
  resolve_timeout: 5m

# Routing tree. Child routes are tried in order and, unless a route sets
# `continue: true`, evaluation stops at the first child that matches. A lone
# catch-all child would therefore capture every alert and the root receiver
# would never be used, so 'http-webhook' gets its own child route that
# continues on to the DingTalk route.
route:
  group_by: ['alertname', 'instance']
  group_wait: 10s
  group_interval: 30s
  repeat_interval: 1m
  receiver: 'http-webhook'
  routes:
    # No matchers: matches every alert, then falls through to the next route.
    - receiver: 'http-webhook'
      continue: true
    # `matchers` replaces the deprecated `match_re`; the regex matches all alerts.
    - matchers:
        - 'alertname =~ ".*"'
      receiver: 'dingtalk_webhook'

receivers:
  - name: 'http-webhook'
    webhook_configs:
      - url: 'http://192.168.3.51:8001/alter'
        send_resolved: true   # also notify when the alert resolves
  - name: 'dingtalk_webhook'
    webhook_configs:
      # webhook1 URL of prometheus-webhook-dingtalk
      - url: 'http://localhost:8060/dingtalk/webhook1/send'
        send_resolved: true   # also send a recovery message to the recipients
☁️ 部署建议
如果你打算长期运行项目(博客 / API / 自动化脚本),建议直接用云服务器,会比本地稳定很多。
👉 查看云服务器(新用户优惠)