| 级别 | 响应时限 | 通知方式 | 示例 |
|---|---|---|---|
| P0 (Critical) | 5min 内 | 电话 + 寻呼 (Page) | 核心服务完全不可用 |
| P1 (Warning) | 30min 内 | 寻呼 (Page) | 延迟 p99 超过 SLO |
| P2 (Info) | 4h 内 | 消息/工单 | 非核心服务异常 |
| P3 (Notice) | 工作日 | 工单 | 磁盘预计 48h 满 |
| P4 (Low) | 观察 | 邮件 | 证书 30 天后过期 |
# Prometheus alerting rules.
# Each rule attaches a `severity` label (critical / warning / notice) that
# downstream Alertmanager routing can match on.
groups:
  - name: service_alerts
    rules:
      # P0: service unavailable.
      # Fires when a scrape target of job "api" has reported `up == 0`
      # continuously for 2 minutes.
      - alert: ServiceDown
        expr: up{job="api"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: >-
            Target {{ $labels.instance }} of job {{ $labels.job }} has been
            unreachable for more than 2 minutes.
          runbook: "https://wiki.internal/runbooks/service-down"

      # P1: error rate above SLO.
      # Per-service ratio of 5xx responses to all responses, computed over a
      # 5-minute rate window; fires when the ratio stays above 1% for
      # 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
          /
          sum(rate(http_requests_total[5m])) by (service)
          > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Error rate exceeds 1%"
          description: >-
            Service {{ $labels.service }} is returning 5xx responses at a
            ratio of {{ $value | humanizePercentage }} (threshold 1%).

      # P3: disk-full forecast.
      # Linear extrapolation of the space still available to non-root users.
      # `node_filesystem_avail_bytes` is used instead of
      # `node_filesystem_free_bytes` because the latter also counts
      # root-reserved blocks, so ordinary writers run out of space before it
      # reaches zero. The regression uses a 6h window: a 1h window
      # extrapolated 48h ahead reacts to short-lived write bursts.
      - alert: DiskFullIn48h
        expr: predict_linear(node_filesystem_avail_bytes[6h], 48 * 3600) < 0
        for: 10m
        labels:
          severity: notice
        annotations:
          summary: "Disk will be full in 48h"
          description: >-
            Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} is
            predicted to run out of available space within 48 hours.
# Alertmanager routing tree.
# Child routes are matched on the alert's `severity` label. An alert that
# matches none of them (any severity other than critical or warning, e.g.
# notice) is delivered to the top-level `default` receiver.
route:
  receiver: default
  routes:
    # severity=critical -> receiver `oncall-phone`
    - receiver: oncall-phone
      match:
        severity: critical
    # severity=warning -> receiver `oncall-slack`
    - receiver: oncall-slack
      match:
        severity: warning
# Alertmanager inhibition rules.
inhibit_rules:
  # When a single node is down, suppress the service alerts on that node:
  # while a NodeDown alert is firing, ServiceDown alerts carrying the same
  # `instance` label value are muted.
  # NOTE(review): no NodeDown alerting rule is visible alongside this
  # config; confirm it is defined elsewhere.
  # NOTE(review): inhibition only applies when both alerts have an identical
  # `instance` value; confirm the node and service targets expose the same
  # value (e.g. no differing port suffix).
  - equal: [instance]
    source_match:
      alertname: NodeDown
    target_match:
      alertname: ServiceDown