先檢查有沒有開啟 Rancher 的 Catalog 功能,預設是開啟的
在 prometheus-alertmanager 的頁面,可以看到被觸發的警告事件。
# Prometheus alerting rules: one rule group ("test-rule") holding three
# node-level alerts (filesystem, memory, CPU). Every alert carries the label
# team=node and a summary/description annotation built from the firing
# series' `instance` label and current value.
#
# NOTE(review): the metric names used below (node_filesystem_size,
# node_memory_MemTotal, node_cpu, ...) look like the pre-0.16 node_exporter
# naming scheme -- confirm they match the exporter version actually deployed.
groups:
  - name: test-rule
    rules:
      # Used-space percentage of /dev/sda2, averaged per
      # (instance, kubernetes_name, kubernetes_namespace).
      - alert: NodeFilesystemUsage  # Alert name
        # Alert condition: fires while the used percentage is above 10.
        expr: avg by (instance, kubernetes_name, kubernetes_namespace)((node_filesystem_size{device="/dev/sda2"} - node_filesystem_free{device="/dev/sda2"}) / node_filesystem_size{device="/dev/sda2"} * 100) > 10
        for: 1m  # How long the condition must hold before it is treated as abnormal
        labels:
          team: node
        annotations:
          # Detailed alert text (shown on the AlertManager admin page).
          # The percentage quoted here must stay in sync with the threshold
          # in `expr` above.
          summary: "{{$labels.instance}}: High Filesystem usage detected"
          description: "{{$labels.instance}}: Filesystem usage is above 10% (current value is: {{ $value }})"

      # Memory in use as a percentage of total, where "in use" is
      # MemTotal - (MemFree + Buffers + Cached).
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal - (node_memory_MemFree+node_memory_Buffers+node_memory_Cached )) / node_memory_MemTotal * 100 > 80
        for: 1m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: High Memory usage detected"
          description: "{{$labels.instance}}: Memory usage is above 80% (current value is: {{ $value }})"

      # CPU busy percentage per instance: 100 minus the average idle-mode
      # irate (over a 5m window) scaled to percent.
      - alert: NodeCPUUsage
        expr: (100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100)) > 80
        for: 2m
        labels:
          team: node
        annotations:
          summary: "{{$labels.instance}}: High CPU usage detected"
          description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }})"