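# Alerting and recording rules from kube-prometheus for the monitoring namespace,
# applied by Argo CD in sync wave 1.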
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-prometheus
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kube-prometheus-rules
  namespace: monitoring
  annotations:
    argocd.argoproj.io/sync-wave: '1'
spec:
  groups:
  - name: general.rules
    rules:
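    # TargetDown: more than 10% of the targets for a cluster/job/namespace/service
    # combination have been down (up == 0) for at least 10 minutes.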
    - alert: TargetDown
      annotations:
        description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{
          $labels.service }} targets in {{ $labels.namespace }} namespace are
          down.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
        summary: One or more targets are unreachable.
      expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
        BY (cluster, job, namespace, service)) > 10
      for: 10m
      labels:
        severity: warning
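    # Watchdog: a heartbeat alert that fires constantly so that a dead man's
    # switch integration (e.g. PagerDuty "DeadMansSnitch") can page when the
    # alerting pipeline goes silent.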
    - alert: Watchdog
      annotations:
        description: 'This is an alert meant to ensure that the entire alerting
          pipeline is functional.

          This alert is always firing, therefore it should always be firing in
          Alertmanager

          and always fire against a receiver. There are integrations with various
          notification

          mechanisms that send a notification when this alert is not firing. For
          example the

          "DeadMansSnitch" integration in PagerDuty.

          '
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
        summary: An alert that should always be firing to certify that Alertmanager
          is working properly.
      expr: vector(1)
      labels:
        severity: none
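    # InfoInhibitor: fires while a namespace has severity="info" alerts but no
    # warning/critical alert; routed to a null receiver and used to inhibit the
    # info alerts until a higher-severity alert fires in the same namespace.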
    - alert: InfoInhibitor
      annotations:
        description: 'This is an alert that is used to inhibit info alerts.

          By themselves, the info-level alerts are sometimes very noisy, but they
          are relevant when combined with

          other alerts.

          This alert fires whenever there''s a severity="info" alert, and stops
          firing when another alert with a

          severity of ''warning'' or ''critical'' starts firing on the same namespace.

          This alert should be routed to a null receiver and configured to inhibit
          alerts with severity="info".

          '
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
        summary: Info-level alert inhibition.
      expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname
        != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"}
        == 1
      labels:
        severity: none
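  # NodeNetworkInterfaceFlapping: an interface (veth devices excluded) reported
  # by node-exporter changed its up status more than twice within 2 minutes.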
  - name: node-network
    rules:
    - alert: NodeNetworkInterfaceFlapping
      annotations:
        description: Network interface "{{ $labels.device }}" changing its up
          status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod
          }}
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
        summary: Network interface is often changing its status
      expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m])
        > 2

        '
      for: 2m
      labels:
        severity: warning
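  # Recording rules precomputing per-instance and cluster-wide CPU usage and
  # per-instance network throughput from node-exporter metrics.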
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
        BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
        BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY
        (instance, cpu))
      record: cluster:node_cpu:ratio
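  # Recording rules counting reachable (up == 1) and unreachable (up == 0)
  # targets, aggregated without the instance, pod and node labels.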
  - name: kube-prometheus-general.rules
    rules:
    - expr: count without(instance, pod, node) (up == 1)
      record: count:up1
    - expr: count without(instance, pod, node) (up == 0)
      record: count:up0