# A Comprehensive Prometheus-Based Monitoring Platform: Which Alerting Rules Does an Enterprise Need?

## 1. Preface
Alerting rules in Prometheus let you define trigger conditions as PromQL expressions. The Prometheus server evaluates these rules periodically and, once a condition holds, fires an alert notification.

In an enterprise, well-designed alerting rules are essential to keeping the business stable and reliable. This article covers the dimensions that matter most in practice: host/node health (Node.rules), Prometheus self-monitoring (prometheus.rules), website and endpoint availability (website.rules), Pod status (pod.rules), persistent volumes (volume.rules), and key business processes (process.rules).
## 2. Defining Alerting Rules

A typical alerting rule looks like this:
```yaml
groups:
- name: general.rules
  rules:
  - alert: InstanceDown
    expr: up{job=~"other-ECS|k8s-nodes|prometheus"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has been down for more than 1 minute."
```
In a rule file, a set of related rules is defined under a single group, and each group can contain multiple alerting rules. An alerting rule consists of the following parts:

- `alert`: the alert name, which must be unique within the group;
- `expr`: the PromQL expression that defines the trigger condition;
- `for`: how long the condition must keep evaluating to true before the alert fires (the alert stays `pending` in the meantime);
- `labels`: extra labels attached to the alert, such as `severity`, used for routing and grouping;
- `annotations`: descriptive fields such as `summary` and `description`, which can use `{{ $labels.xxx }}` and `{{ $value }}` templates.
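Note that Prometheus only evaluates these files once they are referenced from its main configuration. A minimal sketch, assuming rules live under a `rules/` directory with a 30s evaluation interval (both values are illustrative, not from the original setup):

```yaml
# prometheus.yml (excerpt) -- minimal sketch; the path and interval are assumptions
global:
  evaluation_interval: 30s   # how often rule groups are (re)evaluated

rule_files:
  - "rules/*.rules.yml"      # hypothetical directory holding the groups shown below
```

After editing rule files, Prometheus must be restarted or sent a reload signal (e.g. SIGHUP) for the changes to take effect.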
## 3. Alerting Rules in the Enterprise

When adapting rules to your own business scenarios, a useful reference is Awesome Prometheus alerts | Collection of alerting rules (samber.github.io).

### 3.1 Node.rules
```yaml
groups:
- name: node.rules
  rules:
  - alert: NodeFilesystemUsage
    expr: 100 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 > 85
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }}: {{ $labels.mountpoint }} partition usage is too high"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}): {{ $labels.mountpoint }} partition usage is above 85% (current value: {{ $value }})"
  - alert: NodeMemoryUsage
    expr: 100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} memory usage is too high"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) memory usage is above 85% (current value: {{ $value }})"
  - alert: NodeCPUUsage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
    for: 10m
    labels:
      hostname: '{{ $labels.hostname }}'
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage is too high"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) CPU usage is above 85% (current value: {{ $value }})"
  - alert: TCP_Estab
    expr: node_netstat_Tcp_CurrEstab > 5500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} has too many established TCP connections"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has too many established TCP connections! (current value: {{ $value }})"
  - alert: TCP_TIME_WAIT
    expr: node_sockstat_TCP_tw > 3000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} has too many TCP TIME_WAIT sockets"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has too many TCP TIME_WAIT sockets! (current value: {{ $value }})"
  - alert: TCP_Sockets
    expr: node_sockstat_sockets_used > 10000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} is using too many sockets"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) is using too many sockets! (current value: {{ $value }})"
  - alert: KubeNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.node }} has been NotReady for more than 1 minute.'
  - alert: KubernetesMemoryPressure
    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes memory pressure (instance {{ $labels.instance }})"
      description: "{{ $labels.node }} has the MemoryPressure condition. VALUE = {{ $value }}"
  - alert: KubernetesDiskPressure
    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes disk pressure (instance {{ $labels.instance }})"
      description: "{{ $labels.node }} has the DiskPressure condition."
  - alert: KubernetesContainerOomKiller
    expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes container OOM killer (instance {{ $labels.instance }})"
      description: "{{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
  - alert: KubernetesJobFailed
    expr: kube_job_status_failed > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes Job failed (instance {{ $labels.instance }})"
      description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete."
  - alert: UnusualDiskReadRate
    expr: sum by (job,instance) (irate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 140
    for: 5m
    labels:
      severity: critical
      hostname: '{{ $labels.hostname }}'
    annotations:
      description: '{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has sustained disk reads above 140 MB/s for 5 minutes (current value: {{ $value }}). For reference, Alibaba Cloud ESSD PL0 tops out at 180 MB/s and PL1 at 350 MB/s.'
  - alert: UnusualDiskWriteRate
    expr: sum by (job,instance) (irate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 140
    for: 5m
    labels:
      severity: critical
      hostname: '{{ $labels.hostname }}'
    annotations:
      description: '{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has sustained disk writes above 140 MB/s for 5 minutes (current value: {{ $value }}). For reference, Alibaba Cloud ESSD PL0 tops out at 180 MB/s and PL1 at 350 MB/s.'
  - alert: UnusualNetworkThroughputIn
    expr: sum by (job,instance) (irate(node_network_receive_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has received network traffic above 80 MB/s for 5 minutes (current value: {{ $value }})'
  - alert: UnusualNetworkThroughputOut
    expr: sum by (job,instance) (irate(node_network_transmit_bytes_total{job=~"aws-hk-monitor|k8s-nodes"}[5m])) / 1024 / 1024 > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has transmitted network traffic above 80 MB/s for 5 minutes (current value: {{ $value }})'
  - alert: SystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: 'The {{ $labels.name }} unit on {{ $labels.instance }} (hostname: {{ $labels.hostname }}) has been in the failed state for 5 minutes; please handle it promptly.'
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host disk will fill in 24 hours (instance {{ $labels.instance }})"
      description: "At the current write rate, the filesystem on {{ $labels.instance }} (hostname: {{ $labels.hostname }}) is predicted to run out of space within 24 hours!"
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Host out of inodes (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} (hostname: {{ $labels.hostname }}) has less than 10% of its inodes left!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})"
      description: "An OOM kill has been detected on {{ $labels.instance }} (hostname: {{ $labels.hostname }})!"
```
### 3.2 prometheus.rules

```yaml
groups:
- name: prometheus.rules
  rules:
  - alert: PrometheusErrorSendingAlertsToAnyAlertmanagers
    expr: (rate(prometheus_notifications_errors_total{instance="localhost:9090", job="prometheus"}[5m]) / rate(prometheus_notifications_sent_total{instance="localhost:9090", job="prometheus"}[5m])) * 100 > 3
    for: 5m
    labels:
      severity: warning
    annotations:
      description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ $labels.namespace }}/{{ $labels.pod }} to any Alertmanager.'
  - alert: PrometheusNotConnectedToAlertmanagers
    expr: max_over_time(prometheus_notifications_alertmanagers_discovered{instance="localhost:9090", job="prometheus"}[5m]) != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod }} cannot connect to Alertmanager!"
  - alert: PrometheusRuleFailures
    expr: increase(prometheus_rule_evaluation_failures_total{instance="localhost:9090", job="prometheus"}[5m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod }} had {{ printf "%.0f" $value }} rule evaluation failures in the last 5 minutes'
  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} rule evaluation failures; please investigate promptly."
  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
      description: "Prometheus has had {{ $value }} TSDB reload failures!"
  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
      description: "Prometheus has had {{ $value }} TSDB WAL corruptions!"
```
### 3.3 website.rules

```yaml
groups:
- name: website.rules
  rules:
  - alert: SSLCertExpiringSoon
    expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "SSL certificate expiry warning"
      description: 'The certificate for {{ $labels.instance }} expires in {{ printf "%.1f" $value }} days; please renew it as soon as possible'
  - alert: blackbox_network_stats
    expr: probe_success == 0
    for: 1m
    labels:
      severity: critical
      pod: '{{ $labels.instance }}'
      namespace: '{{ $labels.kubernetes_namespace }}'
    annotations:
      summary: "Endpoint/host/port/domain {{ $labels.instance }} is unreachable"
      description: "Endpoint/host/port/domain {{ $labels.instance }} is unreachable; please check it as soon as possible!"
  - alert: curlHttpStatus
    expr: probe_http_status_code{job="blackbox-http"} >= 422 and probe_success{job="blackbox-http"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Business alert: website unreachable'
      description: '{{ $labels.instance }} is unreachable; please investigate promptly (current status code: {{ $value }})'
```
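The `probe_*` series used above do not exist until Prometheus scrapes a blackbox exporter. A minimal sketch of the corresponding scrape job, matching the `job="blackbox-http"` selector in curlHttpStatus (the exporter address, module name, and target are assumptions):

```yaml
# prometheus.yml (excerpt) -- sketch of a blackbox-exporter job; the module,
# target, and exporter address are assumptions
scrape_configs:
  - job_name: "blackbox-http"
    metrics_path: /probe
    params:
      module: [http_2xx]              # probe module defined in the exporter's blackbox.yml
    static_configs:
      - targets:
          - https://example.com       # hypothetical endpoint to probe
    relabel_configs:
      - source_labels: [__address__]  # move the probed URL into the ?target= parameter
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance        # keep the probed URL as the instance label
      - target_label: __address__
        replacement: blackbox-exporter:9115   # assumed exporter address
```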
### 3.4 pod.rules

```yaml
groups:
- name: pod.rules
  rules:
  - alert: PodCPUUsage
    expr: sum(rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) by (pod, namespace) > 90
    for: 5m
    labels:
      severity: warning
      pod: '{{ $labels.pod }}'
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} CPU usage is above 90% (current value: {{ $value }})"
  - alert: PodMemoryUsage
    expr: sum(container_memory_rss{image!=""}) by (pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by (pod, namespace) * 100 != +inf > 85
    for: 5m
    labels:
      severity: critical
      pod: '{{ $labels.pod }}'
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} memory usage is above 85% (current value: {{ $value }})"
  - alert: KubeDeploymentError
    expr: kube_deployment_spec_replicas{job="kubernetes-service-endpoints"} != kube_deployment_status_replicas_available{job="kubernetes-service-endpoints"}
    for: 3m
    labels:
      severity: warning
      pod: '{{ $labels.deployment }}'
    annotations:
      description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} available replicas do not match the spec (current value: {{ $value }})"
  - alert: coreDnsError
    expr: kube_pod_container_status_running{container="coredns"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} coredns is not running (current value: {{ $value }})"
  - alert: kubeProxyError
    expr: kube_pod_container_status_running{container="kube-proxy"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} kube-proxy is not running (current value: {{ $value }})"
  - alert: filebeatError
    expr: kube_pod_container_status_running{container="filebeat"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} filebeat is not running (current value: {{ $value }})"
  - alert: PodNetworkReceive
    expr: sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000) by (pod, namespace) > 60000
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} inbound traffic is above 60 MB/s (current value: {{ $value }} KB/s)"
  - alert: PodNetworkTransmit
    expr: sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000) by (pod, namespace) > 60000
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} outbound traffic is above 60 MB/s (current value: {{ $value }} KB/s)"
  - alert: PodRestart
    expr: sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod, namespace) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} has restarted (current value: {{ $value }})"
  - alert: PodFailed
    expr: sum(kube_pod_status_phase{phase="Failed"}) by (pod, namespace) > 0
    for: 5s
    labels:
      severity: critical
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in the Failed phase (current value: {{ $value }})"
  - alert: PodPending
    expr: sum(kube_pod_status_phase{phase="Pending"}) by (pod, namespace) > 0
    for: 30s
    labels:
      severity: critical
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is stuck in the Pending phase (current value: {{ $value }})"
  - alert: PodErrImagePull
    expr: sum by (namespace, pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ErrImagePull (current value: {{ $value }})"
  - alert: PodImagePullBackOff
    expr: sum by (namespace, pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ImagePullBackOff (current value: {{ $value }})"
  - alert: PodCrashLoopBackOff
    expr: sum by (namespace, pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CrashLoopBackOff (current value: {{ $value }})"
  - alert: PodInvalidImageName
    expr: sum by (namespace, pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in InvalidImageName (current value: {{ $value }})"
  - alert: PodCreateContainerConfigError
    expr: sum by (namespace, pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      description: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CreateContainerConfigError (current value: {{ $value }})"
  - alert: KubernetesContainerOomKiller
    expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes container OOM killer (instance {{ $labels.instance }})"
      description: "{{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes!"
  - alert: KubernetesPersistentvolumeError
    expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} Persistent volume is in bad state!"
  - alert: KubernetesStatefulsetDown
    expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
      description: "{{ $labels.statefulset }} A StatefulSet went down!"
  - alert: KubernetesStatefulsetReplicasMismatch
    expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})"
      description: "{{ $labels.statefulset }} A StatefulSet does not match the expected number of replicas."
```
### 3.5 volume.rules

```yaml
groups:
- name: volume.rules
  rules:
  - alert: PersistentVolumeClaimLost
    expr: sum by (namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost!"
  - alert: PersistentVolumeClaimPending
    expr: sum by (namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pending"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending!"
  - alert: PersistentVolumeFailed
    expr: sum(kube_persistentvolume_status_phase{phase="Failed",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "Persistent volume is in Failed state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: PersistentVolumePending
    expr: sum(kube_persistentvolume_status_phase{phase="Pending",job="kubernetes-service-endpoints"}) by (persistentvolume) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "Persistent volume is in Pending state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
```
### 3.6 process.rules

```yaml
groups:
- name: process.rules
  rules:
  - alert: SparkxtaskProcessDown
    expr: (namedprocess_namegroup_num_procs{groupname="map[:sparkxtask]"}) < 4
    for: 1m
    labels:
      severity: warning
      pod: sparkxads-process
    annotations:
      description: "Task: sparkxtask | expected process count: 4 | current value: {{ $value }}. Robot, please handle this promptly!"
```
## 4. Summary

This article walked through alerting rule definitions across the dimensions that matter most in production:

- Node.rules: host-level resource usage (disk, memory, CPU, TCP connections) plus Kubernetes node conditions;
- prometheus.rules: Prometheus self-monitoring, covering alert delivery, rule evaluation, and TSDB health;
- website.rules: blackbox probing of endpoints, certificates, and HTTP status codes;
- pod.rules: Pod resource usage, workload replica mismatches, and abnormal container states;
- volume.rules: PersistentVolume and PersistentVolumeClaim phase problems;
- process.rules: process counts for critical business services.
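Finally, the `severity` labels attached throughout these rules are what Alertmanager routes on. A minimal sketch of a matching route tree (receiver names are assumptions; real receivers would carry webhook or email settings):

```yaml
# alertmanager.yml (excerpt) -- sketch of severity-based routing; receiver names are assumptions
route:
  receiver: ops-warning          # default receiver for warning-level alerts
  routes:
    - matchers:
        - severity="critical"
      receiver: ops-critical     # page someone for critical alerts
receivers:
  - name: ops-warning
  - name: ops-critical
```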