Alerts


/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-kube-prometheus-stack-rabbitmq.yaml > ngate-certificate-expiration-monitoring
NgateCertificateExpiring (0 active)
alert: NgateCertificateExpiring
expr: ng_infra_cert_expiry < 2.592e+06
labels:
  severity: warning
annotations:
  description: Certificate expires in 30 days or less
  summary: Certificate {{ $labels.name }} is about to expire or has already expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
NgateCertificateExpiring (0 active)
alert: NgateCertificateExpiring
expr: ng_infra_cert_expiry < 1.2096e+06
labels:
  severity: critical
annotations:
  description: Certificate expires in 14 days or less
  summary: Certificate {{ $labels.name }} is about to expire or has already expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
NgateKeyExpiring (0 active)
alert: NgateKeyExpiring
expr: ng_infra_pk_expiry < 2.592e+06
labels:
  severity: warning
annotations:
  description: Key expires in 30 days or less
  summary: Key {{ $labels.name }} is about to expire or has already expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
NgateKeyExpiring (0 active)
alert: NgateKeyExpiring
expr: ng_infra_pk_expiry < 1.2096e+06
labels:
  severity: critical
annotations:
  description: Key expires in 14 days or less
  summary: Key {{ $labels.name }} is about to expire or has already expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
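Both thresholds above are plain seconds-until-expiry values: 2.592e+06 s is 30 days and 1.2096e+06 s is 14 days. A minimal sketch of the same comparisons written with explicit arithmetic, assuming ng_infra_cert_expiry and ng_infra_pk_expiry report the remaining lifetime in seconds:

    # warning: 30 days or less remaining
    ng_infra_cert_expiry < 30 * 24 * 3600    # = 2592000 s
    # critical: 14 days or less remaining
    ng_infra_pk_expiry  < 14 * 24 * 3600     # = 1209600 s

PromQL evaluates the arithmetic at query time, so either form can be used in the rule files.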
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-kube-prometheus-stack-rabbitmq.yaml > rabbitmq-messages
NumberOfConsumers (0 active)
alert: NumberOfConsumers
expr: sum(rabbitmq_queue_consumers{job="rabbitmq"}) < 600
for: 10m
labels:
  severity: critical
annotations:
  description: Total number of consumers is {{ $value }}.
  summary: This may mean that not all RabbitMQ pods are running.
RabbitMQ-SoManyMessagesInQueue-10m (0 active)
alert: RabbitMQ-SoManyMessagesInQueue-10m
expr: sum(increase(cqrs_commands_queue_time_seconds_sum{namespace="doc-production"}[5m])) / sum(increase(cqrs_commands_queue_time_seconds_count{namespace="doc-production"}[5m])) > 0.5
for: 5m
labels:
  severity: critical
annotations:
  description: There have been a lot of messages in some queues for the last 10 minutes.
  summary: This means a service is overloaded or down. Link https://grafana.basis.center/d/hxivlZ1Gk/health?viewPanel=88&from=now-30m
RabbitMQ-SoManyMessagesInQueue-1m (0 active)
alert: RabbitMQ-SoManyMessagesInQueue-1m
expr: sum(increase(cqrs_commands_queue_time_seconds_sum{namespace="doc-production"}[1m])) / sum(increase(cqrs_commands_queue_time_seconds_count{namespace="doc-production"}[1m])) > 0.5
for: 1m
labels:
  severity: warning
annotations:
  description: There have been a lot of messages in some queues for the last minute.
  summary: This means a service is overloaded. Link https://grafana.basis.center/d/hxivlZ1Gk/health?viewPanel=88&from=now-30m
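The two expressions above measure average queueing time rather than a raw message count: dividing the increase of the _sum series by the increase of the matching _count series over the same window yields the mean number of seconds a command spent in the queue, and the alert fires when that mean exceeds 0.5 s. A hedged sketch of the same ratio as a recording rule (the rule name is illustrative, not part of this deployment):

    # hypothetical recording rule: mean queue time over the last 5 minutes
    - record: cqrs_commands:queue_time_seconds:avg5m
      expr: |
        sum(increase(cqrs_commands_queue_time_seconds_sum{namespace="doc-production"}[5m]))
          /
        sum(increase(cqrs_commands_queue_time_seconds_count{namespace="doc-production"}[5m]))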
RabbitMQTooManyMessagesInQueue (0 active)
alert: RabbitMQTooManyMessagesInQueue
expr: rabbitmq_queue_messages > 100
for: 5m
labels:
  severity: critical
annotations:
  description: There are {{ $value }} messages in some queues.
  summary: This means a service is overloaded or down.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-alertmanager.rules.yaml > alertmanager.rules
AlertmanagerConfigInconsistent (0 active)
alert: AlertmanagerConfigInconsistent
expr: count by(namespace, service) (count_values by(namespace, service) ("config_hash", alertmanager_config_hash{job="prometheus-operator-kube-p-alertmanager",namespace="default"})) != 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` is out of sync.
    {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
    Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
    {{ end }}
AlertmanagerFailedReload (0 active)
alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful{job="prometheus-operator-kube-p-alertmanager",namespace="default"} == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
AlertmanagerMembersInconsistent (0 active)
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-general.rules.yaml > general.rules
TargetDown (1 active)
alert: TargetDown
expr: 100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
Labels: alertname="TargetDown" job="ozon-agent" namespace="doc-production" service="ozon-agent" severity="warning"
State: firing    Active Since: 2026-01-27 23:05:50.223808037 +0000 UTC    Value: 100
Watchdog (1 active)
alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  message: |
    This is an alert meant to ensure that the entire alerting pipeline is functional.
    This alert is always firing, therefore it should always be firing in Alertmanager
    and always fire against a receiver. There are integrations with various notification
    mechanisms that send a notification when this alert is not firing. For example the
    "DeadMansSnitch" integration in PagerDuty.
Labels: alertname="Watchdog" severity="none"
State: firing    Active Since: 2026-01-27 23:05:50.223808037 +0000 UTC    Value: 1
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kube-apiserver-slos.yaml > kube-apiserver-slos
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
for: 2m
labels:
  long: 1h
  severity: critical
  short: 5m
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)
for: 15m
labels:
  long: 6h
  severity: critical
  short: 30m
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)
for: 1h
labels:
  long: 1d
  severity: warning
  short: 2h
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)
for: 3h
labels:
  long: 3d
  severity: warning
  short: 6h
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
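The four variants above are the standard multiwindow, multi-burn-rate rules for a 99.0% availability SLO, which is why each threshold is a multiple of the 0.01 error budget. The factors are chosen so that each long window corresponds to a fixed share of a 30-day budget; a rough check of the arithmetic:

    # error budget = 1 - 0.99 = 0.01 of requests over 30 days (720 h)
    # 14.4 * 1h  / 720h = 2%  of the budget burned  -> severity: critical (short window 5m)
    # 6    * 6h  / 720h = 5%  of the budget burned  -> severity: critical (short window 30m)
    # 3    * 24h / 720h = 10% of the budget burned  -> severity: warning  (short window 2h)
    # 1    * 72h / 720h = 10% of the budget burned  -> severity: warning  (short window 6h)

Requiring both the long and the short window to exceed the same burn rate keeps the alert from firing on brief spikes and lets it reset quickly once the error rate drops.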
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kube-state-metrics.yaml > kube-state-metrics
KubeStateMetricsListErrors (0 active)
alert: KubeStateMetricsListErrors
expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) > 0.01
for: 15m
labels:
  severity: critical
annotations:
  description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
  summary: kube-state-metrics is experiencing errors in list operations.
KubeStateMetricsWatchErrors (0 active)
alert: KubeStateMetricsWatchErrors
expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) > 0.01
for: 15m
labels:
  severity: critical
annotations:
  description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
  summary: kube-state-metrics is experiencing errors in watch operations.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-apps.yaml > kubernetes-apps
KubeDaemonSetRolloutStuck (1 active)
alert: KubeDaemonSetRolloutStuck
expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} != 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_available{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
  summary: DaemonSet rollout is stuck.
Labels: alertname="KubeDaemonSetRolloutStuck" container="kube-state-metrics" daemonset="fluentd" endpoint="http" instance="10.43.59.31:8080" job="kube-state-metrics" namespace="elastic-stack" pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6" service="prometheus-operator-kube-state-metrics" severity="warning"
State: firing    Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC    Value: 4
KubeJobCompletion (1 active)
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics",namespace=~".*"} - kube_job_status_succeeded{job="kube-state-metrics",namespace=~".*"} > 0
for: 12h
labels:
  severity: warning
annotations:
  description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
  summary: Job did not complete in time
Labels: alertname="KubeJobCompletion" container="kube-state-metrics" endpoint="http" instance="10.43.59.31:8080" job="kube-state-metrics" job_name="logical-backup-doc-hr-1769648400" namespace="hr-production" pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6" service="prometheus-operator-kube-state-metrics" severity="warning"
State: firing    Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC    Value: 1
KubeJobFailed (1 active)
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"} > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
  summary: Job failed to complete.
Labels: alertname="KubeJobFailed" condition="true" container="kube-state-metrics" endpoint="http" instance="10.43.59.31:8080" job="kube-state-metrics" job_name="logical-backup-doc-hr-1769648400" namespace="hr-production" pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6" service="prometheus-operator-kube-state-metrics" severity="warning"
State: firing    Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC    Value: 1
KubeContainerWaiting (0 active)
alert: KubeContainerWaiting
expr: sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*"}) > 0
for: 1h
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
  summary: Pod container waiting longer than 1 hour
KubeDaemonSetMisScheduled (0 active)
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} > 0
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
  summary: DaemonSet pods are misscheduled.
KubeDaemonSetNotScheduled (0 active)
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} > 0
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
  summary: DaemonSet pods are not scheduled.
KubeDeploymentGenerationMismatch (0 active)
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_deployment_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
  summary: Deployment generation mismatch due to possible roll-back
KubeDeploymentReplicasMismatch (0 active)
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics",namespace=~".*"} != kube_deployment_status_replicas_available{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
  summary: Deployment has not matched the expected number of replicas.
KubeHpaMaxedOut (0 active)
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"} == kube_hpa_spec_max_replicas{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
  summary: HPA is running at max replicas
KubeHpaReplicasMismatch (0 active)
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics",namespace=~".*"} != kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"}) and (kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"} > kube_hpa_spec_min_replicas{job="kube-state-metrics",namespace=~".*"}) and (kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"} < kube_hpa_spec_max_replicas{job="kube-state-metrics",namespace=~".*"}) and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
  severity: warning
annotations:
  description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
  summary: HPA has not matched the desired number of replicas.
KubePodCrashLooping (0 active)
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace=~".*"}[5m]) * 60 * 5 > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
  summary: Pod is crash looping.
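The factor 60 * 5 converts the per-second restart rate into restarts per 5 minutes, which is the figure printed in the description. A worked example with an illustrative rate:

    # hypothetical observed rate: rate(kube_pod_container_status_restarts_total[5m]) = 0.01 /s
    # 0.01 * 60 * 5 = 3 restarts per 5 minutes  -> > 0, so the alert fires once sustained for 15m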
KubePodNotReady (0 active)
alert: KubePodNotReady
expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
  summary: Pod has been in a non-ready state for more than 15 minutes.
KubeStatefulSetGenerationMismatch (0 active)
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
  summary: StatefulSet generation mismatch due to possible roll-back
KubeStatefulSetReplicasMismatch (0 active)
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
  summary: StatefulSet has not matched the expected number of replicas.
KubeStatefulSetUpdateNotRolledOut (0 active)
alert: KubeStatefulSetUpdateNotRolledOut
expr: (max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics",namespace=~".*"} unless kube_statefulset_status_update_revision{job="kube-state-metrics",namespace=~".*"}) * (kube_statefulset_replicas{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
  summary: StatefulSet update has not been rolled out.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-resources.yaml > kubernetes-resources
CPUThrottlingHigh (1 active)
alert: CPUThrottlingHigh
expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)
for: 15m
labels:
  severity: info
annotations:
  description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
  summary: Processes experience elevated CPU throttling.
Labels: alertname="CPUThrottlingHigh" container="ngate-exporter" namespace="ngate-exporter" pod="ngate-exporter-78cfd45866-qqd2d" severity="info"
State: firing    Active Since: 2026-03-05 10:02:40.545087902 +0000 UTC    Value: 0.7258064516129032
KubeCPUOvercommit (0 active)
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
  severity: info
annotations:
  description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
  summary: Cluster has overcommitted CPU resource requests.
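The right-hand side (n - 1) / n is the share of allocatable CPU that remains if one node is lost, so the alert fires when the sum of Pod CPU requests would no longer fit on the surviving nodes. A worked example with an assumed cluster of four equally sized nodes:

    # 4 nodes x 8 allocatable cores = 32 cores total
    # threshold = (4 - 1) / 4 = 0.75
    # requests > 0.75 * 32 = 24 cores  -> losing one node leaves only 24 cores, so the cluster is overcommitted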
KubeCPUQuotaOvercommit (0 active)
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted CPU resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
  summary: Cluster has overcommitted CPU resource requests.
KubeMemoryOvercommit (0 active)
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeMemoryQuotaOvercommit (0 active)
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted memory resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeQuotaAlmostFull (0 active)
alert: KubeQuotaAlmostFull
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 0.9 < 1
for: 15m
labels:
  severity: info
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
  summary: Namespace quota is going to be full.
KubeQuotaExceeded (0 active)
alert: KubeQuotaExceeded
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
  summary: Namespace quota has exceeded the limits.
KubeQuotaFullyUsed (0 active)
alert: KubeQuotaFullyUsed
expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) == 1
for: 15m
labels:
  severity: info
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
  summary: Namespace quota is fully used.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-storage.yaml > kubernetes-storage
KubePersistentVolumeErrors (0 active)
alert: KubePersistentVolumeErrors
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0
for: 5m
labels:
  severity: critical
annotations:
  description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
  summary: PersistentVolume is having issues with provisioning.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} < 0.03
for: 1m
labels:
  severity: critical
annotations:
  description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
  severity: warning
annotations:
  description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
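The warning variant combines a static threshold (less than 15% free) with a linear forecast: predict_linear fits the last 6 hours of available bytes and extrapolates 4 * 24 * 3600 seconds (four days) ahead, and a negative result means the fitted line crosses zero inside that horizon. The same check as an ad-hoc query, with a placeholder claim name:

    # will this volume run out of space within 4 days, judging by the last 6 hours?
    predict_linear(
      kubelet_volume_stats_available_bytes{persistentvolumeclaim="my-pvc"}[6h],
      4 * 24 * 3600
    ) < 0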
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system-apiserver.yaml > kubernetes-system-apiserver
AggregatedAPIErrors (0 active)
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
  severity: warning
annotations:
  description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
  summary: An aggregated API has reported errors.
KubeAPIDown (0 active)
alert: KubeAPIDown
expr: absent(up{job="apiserver"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: KubeAPI has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
  summary: Target disappeared from Prometheus target discovery.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
  severity: warning
annotations:
  description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
  severity: critical
annotations:
  description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
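Both thresholds are plain seconds applied to the 1st percentile of remaining client-certificate lifetime: 604800 = 7 * 24 * 3600 (7 days) for the warning rule and 86400 = 24 * 3600 (24 hours) for the critical one.

    # warning:  p01 remaining lifetime < 7 * 24 * 3600 = 604800 s (7 days)
    # critical: p01 remaining lifetime <     24 * 3600 =  86400 s (24 hours)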
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system-kubelet.yaml > kubernetes-system-kubelet
KubeNodeNotReady (0 active)
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $labels.node }} has been unready for more than 15 minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
  summary: Node is not ready.
KubeNodeReadinessFlapping (0 active)
alert: KubeNodeReadinessFlapping
expr: sum by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2
for: 15m
labels:
  severity: warning
annotations:
  description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
  summary: Node readiness status is flapping.
KubeNodeUnreachable (0 active)
alert: KubeNodeUnreachable
expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
  summary: Node is unreachable.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
  severity: critical
annotations:
  description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
  severity: warning
annotations:
  description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateRenewalErrors (0 active)
alert: KubeletClientCertificateRenewalErrors
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
  summary: Kubelet has failed to renew its client certificate.
KubeletDown (0 active)
alert: KubeletDown
expr: absent(up{job="kubelet",metrics_path="/metrics"} == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: Kubelet has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
  summary: Target disappeared from Prometheus target discovery.
KubeletPlegDurationHigh (0 active)
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
  severity: warning
annotations:
  description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
  summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
KubeletPodStartUpLatencyHigh (0 active)
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"} > 60
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
  summary: Kubelet Pod startup latency is too high.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
  severity: warning
annotations:
  description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
  severity: critical
annotations:
  description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateRenewalErrors (0 active)
alert: KubeletServerCertificateRenewalErrors
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
  summary: Kubelet has failed to renew its server certificate.
KubeletTooManyPods (0 active)
alert: KubeletTooManyPods
expr: count by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) / max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) > 0.95
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
  summary: Kubelet is running at capacity.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system.yaml > kubernetes-system
KubeClientErrors (0 active)
alert: KubeClientErrors
expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01
for: 15m
labels:
  severity: warning
annotations:
  description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
  summary: Kubernetes API server client is experiencing errors.
KubeVersionMismatch (0 active)
alert: KubeVersionMismatch
expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different semantic versions of Kubernetes components running.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
  summary: Different semantic versions of Kubernetes components running.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-node-exporter.yaml > node-exporter
NodeClockNotSynchronising (0 active)
alert: NodeClockNotSynchronising
expr: min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
  summary: Clock not synchronising.
NodeClockSkewDetected (0 active)
alert: NodeClockSkewDetected
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
  summary: Clock skew detected.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 3% inodes left.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 5% inodes left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 5% space left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 3% space left.
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 24 hours.
NodeHighNumberConntrackEntriesUsed (0 active)
alert: NodeHighNumberConntrackEntriesUsed
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
  severity: warning
annotations:
  description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
  summary: Number of conntrack entries is getting close to the limit.
NodeNetworkReceiveErrs (0 active)
alert: NodeNetworkReceiveErrs
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
  summary: Network interface is reporting many receive errors.
NodeNetworkTransmitErrs (0 active)
alert: NodeNetworkTransmitErrs
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
  summary: Network interface is reporting many transmit errors.
NodeRAIDDegraded (0 active)
alert: NodeRAIDDegraded
expr: node_md_disks_required - ignoring(state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
  severity: critical
annotations:
  description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
  summary: RAID Array is degraded
NodeRAIDDiskFailure (0 active)
alert: NodeRAIDDiskFailure
expr: node_md_disks{state="fail"} > 0
labels:
  severity: warning
annotations:
  description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
  summary: Failed device in RAID array
NodeTextFileCollectorScrapeError (0 active)
alert: NodeTextFileCollectorScrapeError
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
  severity: warning
annotations:
  description: Node Exporter text file collector failed to scrape.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
  summary: Node Exporter text file collector failed to scrape.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-node-network.yaml > node-network
NodeNetworkInterfaceFlapping (0 active)
alert: NodeNetworkInterfaceFlapping
expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2
for: 2m
labels:
  severity: warning
annotations:
  message: Network interface "{{ $labels.device }}" is changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-prometheus-operator.yaml > prometheus-operator
PrometheusOperatorListErrors (0 active)
alert: PrometheusOperatorListErrors
expr: (sum by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator-kube-p-operator",namespace="default"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator-kube-p-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
  severity: warning
annotations:
  description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors
  summary: Errors while performing list operations in controller.
PrometheusOperatorNodeLookupErrors (0 active)
alert: PrometheusOperatorNodeLookupErrors
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator-kube-p-operator",namespace="default"}[5m]) > 0.1
for: 10m
labels:
  severity: warning
annotations:
  description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors
  summary: Errors while reconciling Prometheus.
PrometheusOperatorNotReady (0 active)
alert: PrometheusOperatorNotReady
expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator-kube-p-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
  severity: warning
annotations:
  description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready
  summary: Prometheus operator not ready
PrometheusOperatorReconcileErrors (0 active)
alert: PrometheusOperatorReconcileErrors
expr: (sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator-kube-p-operator",namespace="default"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator-kube-p-operator",namespace="default"}[5m]))) > 0.1
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors
  summary: Errors while reconciling controller.
PrometheusOperatorRejectedResources (0 active)
alert: PrometheusOperatorRejectedResources
expr: min_over_time(prometheus_operator_managed_resources{job="prometheus-operator-kube-p-operator",namespace="default",state="rejected"}[5m]) > 0
for: 5m
labels:
  severity: warning
annotations:
  description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources
  summary: Resources rejected by Prometheus operator
PrometheusOperatorSyncFailed (0 active)
alert: PrometheusOperatorSyncFailed
expr: min_over_time(prometheus_operator_syncs{job="prometheus-operator-kube-p-operator",namespace="default",status="failed"}[5m]) > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed
  summary: Last controller reconciliation failed
PrometheusOperatorWatchErrors (0 active)
alert: PrometheusOperatorWatchErrors
expr: (sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator-kube-p-operator",namespace="default"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator-kube-p-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
  severity: warning
annotations:
  description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors
  summary: Errors while performing watch operations in controller.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-prometheus.yaml > prometheus
PrometheusBadConfig (0 active)
alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) == 0
for: 10m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
  summary: Failed Prometheus configuration reload.
PrometheusDuplicateTimestamps (0 active)
alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value  }} samples/s with different values but duplicated timestamp.
  summary: Prometheus is dropping samples with duplicate timestamps.
PrometheusErrorSendingAlertsToAnyAlertmanager (0 active)
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m])) * 100 > 3
for: 15m
labels:
  severity: critical
annotations:
  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
PrometheusErrorSendingAlertsToSomeAlertmanagers (0 active)
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m])) * 100 > 1
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
  summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
PrometheusMissingRuleEvaluations (0 active)
alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
  summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
PrometheusNotConnectedToAlertmanagers (0 active)
alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) < 1
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
  summary: Prometheus is not connected to any Alertmanagers.
PrometheusNotIngestingSamples (0 active)
alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) <= 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
  summary: Prometheus is not ingesting samples.
PrometheusNotificationQueueRunningFull (0 active)
alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
  summary: Prometheus alert notification queue predicted to run full in less than 30m.
PrometheusOutOfOrderTimestamps (0 active)
alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value  }} samples/s with timestamps arriving out of order.
  summary: Prometheus drops samples with out-of-order timestamps.
PrometheusRemoteStorageFailures (0 active)
alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]))) * 100 > 1
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
  summary: Prometheus fails to send samples to remote storage.
PrometheusRemoteWriteBehind (0 active)
alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m])) > 120
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
  summary: Prometheus remote write is behind.
PrometheusRemoteWriteDesiredShards (0 active)
alert: PrometheusRemoteWriteDesiredShards
expr: (max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-operator-kube-p-prometheus",namespace="default"}` $labels.instance | query | first | value }}.
  summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
PrometheusRuleFailures (0 active)
alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > 0
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
  summary: Prometheus is failing rule evaluations.
PrometheusTSDBCompactionsFailing (0 active)
alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[3h]) > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
  summary: Prometheus has issues compacting blocks.
PrometheusTSDBReloadsFailing (0 active)
alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[3h]) > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
  summary: Prometheus has issues reloading blocks from disk.
PrometheusTargetLimitHit (0 active)
alert: PrometheusTargetLimitHit
expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-operator-kube-p-prometheus",namespace="default"}[5m]) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
  summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-rules-common-kubernetes-storage.yaml > kubernetes-storage
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} < 0.15
for: 1m
labels:
  severity: critical
annotations:
  description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/doc-production-file-storage.yaml > file-storage
file-storage-error (1 active)
alert: file-storage-error
expr: (sum(rate(file_storage_upload_seconds_count{exception!="none",namespace="doc-production"}[5m]) or vector(0)) + sum(rate(file_storage_download_seconds_count{exception!="none",namespace="doc-production"}[5m]) or vector(0))) * 60 > bool 1
for: 5m
labels:
  alertname: analytics-telegram
  severity: warning
annotations:
  message: ⚠️ File-storage - the number of errors is above normal!
Labels: alertname="file-storage-error", severity="warning"
State: firing
Active Since: 2026-01-27 23:05:35.361022021 +0000 UTC
Value: 0
file-storage-speed (1 active)
Labels: alertname="file-storage-speed", severity="warning"
State: firing
Active Since: 2026-02-12 00:59:35.361022021 +0000 UTC
Value: 0