|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-kube-prometheus-stack-rabbitmq.yaml > ngate-certificate-expiration-monitoring
|
alert: NgateCertificateExpiring
expr: ng_infra_cert_expiry < 2.592e+06
labels:
severity: warning
annotations:
description: Certificate expires in 30 days or less
summary: Certificate {{ $labels.name }} is about to expire or has expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
|
alert: NgateCertificateExpiring
expr: ng_infra_cert_expiry < 1.2096e+06
labels:
severity: critical
annotations:
description: Certificate expires in 14 days or less
summary: Certificate {{ $labels.name }} is about to expire or has expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
|
alert: NgateKeyExpiring
expr: ng_infra_pk_expiry < 2.592e+06
labels:
severity: warning
annotations:
description: Key expires in 30 days or less
summary: Key {{ $labels.name }} is about to expire or has expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
|
alert: NgateKeyExpiring
expr: ng_infra_pk_expiry < 1.2096e+06
labels:
severity: critical
annotations:
description: Key expires in 14 days or less
summary: Key {{ $labels.name }} is about to expire or has expired. Link https://grafana.basis.center/d/uU0S3zISz/ngate?orgId=1
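|
The thresholds above are plain seconds: 2.592e+06 s is exactly 30 days and 1.2096e+06 s is exactly 14 days. Assuming ng_infra_cert_expiry and ng_infra_pk_expiry report the seconds remaining until expiry (which is what these thresholds imply), a quick PromQL sketch shows the days left per item, soonest first:

    # days until each certificate expires, soonest first
    sort(ng_infra_cert_expiry / 86400)

    # the same view for private keys
    sort(ng_infra_pk_expiry / 86400)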
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-kube-prometheus-stack-rabbitmq.yaml > rabbitmq-messages
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-alertmanager.rules.yaml > alertmanager.rules
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-general.rules.yaml > general.rules
|
Labels:       {alertname="TargetDown", job="ozon-agent", namespace="doc-production", service="ozon-agent", severity="warning"}
State:        firing
Active Since: 2026-01-27 23:05:50.223808037 +0000 UTC
Value:        100
Annotations:
  message: 100% of the ozon-agent/ozon-agent targets in doc-production namespace are down.
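|
The TargetDown rule itself is not expanded in this dump; in a stock kube-prometheus install the general.rules expression is roughly the following, which also explains the firing value of 100 (the percentage of matching targets that are down):

    100 * (count by(job, namespace, service) (up == 0)
           / count by(job, namespace, service) (up)) > 10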
|
alert: Watchdog
expr: vector(1)
labels:
severity: none
annotations:
message: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
Labels:       {alertname="Watchdog", severity="none"}
State:        firing
Active Since: 2026-01-27 23:05:50.223808037 +0000 UTC
Value:        1
Annotations:
  message: This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty.
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kube-apiserver-slos.yaml > kube-apiserver-slos
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kube-state-metrics.yaml > kube-state-metrics
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-apps.yaml > kubernetes-apps
|
Labels:       {alertname="KubeDaemonSetRolloutStuck", container="kube-state-metrics", daemonset="fluentd", endpoint="http", instance="10.43.59.31:8080", job="kube-state-metrics", namespace="elastic-stack", pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6", service="prometheus-operator-kube-state-metrics", severity="warning"}
State:        firing
Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC
Value:        4
Annotations:
  description: DaemonSet elastic-stack/fluentd has not finished or progressed for at least 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
  summary: DaemonSet rollout is stuck.
|
Labels:       {alertname="KubeJobCompletion", container="kube-state-metrics", endpoint="http", instance="10.43.59.31:8080", job="kube-state-metrics", job_name="logical-backup-doc-hr-1769648400", namespace="hr-production", pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6", service="prometheus-operator-kube-state-metrics", severity="warning"}
State:        firing
Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC
Value:        1
Annotations:
  description: Job hr-production/logical-backup-doc-hr-1769648400 is taking more than 12 hours to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
  summary: Job did not complete in time
|
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"} > 0
for: 15m
labels:
severity: warning
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing the failed job after investigation should clear this alert.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
Labels:       {alertname="KubeJobFailed", condition="true", container="kube-state-metrics", endpoint="http", instance="10.43.59.31:8080", job="kube-state-metrics", job_name="logical-backup-doc-hr-1769648400", namespace="hr-production", pod="prometheus-operator-kube-state-metrics-66b4c95cd9-f4zf6", service="prometheus-operator-kube-state-metrics", severity="warning"}
State:        firing
Active Since: 2026-03-04 14:30:29.276276712 +0000 UTC
Value:        1
Annotations:
  description: Job hr-production/logical-backup-doc-hr-1769648400 failed to complete. Removing the failed job after investigation should clear this alert.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
  summary: Job failed to complete.
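|
The rendered description points at the remediation: after investigation, deleting the Job (kubectl delete job logical-backup-doc-hr-1769648400 -n hr-production) removes the kube_job_failed series and the alert resolves. To see which CronJob keeps producing failed Jobs, a PromQL sketch, assuming kube-state-metrics also exports kube_job_owner:

    # failed Jobs joined with the CronJob that owns them
    (kube_job_failed{condition="true"} == 1)
      * on(namespace, job_name) group_left(owner_name)
        max by(namespace, job_name, owner_name) (kube_job_owner{owner_kind="CronJob"})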
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-resources.yaml > kubernetes-resources
|
Labels:       {alertname="CPUThrottlingHigh", container="ngate-exporter", namespace="ngate-exporter", pod="ngate-exporter-78cfd45866-qqd2d", severity="info"}
State:        firing
Active Since: 2026-03-05 10:02:40.545087902 +0000 UTC
Value:        0.7230910763569457
Annotations:
  description: 72.31% throttling of CPU in namespace ngate-exporter for container ngate-exporter in pod ngate-exporter-78cfd45866-qqd2d.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
  summary: Processes experience elevated CPU throttling.
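|
The value 0.723 is a ratio: the container hit its CPU limit in 72.31% of CFS scheduler periods. The rule is not expanded in this dump; in the upstream kubernetes-mixin the expression is roughly:

    sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
      /
    sum(increase(container_cpu_cfs_periods_total{container!=""}[5m])) by (container, pod, namespace)
      > (25 / 100)

The usual fix is raising or removing the container's CPU limit rather than silencing the alert.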
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-storage.yaml > kubernetes-storage
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system-apiserver.yaml > kubernetes-system-apiserver
|
alert: AggregatedAPIErrors
expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
severity: warning
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
summary: An aggregated API has reported errors.
|
alert: KubeAPIDown
expr: absent(up{job="apiserver"} == 1)
for: 15m
labels:
severity: critical
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
summary: Target disappeared from Prometheus target discovery.
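|
Note the absent() pattern: up{job="apiserver"} == 0 alone would miss the case where the target vanishes from service discovery entirely (no series at all), while absent(up{job="apiserver"} == 1) returns a 1-valued series exactly when no apiserver target is both present and up, so the alert fires in both failure modes. KubeletDown below uses the same construction.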
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system-kubelet.yaml > kubernetes-system-kubelet
|
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
|
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
|
alert: KubeletDown
expr: absent(up{job="kubelet",metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
summary: Target disappeared from Prometheus target discovery.
|
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
|
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
|
alert: KubeletServerCertificateRenewalErrors
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
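|
The kubelet thresholds are again seconds: 604800 s is 7 days (warning) and 86400 s is 1 day (critical). If the kubelet rotates its own certificates these alerts self-heal; a KubeletConfiguration sketch (serving-certificate CSRs still need to be approved, e.g. by an approver controller):

    apiVersion: kubelet.config.k8s.io/v1beta1
    kind: KubeletConfiguration
    rotateCertificates: true   # rotate the client certificate automatically
    serverTLSBootstrap: true   # request serving certs via CSRs so they rotate too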
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-kubernetes-system.yaml > kubernetes-system
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-node-exporter.yaml > node-exporter
|
alert: NodeRAIDDegraded
expr: node_md_disks_required - ignoring(state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
summary: RAID array is degraded
|
alert: NodeRAIDDiskFailure
expr: node_md_disks{state="fail"} > 0
labels:
severity: warning
annotations:
description: At least one device in the RAID array on {{ $labels.instance }} has failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
summary: Failed device in RAID array
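|
In the NodeRAIDDegraded expression, ignoring(state) is what makes the subtraction possible: node_md_disks_required carries no state label, while node_md_disks does, so the label is dropped before matching. The per-array shortfall can be inspected directly:

    # > 0 means the array runs with fewer active disks than it requires
    node_md_disks_required - ignoring(state) node_md_disks{state="active"}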
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-node-network.yaml > node-network
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-prometheus-operator.yaml > prometheus-operator
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-operator-kube-p-prometheus.yaml > prometheus
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/default-prometheus-rules-common-kubernetes-storage.yaml > kubernetes-storage
|
/etc/prometheus/rules/prometheus-prometheus-operator-kube-p-prometheus-rulefiles-0/doc-production-file-storage.yaml > file-storage
|
Labels:       {alertname="file-storage-error", severity="warning"}
State:        firing
Active Since: 2026-01-27 23:05:35.361022021 +0000 UTC
Value:        0
Annotations:
  message: ⚠️ File-storage - the error count is above normal!
|
Labels:       {alertname="file-storage-speed", severity="warning"}
State:        firing
Active Since: 2026-02-12 00:59:35.361022021 +0000 UTC
Value:        0
Annotations:
  message: 📦 File-storage - the average sec/MB for download or upload exceeds the norm!
|