From 947f8ace549500b2f4213fe80eeb50243e931606 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 10:03:58 +0000 Subject: [PATCH] [monitoring] Remove stale MySQL InnoDB Cluster alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MySQL was migrated from InnoDB Cluster (Bitnami chart + mysql-operator) to a standalone StatefulSet on 2026-04-16. Two Prometheus alerts still referenced the old topology and were firing falsely against resources that no longer exist: - MySQLDown: queried kube_statefulset_status_replicas_ready{statefulset="mysql-cluster"} — that StatefulSet was deleted as part of Phase 1 of the migration. - MySQLOperatorDown: queried kube_deployment_status_replicas_available{namespace="mysql-operator"} — the operator Deployment was removed in Phase 1. Replacement availability monitoring for the standalone MySQL pod will be handled via an Uptime Kuma MySQL-connection monitor (out of scope for this change — no Prometheus replacement alert is being added, per the migration plan's "simpler is better" principle). MySQLBackupStale and MySQLBackupNeverSucceeded are retained — they query the mysql-backup CronJob, which is unchanged by the migration. Also removes MySQLDown from the two inhibition rules (NodeDown and NFSServerUnresponsive) that previously suppressed it during cascade outages — the alert no longer exists, so those references are now dead. 
Closes: code-3sa Co-Authored-By: Claude Opus 4.7 (1M context) --- .../monitoring/prometheus_chart_values.tpl | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index a1724da6..fa6ccf94 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -73,12 +73,12 @@ alertmanager: - source_matchers: - alertname = NodeDown target_matchers: - - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" + - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" # NFS down causes mass pod failures and NFS-dependent service outages - source_matchers: - alertname = NFSServerUnresponsive target_matchers: - - alertname =~ 
"PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown" + - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown" # Traefik down makes service-level alerts noise - source_matchers: - alertname = TraefikDown @@ -1340,13 +1340,6 @@ serverFiles: severity: critical annotations: summary: "PostgreSQL pod {{ $labels.pod }} is not ready" - - alert: MySQLDown - expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1 - for: 5m - labels: - severity: critical - annotations: - summary: "MySQL InnoDB Cluster has no ready replicas" - alert: RedisDown expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1 for: 5m @@ -1391,13 +1384,6 @@ serverFiles: severity: warning annotations: summary: "CNPG operator down — PostgreSQL failover/management degraded" - - alert: MySQLOperatorDown - expr: (kube_deployment_status_replicas_available{namespace="mysql-operator", deployment="mysql-operator"} or on() vector(0)) < 1 - for: 10m - labels: - severity: warning - annotations: - summary: "MySQL operator down — InnoDB Cluster management degraded" - name: Cluster rules: - alert: NodeDown