From 947f8ace549500b2f4213fe80eeb50243e931606 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 18 Apr 2026 10:03:58 +0000
Subject: [PATCH] [monitoring] Remove stale MySQL InnoDB Cluster alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MySQL was migrated from InnoDB Cluster (Bitnami chart + mysql-operator)
to a standalone StatefulSet on 2026-04-16. Two Prometheus alerts still
referenced the old topology and were evaluated against resources that
no longer exist, so they either fired falsely or could never fire:

- MySQLDown: queried kube_statefulset_status_replicas_ready{statefulset="mysql-cluster"}
  — that StatefulSet was deleted as part of Phase 1 of the migration.
- MySQLOperatorDown: queried kube_deployment_status_replicas_available{namespace="mysql-operator"}
  — the operator Deployment was removed in Phase 1.

Replacement availability monitoring for the standalone MySQL pod will
be handled via an Uptime Kuma MySQL-connection monitor (out of scope
for this change — no Prometheus replacement alert is being added, per
the migration plan's "simpler is better" principle).

MySQLBackupStale and MySQLBackupNeverSucceeded are retained — they
query the mysql-backup CronJob which is unchanged by the migration.

Also removes MySQLDown from the two inhibition rules (source alerts
NodeDown and NFSServerUnresponsive) that previously suppressed it during
cascade outages — the alert no longer exists, so both references are dead.

Closes: code-3sa

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../monitoring/prometheus_chart_values.tpl     | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index a1724da6..fa6ccf94 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -73,12 +73,12 @@ alertmanager:
       - source_matchers:
           - alertname = NodeDown
         target_matchers:
-          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
+          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
       # NFS down causes mass pod failures and NFS-dependent service outages
       - source_matchers:
           - alertname = NFSServerUnresponsive
         target_matchers:
-          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|MySQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown"
+          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown"
       # Traefik down makes service-level alerts noise
       - source_matchers:
           - alertname = TraefikDown
@@ -1340,13 +1340,6 @@ serverFiles:
               severity: critical
             annotations:
               summary: "PostgreSQL pod {{ $labels.pod }} is not ready"
-          - alert: MySQLDown
-            expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1
-            for: 5m
-            labels:
-              severity: critical
-            annotations:
-              summary: "MySQL InnoDB Cluster has no ready replicas"
           - alert: RedisDown
             expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1
             for: 5m
@@ -1391,13 +1384,6 @@ serverFiles:
               severity: warning
             annotations:
               summary: "CNPG operator down — PostgreSQL failover/management degraded"
-          - alert: MySQLOperatorDown
-            expr: (kube_deployment_status_replicas_available{namespace="mysql-operator", deployment="mysql-operator"} or on() vector(0)) < 1
-            for: 10m
-            labels:
-              severity: warning
-            annotations:
-              summary: "MySQL operator down — InnoDB Cluster management degraded"
       - name: Cluster
         rules:
           - alert: NodeDown