From 000d306542a5aa3bd42fa60b77651efe407711eb Mon Sep 17 00:00:00 2001
From: Viktor Barzin <me@viktorbarzin.me>
Date: Sat, 23 May 2026 08:41:14 +0000
Subject: [PATCH] technitium: add viktorbarzin.me apex DNS drift probe + alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every internal *.viktorbarzin.me hostname (~80 services) chains through the
split-horizon `viktorbarzin.me` apex A record. If the apex drifts (ISP
rollover, accidental edit), every internal service breaks at once — the
2026-05-22 ha-sofia incident was exactly this.

This adds a backstop probe so the next drift surfaces in <10 min instead
of via user-reported outage:

- CronJob `viktorbarzin-apex-probe` in `technitium` namespace, every 5 min,
  resolves `viktorbarzin.me A` against the Technitium LB IP (10.0.20.201)
  and pushes `viktorbarzin_apex_correct` + `_last_correct_timestamp` to
  Pushgateway. Python+dnspython, ~30 LOC.

- 3 Prometheus alerts:
  - `ViktorBarzinApexDrift` (critical, 10m) — apex resolved to anything
    other than 10.0.20.200.
  - `ViktorBarzinApexProbeStale` (warning, 5m on 15m gap) — probe stopped
    succeeding.
  - `ViktorBarzinApexProbeNeverRun` (warning, 30m absent) — probe never
    reported.

- Added the new Drift and ProbeStale alert names to the `target_matchers` of the
  NodeDown and NFSServerUnresponsive inhibit rules, alongside EmailRoundtrip*.

Verified: rules loaded as inactive (apex is correct), metric flowing, manual
probe job pass observed.
---
 .../monitoring/prometheus_chart_values.tpl    |  28 ++++-
 stacks/technitium/modules/technitium/main.tf  | 103 ++++++++++++++++++
 2 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 2c4b59a1..4bb3c92b 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -84,12 +84,12 @@ alertmanager:
       - source_matchers:
           - alertname = NodeDown
         target_matchers:
-          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
+          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|ViktorBarzinApexDrift|ViktorBarzinApexProbeStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
       # NFS down causes mass pod failures and NFS-dependent service outages
       - source_matchers:
           - alertname = NFSServerUnresponsive
         target_matchers:
-          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable"
+          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|ViktorBarzinApexDrift|ViktorBarzinApexProbeStale|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable"
       # Traefik down makes service-level alerts noise
       - source_matchers:
           - alertname = TraefikDown
@@ -2337,6 +2337,30 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
+          - alert: ViktorBarzinApexDrift  # probe pushed 0: apex did not resolve to the expected IP (or the lookup failed)
+            expr: viktorbarzin_apex_correct{job="viktorbarzin-apex-probe"} == 0
+            for: 10m  # probe CronJob runs every 5 min, so this spans about two runs
+            labels:
+              severity: critical
+            annotations:
+              summary: "viktorbarzin.me apex A drifted from expected 10.0.20.200"
+              description: "Technitium serves the split-horizon apex for ~80 *.viktorbarzin.me CNAMEs. If this is wrong, every internal service (auth, vault, immich, ha-sofia, ...) breaks. Check Technitium primary zone records via API or web console."
+          - alert: ViktorBarzinApexProbeStale  # the timestamp gauge is only pushed when the resolution was correct
+            expr: (time() - viktorbarzin_apex_last_correct_timestamp{job="viktorbarzin-apex-probe"}) > 900
+            for: 5m  # held on top of the 900 s (15 min) age threshold in expr
+            labels:
+              severity: warning
+            annotations:
+              summary: "viktorbarzin.me apex probe has not seen a correct result in >15 min"
+              description: "Probe may be failing intermittently or apex may be drifting. Check CronJob `viktorbarzin-apex-probe` in `technitium` namespace."
+          - alert: ViktorBarzinApexProbeNeverRun  # no viktorbarzin_apex_correct series exists for this job at all
+            expr: absent(viktorbarzin_apex_correct{job="viktorbarzin-apex-probe"})
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "viktorbarzin.me apex probe never reported"
+              description: "Check `kubectl -n technitium get cronjob viktorbarzin-apex-probe` and the most recent job pod logs."
           - alert: AIOStreamsStreamCountLow
             expr: aiostreams_stream_count{job="aiostreams-stream-probe"} < 50
             for: 30m
diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf
index e113bf29..1f4a6051 100644
--- a/stacks/technitium/modules/technitium/main.tf
+++ b/stacks/technitium/modules/technitium/main.tf
@@ -696,3 +696,106 @@ resource "kubernetes_cron_job_v1" "technitium_dns_optimization" {
   }
 }
 
+# viktorbarzin.me apex DNS drift probe
+# Resolves `viktorbarzin.me A` against the Technitium LoadBalancer IP every
+# 5 min and pushes a Pushgateway gauge. Backstop for the entire
+# split-horizon zone: every internal `*.viktorbarzin.me` CNAME chains through
+# this apex, so if it drifts (ISP rollover, accidental edit), this is the
+# canary. Alerts: ViktorBarzinApexDrift, ViktorBarzinApexProbeStale,
+# ViktorBarzinApexProbeNeverRun (defined in stacks/monitoring/).
+resource "kubernetes_cron_job_v1" "viktorbarzin_apex_probe" {
+  metadata {
+    name      = "viktorbarzin-apex-probe"
+    namespace = kubernetes_namespace.technitium.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace" # a new run supersedes one that is still in flight
+    schedule                      = "*/5 * * * *"
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 3
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            container {
+              name  = "probe"
+              image = "docker.io/library/python:3.12-alpine"
+              resources {
+                requests = {
+                  cpu    = "10m"
+                  memory = "48Mi"
+                }
+                limits = {
+                  memory = "96Mi"
+                }
+              }
+              command = ["/bin/sh", "-c", <<-EOT
+                pip install --quiet --disable-pip-version-check dnspython requests && python3 -c '
+import dns.resolver, requests, time, sys
+
+EXPECTED = {"10.0.20.200"}
+NAMESERVER = "10.0.20.201"  # Technitium LB IP
+NAME = "viktorbarzin.me"
+PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/viktorbarzin-apex-probe"
+
+resolver = dns.resolver.Resolver(configure=False)  # ignore the pod resolv.conf; query only NAMESERVER
+resolver.nameservers = [NAMESERVER]
+resolver.timeout = 5
+resolver.lifetime = 8
+
+correct = 0  # stays 0 on any resolve error, so a failed lookup is reported like drift
+try:
+    answer = resolver.resolve(NAME, "A")
+    ips = sorted(str(r) for r in answer)
+    observed = ",".join(ips)
+    correct = 1 if set(ips) <= EXPECTED and ips else 0
+    print(f"apex {NAME} -> {observed} (expected one of {EXPECTED}); correct={correct}")
+except Exception as e:
+    print(f"resolve error: {e}", file=sys.stderr)
+
+metric_lines = [
+    "# HELP viktorbarzin_apex_correct 1 if viktorbarzin.me apex resolves to expected IP, 0 otherwise",
+    "# TYPE viktorbarzin_apex_correct gauge",
+    f"viktorbarzin_apex_correct {correct}",
+]
+if correct:  # timestamp only advances on success; POST leaves the previously pushed value otherwise
+    metric_lines += [
+        "# HELP viktorbarzin_apex_last_correct_timestamp Unix time of last correct resolution",
+        "# TYPE viktorbarzin_apex_last_correct_timestamp gauge",
+        f"viktorbarzin_apex_last_correct_timestamp {int(time.time())}",
+    ]
+metrics = "\n".join(metric_lines) + "\n"
+try:
+    r = requests.post(PUSHGATEWAY, data=metrics, timeout=10)
+    r.raise_for_status()  # a 4xx/5xx reply must not be logged as a delivered sample
+    print(f"pushgateway: {r.status_code}")
+except Exception as e:
+    print(f"pushgateway error: {e}", file=sys.stderr)
+    sys.exit(1)  # fail the Job so a lost push is visible in job history and gets retried
+sys.exit(0 if correct else 1)
+'
+              EOT
+              ]
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+            restart_policy = "OnFailure"
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+