From 000d306542a5aa3bd42fa60b77651efe407711eb Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 23 May 2026 08:41:14 +0000
Subject: [PATCH] technitium: add viktorbarzin.me apex DNS drift probe + alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every internal *.viktorbarzin.me hostname (~80 services) chains through
the split-horizon `viktorbarzin.me` apex A record. If the apex drifts
(ISP rollover, accidental edit), every internal service breaks at once —
the 2026-05-22 ha-sofia incident was exactly this.

This adds a backstop probe so the next drift surfaces within roughly
10-15 min (5-min probe interval plus the 10m alert hold) instead of via
user-reported outage:

- CronJob `viktorbarzin-apex-probe` in `technitium` namespace, every
  5 min, resolves `viktorbarzin.me A` against the Technitium LB IP
  (10.0.20.201) and pushes `viktorbarzin_apex_correct` +
  `_last_correct_timestamp` to Pushgateway. Python+dnspython, ~30 LOC.
- 3 Prometheus alerts:
  - `ViktorBarzinApexDrift` (critical, 10m) — apex resolved to anything
    other than 10.0.20.200.
  - `ViktorBarzinApexProbeStale` (warning, 5m on 15m gap) — probe
    stopped succeeding.
  - `ViktorBarzinApexProbeNeverRun` (warning, 30m absent) — probe never
    reported.
- Added `ViktorBarzinApexDrift` and `ViktorBarzinApexProbeStale` to the
  Alertmanager inhibit-rule target matchers for NodeDown and
  NFSServerUnresponsive, alongside EmailRoundtrip*.

Verified: rules loaded as inactive (apex is correct), metric flowing,
and a manual probe job run passed.

---
Reviewer notes (text between this separator and the first `diff --git`
line is commentary and is not part of the change):

Note 1: the two `target_matchers` edits in the first hunk sit under
`source_matchers` entries for NodeDown and NFSServerUnresponsive, so they
extend the set of alerts matched as targets of those two rules. Only
ViktorBarzinApexDrift and ViktorBarzinApexProbeStale are added there;
ViktorBarzinApexProbeNeverRun is not.

Note 2: ViktorBarzinApexDrift keys off `viktorbarzin_apex_correct == 0`.
The probe script also leaves that gauge at 0 when the DNS query raises an
exception, so the alert covers "cannot resolve" as well as "wrong address".

Note 3: ViktorBarzinApexProbeStale reads
`viktorbarzin_apex_last_correct_timestamp`, which the probe only pushes on
a correct result, so it is expected to fire alongside a sustained drift.

NOTE(review): leading whitespace inside the hunks below was reconstructed
from a whitespace-collapsed copy; confirm it against the target files
before applying.

 .../monitoring/prometheus_chart_values.tpl    |  28 ++++-
 stacks/technitium/modules/technitium/main.tf  | 103 ++++++++++++++++++
 2 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 2c4b59a1..4bb3c92b 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -84,12 +84,12 @@ alertmanager:
       - source_matchers:
           - alertname = NodeDown
         target_matchers:
-          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
+          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|ViktorBarzinApexDrift|ViktorBarzinApexProbeStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
       # NFS down causes mass pod failures and NFS-dependent service outages
       - source_matchers:
           - alertname = NFSServerUnresponsive
         target_matchers:
-          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable"
+          - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|ViktorBarzinApexDrift|ViktorBarzinApexProbeStale|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable"
       # Traefik down makes service-level alerts noise
       - source_matchers:
           - alertname = TraefikDown
@@ -2337,6 +2337,30 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
+          - alert: ViktorBarzinApexDrift
+            expr: viktorbarzin_apex_correct{job="viktorbarzin-apex-probe"} == 0
+            for: 10m
+            labels:
+              severity: critical
+            annotations:
+              summary: "viktorbarzin.me apex A drifted from expected 10.0.20.200"
+              description: "Technitium serves the split-horizon apex for ~80 *.viktorbarzin.me CNAMEs. If this is wrong, every internal service (auth, vault, immich, ha-sofia, ...) breaks. Check Technitium primary zone records via API or web console."
+          - alert: ViktorBarzinApexProbeStale
+            expr: (time() - viktorbarzin_apex_last_correct_timestamp{job="viktorbarzin-apex-probe"}) > 900
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "viktorbarzin.me apex probe has not seen a correct result in >15 min"
+              description: "Probe may be failing intermittently or apex may be drifting. Check CronJob `viktorbarzin-apex-probe` in `technitium` namespace."
+          - alert: ViktorBarzinApexProbeNeverRun
+            expr: absent(viktorbarzin_apex_correct{job="viktorbarzin-apex-probe"})
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "viktorbarzin.me apex probe never reported"
+              description: "Check `kubectl -n technitium get cronjob viktorbarzin-apex-probe` and the most recent job pod logs."
           - alert: AIOStreamsStreamCountLow
             expr: aiostreams_stream_count{job="aiostreams-stream-probe"} < 50
             for: 30m
diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf
index e113bf29..1f4a6051 100644
--- a/stacks/technitium/modules/technitium/main.tf
+++ b/stacks/technitium/modules/technitium/main.tf
@@ -696,3 +696,106 @@ resource "kubernetes_cron_job_v1" "technitium_dns_optimization" {
   }
 }
 
+# viktorbarzin.me apex DNS drift probe
+# Resolves `viktorbarzin.me A` against the Technitium LoadBalancer IP every
+# 5 min and pushes a Pushgateway gauge. Backstop for the entire
+# split-horizon zone: every internal `*.viktorbarzin.me` CNAME chains through
+# this apex, so if it drifts (ISP rollover, accidental edit), this is the
+# canary. Alerts: ViktorBarzinApexDrift, ViktorBarzinApexProbeStale,
+# ViktorBarzinApexProbeNeverRun in stacks/monitoring/.
+resource "kubernetes_cron_job_v1" "viktorbarzin_apex_probe" { # CronJob: queries the viktorbarzin.me apex A record and pushes the result to Pushgateway
+  metadata {
+    name      = "viktorbarzin-apex-probe" # also the Pushgateway job label the script pushes under
+    namespace = kubernetes_namespace.technitium.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace"
+    schedule                      = "*/5 * * * *" # every 5 minutes
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 3
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            container {
+              name  = "probe"
+              image = "docker.io/library/python:3.12-alpine" # stock image; the command below pip-installs dnspython and requests on every run
+              resources {
+                requests = {
+                  cpu    = "10m"
+                  memory = "48Mi"
+                }
+                limits = {
+                  memory = "96Mi"
+                }
+              }
+              command = ["/bin/sh", "-c", <<-EOT
+                pip install --quiet --disable-pip-version-check dnspython requests && python3 -c '
+import dns.resolver, requests, time, sys
+
+EXPECTED = {"10.0.20.200"}
+NAMESERVER = "10.0.20.201" # Technitium LB IP
+NAME = "viktorbarzin.me"
+PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/viktorbarzin-apex-probe"
+
+resolver = dns.resolver.Resolver(configure=False)
+resolver.nameservers = [NAMESERVER]
+resolver.timeout = 5
+resolver.lifetime = 8
+
+correct = 0
+observed = "unknown"
+try:
+    answer = resolver.resolve(NAME, "A")
+    ips = sorted(str(r) for r in answer)
+    observed = ",".join(ips)
+    correct = 1 if set(ips) <= EXPECTED and ips else 0
+    print(f"apex {NAME} -> {observed} (expected one of {EXPECTED}); correct={correct}")
+except Exception as e:
+    observed = f"error:{type(e).__name__}"
+    print(f"resolve error: {e}", file=sys.stderr)
+
+metric_lines = [
+    "# HELP viktorbarzin_apex_correct 1 if viktorbarzin.me apex resolves to expected IP, 0 otherwise",
+    "# TYPE viktorbarzin_apex_correct gauge",
+    f"viktorbarzin_apex_correct {correct}",
+]
+if correct:
+    metric_lines += [
+        "# HELP viktorbarzin_apex_last_correct_timestamp Unix time of last correct resolution",
+        "# TYPE viktorbarzin_apex_last_correct_timestamp gauge",
+        f"viktorbarzin_apex_last_correct_timestamp {int(time.time())}",
+    ]
+metrics = "\n".join(metric_lines) + "\n"
+try:
+    r = requests.post(PUSHGATEWAY, data=metrics, timeout=10)
+    print(f"pushgateway: {r.status_code}")
+except Exception as e:
+    print(f"pushgateway error: {e}", file=sys.stderr)
+sys.exit(0 if correct else 1)
+'
+                EOT
+              ]
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+            restart_policy = "OnFailure" # the probe script exits 1 when the apex is wrong or cannot be resolved
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+