From 364df9f2ea4f2952beab9fa1cb27b8d13217b344 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 19 Apr 2026 15:24:56 +0000 Subject: [PATCH] =?UTF-8?q?[dns]=20readiness=20gate=20=E2=80=94=20replace?= =?UTF-8?q?=20auth-required=20zone-count=20probe=20with=20DNS=20parity=20c?= =?UTF-8?q?heck?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zone-count parity required hitting /api/zones/list which requires auth. The null_resource has no access to the Technitium admin password (it's declared `sensitive = true` on the module variable), so we were probing with an empty token and getting 200 OK with an error JSON — silently returning 0 zones for every instance. Replaced the HTTP probe with a second DNS check: dig idrac.viktorbarzin.lan on each pod, require the same A record from all three. This catches both "zone not loaded on an instance" and "zone drift between primary and replicas" without needing any HTTP client or credentials. The AXFR chain guarantees all three should converge on the same value. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../modules/technitium/readiness.tf | 55 ++++++------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/stacks/technitium/modules/technitium/readiness.tf b/stacks/technitium/modules/technitium/readiness.tf index 2e7c0e51..a19dfb6a 100644 --- a/stacks/technitium/modules/technitium/readiness.tf +++ b/stacks/technitium/modules/technitium/readiness.tf @@ -42,10 +42,13 @@ resource "null_resource" "technitium_readiness_gate" { kubectl -n $NS rollout status deploy/$d --timeout=180s done - # 2. Per-pod DNS check. Technitium pods have `dig` but no HTTP client, - # so we probe the DNS answer directly — if the pod can resolve - # idrac.viktorbarzin.lan from its local zone data, the server is - # functional. + # 2. Per-pod DNS check + content parity. Technitium pods have `dig` but + # no HTTP client, so we use DNS directly. Each pod must return an A + # record for idrac.viktorbarzin.lan, AND the answer must match across + # all three instances. This catches: + # - Zone not loaded on an instance (NXDOMAIN / empty) + # - Zone drift between primary and replicas (different A record) + # The AXFR chain means all three should converge on the same value. PODS=$(kubectl -n $NS get pod -l dns-server=true -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}') if [ -z "$PODS" ]; then echo "ERROR: no dns-server=true pods found" @@ -54,53 +57,31 @@ resource "null_resource" "technitium_readiness_gate" { # Zone load can take tens of seconds after a memory-bump rollout, so retry # up to 6 times with 10s backoff before giving up. + ANSWERS="" for POD in $PODS; do echo "-> dig @127.0.0.1 idrac.viktorbarzin.lan on $POD" - OK=0 + ANSWER="" for TRY in 1 2 3 4 5 6; do ANSWER=$(kubectl -n $NS exec "$POD" -- dig +short +time=5 +tries=2 @127.0.0.1 idrac.viktorbarzin.lan A 2>&1 || true) if echo "$ANSWER" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then - OK=1; break + break fi echo " attempt $TRY: no A record yet, sleeping 10s" sleep 10 + ANSWER="" done - if [ "$OK" -eq 0 ]; then - echo "ERROR: pod $POD never returned an A record for idrac.viktorbarzin.lan (last: $ANSWER)" + if [ -z "$ANSWER" ]; then + echo "ERROR: pod $POD never returned an A record for idrac.viktorbarzin.lan" exit 1 fi + echo " $POD → $ANSWER" + ANSWERS="$ANSWERS $ANSWER" done - # 3. Zone-count parity via an ephemeral curl pod (technitium image has - # no HTTP client). Pod auto-deletes on success via --rm. - JOB_NAME="readiness-probe-$RANDOM" - CHECK_SCRIPT=' - set -e - for SVC in technitium-web technitium-secondary-web technitium-tertiary-web; do - COUNT=$(curl -sf --max-time 10 http://$SVC:5380/api/zones/list?token= | tr "," "\n" | grep -c "\"name\":" || true) - printf "%s %s\n" "$SVC" "$${COUNT:-0}" - done - ' - RESULT=$(kubectl -n $NS run $JOB_NAME --rm -i --restart=Never --quiet \ - --image=curlimages/curl:latest --image-pull-policy=IfNotPresent \ - --timeout=60s -- sh -c "$CHECK_SCRIPT" 2>/dev/null || true) - echo "$RESULT" - - COUNTS=$(echo "$RESULT" | awk '{print $2}' | grep -E '^[0-9]+$') - if [ -z "$COUNTS" ]; then - echo "ERROR: zone-count probe returned no valid counts" - exit 1 - fi - # Sanity: Technitium always has built-in zones (localhost, reverse ptrs). - # All-zeros means the probe failed to reach the API, not a true parity pass. - MIN=$(echo "$COUNTS" | sort -n | head -1) - if [ "$MIN" -eq 0 ]; then - echo "ERROR: zone-count probe returned 0 for at least one instance — probe likely failed to reach API" - exit 1 - fi - UNIQ=$(echo "$COUNTS" | sort -u | wc -l) + # 3. Content parity — all three instances must agree on the A record. + UNIQ=$(echo "$ANSWERS" | tr ' ' '\n' | grep -v '^$' | sort -u | wc -l) if [ "$UNIQ" -gt 1 ]; then - echo "ERROR: zone counts differ across instances" + echo "ERROR: instances returned different A records for idrac.viktorbarzin.lan: $ANSWERS" exit 1 fi