[dns] readiness gate — replace auth-required zone-count probe with DNS parity check
Zone-count parity required hitting /api/zones/list which requires auth. The null_resource has no access to the Technitium admin password (it's declared `sensitive = true` on the module variable), so we were probing with an empty token and getting 200 OK with an error JSON — silently returning 0 zones for every instance. Replaced the HTTP probe with a second DNS check: dig idrac.viktorbarzin.lan on each pod, require the same A record from all three. This catches both "zone not loaded on an instance" and "zone drift between primary and replicas" without needing any HTTP client or credentials. The AXFR chain guarantees all three should converge on the same value. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f09be1524d
commit
364df9f2ea
1 changed files with 18 additions and 37 deletions
|
|
@ -42,10 +42,13 @@ resource "null_resource" "technitium_readiness_gate" {
|
||||||
kubectl -n $NS rollout status deploy/$d --timeout=180s
|
kubectl -n $NS rollout status deploy/$d --timeout=180s
|
||||||
done
|
done
|
||||||
|
|
||||||
# 2. Per-pod DNS check. Technitium pods have `dig` but no HTTP client,
|
# 2. Per-pod DNS check + content parity. Technitium pods have `dig` but
|
||||||
# so we probe the DNS answer directly — if the pod can resolve
|
# no HTTP client, so we use DNS directly. Each pod must return an A
|
||||||
# idrac.viktorbarzin.lan from its local zone data, the server is
|
# record for idrac.viktorbarzin.lan, AND the answer must match across
|
||||||
# functional.
|
# all three instances. This catches:
|
||||||
|
# - Zone not loaded on an instance (NXDOMAIN / empty)
|
||||||
|
# - Zone drift between primary and replicas (different A record)
|
||||||
|
# The AXFR chain means all three should converge on the same value.
|
||||||
PODS=$(kubectl -n $NS get pod -l dns-server=true -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
|
PODS=$(kubectl -n $NS get pod -l dns-server=true -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')
|
||||||
if [ -z "$PODS" ]; then
|
if [ -z "$PODS" ]; then
|
||||||
echo "ERROR: no dns-server=true pods found"
|
echo "ERROR: no dns-server=true pods found"
|
||||||
|
|
@ -54,53 +57,31 @@ resource "null_resource" "technitium_readiness_gate" {
|
||||||
|
|
||||||
# Zone load can take tens of seconds after a memory-bump rollout, so retry
|
# Zone load can take tens of seconds after a memory-bump rollout, so retry
|
||||||
# up to 6 times with 10s backoff before giving up.
|
# up to 6 times with 10s backoff before giving up.
|
||||||
|
ANSWERS=""
|
||||||
for POD in $PODS; do
|
for POD in $PODS; do
|
||||||
echo "-> dig @127.0.0.1 idrac.viktorbarzin.lan on $POD"
|
echo "-> dig @127.0.0.1 idrac.viktorbarzin.lan on $POD"
|
||||||
OK=0
|
ANSWER=""
|
||||||
for TRY in 1 2 3 4 5 6; do
|
for TRY in 1 2 3 4 5 6; do
|
||||||
ANSWER=$(kubectl -n $NS exec "$POD" -- dig +short +time=5 +tries=2 @127.0.0.1 idrac.viktorbarzin.lan A 2>&1 || true)
|
ANSWER=$(kubectl -n $NS exec "$POD" -- dig +short +time=5 +tries=2 @127.0.0.1 idrac.viktorbarzin.lan A 2>&1 || true)
|
||||||
if echo "$ANSWER" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
|
if echo "$ANSWER" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'; then
|
||||||
OK=1; break
|
break
|
||||||
fi
|
fi
|
||||||
echo " attempt $TRY: no A record yet, sleeping 10s"
|
echo " attempt $TRY: no A record yet, sleeping 10s"
|
||||||
sleep 10
|
sleep 10
|
||||||
|
ANSWER=""
|
||||||
done
|
done
|
||||||
if [ "$OK" -eq 0 ]; then
|
if [ -z "$ANSWER" ]; then
|
||||||
echo "ERROR: pod $POD never returned an A record for idrac.viktorbarzin.lan (last: $ANSWER)"
|
echo "ERROR: pod $POD never returned an A record for idrac.viktorbarzin.lan"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
echo " $POD → $ANSWER"
|
||||||
|
ANSWERS="$ANSWERS $ANSWER"
|
||||||
done
|
done
|
||||||
|
|
||||||
# 3. Zone-count parity via an ephemeral curl pod (technitium image has
|
# 3. Content parity — all three instances must agree on the A record.
|
||||||
# no HTTP client). Pod auto-deletes on success via --rm.
|
UNIQ=$(echo "$ANSWERS" | tr ' ' '\n' | grep -v '^$' | sort -u | wc -l)
|
||||||
JOB_NAME="readiness-probe-$RANDOM"
|
|
||||||
CHECK_SCRIPT='
|
|
||||||
set -e
|
|
||||||
for SVC in technitium-web technitium-secondary-web technitium-tertiary-web; do
|
|
||||||
COUNT=$(curl -sf --max-time 10 http://$SVC:5380/api/zones/list?token= | tr "," "\n" | grep -c "\"name\":" || true)
|
|
||||||
printf "%s %s\n" "$SVC" "$${COUNT:-0}"
|
|
||||||
done
|
|
||||||
'
|
|
||||||
RESULT=$(kubectl -n $NS run $JOB_NAME --rm -i --restart=Never --quiet \
|
|
||||||
--image=curlimages/curl:latest --image-pull-policy=IfNotPresent \
|
|
||||||
--timeout=60s -- sh -c "$CHECK_SCRIPT" 2>/dev/null || true)
|
|
||||||
echo "$RESULT"
|
|
||||||
|
|
||||||
COUNTS=$(echo "$RESULT" | awk '{print $2}' | grep -E '^[0-9]+$')
|
|
||||||
if [ -z "$COUNTS" ]; then
|
|
||||||
echo "ERROR: zone-count probe returned no valid counts"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
# Sanity: Technitium always has built-in zones (localhost, reverse ptrs).
|
|
||||||
# All-zeros means the probe failed to reach the API, not a true parity pass.
|
|
||||||
MIN=$(echo "$COUNTS" | sort -n | head -1)
|
|
||||||
if [ "$MIN" -eq 0 ]; then
|
|
||||||
echo "ERROR: zone-count probe returned 0 for at least one instance — probe likely failed to reach API"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
UNIQ=$(echo "$COUNTS" | sort -u | wc -l)
|
|
||||||
if [ "$UNIQ" -gt 1 ]; then
|
if [ "$UNIQ" -gt 1 ]; then
|
||||||
echo "ERROR: zone counts differ across instances"
|
echo "ERROR: instances returned different A records for idrac.viktorbarzin.lan: $ANSWERS"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue