technitium: mirror mail-auth records into internal zone; fix redfish check [ci skip]

Two fixes from the post-DNS-internalization health sweep:

1. The internal viktorbarzin.me zone served only ingress A/CNAME records.
   Since the mailserver pods now resolve the domain through it (CoreDNS
   viktorbarzin.me:53 -> Technitium, 59a531b8), rspamd's SPF checks on
   inbound @viktorbarzin.me mail saw SPF=none and quarantined it — the
   Brevo email-roundtrip probe failed from the 16:20 run onward
   (EmailRoundtripFailing/Stale). The ingress-dns-sync CronJob now also
   maintains the static mail-auth records (SPF, brevo-code TXT, MX;
   DMARC + DKIM were already present), idempotently. Principle: the
   internal zone must be a SUPERSET of the public zone for every record
   type internal clients consume. Verified in-pod: all four types
   resolve; roundtrip re-probe green.

2. cluster_healthcheck #30 queried instant `up`, which goes stale for
   ~5 of every 10 minutes on the deliberate 10m redfish-idrac remnant
   job -> intermittent false "redfish-idrac=missing". Now uses
   last_over_time(up[15m]) — same answers for fast jobs.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-10 17:46:37 +00:00
parent e7fbf986fb
commit 00bc1e052d
2 changed files with 37 additions and 2 deletions

View file

@ -2026,11 +2026,16 @@ check_hardware_exporters() {
fi fi
done done
# Check Prometheus scrape targets for hardware exporters # Check Prometheus scrape targets for hardware exporters.
# last_over_time(up[15m]) instead of instant `up`: the redfish-idrac
# remnant scrapes every 10m (> the 5m staleness window), so an instant
# query returns it EMPTY ~half the time -> intermittent false "missing"
# (observed 2026-06-10). 15m covers the slowest job; identical answers
# for the 1-2m jobs.
local prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host") local prom_jobs=("snmp-idrac" "snmp-ups" "redfish-idrac" "proxmox-host")
local up_result local up_result
up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \ up_result=$($KUBECTL exec -n monitoring deploy/prometheus-server -- \
wget -q -O- 'http://localhost:9090/api/v1/query?query=up' 2>/dev/null || true) wget -q -O- 'http://localhost:9090/api/v1/query?query=last_over_time(up%5B15m%5D)' 2>/dev/null || true)
if [[ -n "$up_result" ]]; then if [[ -n "$up_result" ]]; then
for job in "${prom_jobs[@]}"; do for job in "${prom_jobs[@]}"; do

View file

@ -957,6 +957,36 @@ resource "kubernetes_cron_job_v1" "technitium_ingress_dns_sync" {
done done
echo "Sync complete. Created $$CREATED new records." echo "Sync complete. Created $$CREATED new records."
# Static mail-auth records (SPF / brevo verification / MX /
# DMARC / DKIM) mirrored from the PUBLIC Cloudflare zone.
# The internal zone is authoritative for viktorbarzin.me, and
# since 2026-06-10 the MAILSERVER pods resolve the domain
# through it (CoreDNS viktorbarzin.me:53 -> Technitium).
# Without these, rspamd's SPF/DKIM/DMARC checks on inbound
# @viktorbarzin.me mail (e.g. the Brevo email-roundtrip probe)
# see SPF=none/DKIM=fail and quarantine it (EmailRoundtrip*
# alerts, 2026-06-10). Internal zone must be a SUPERSET of the
# public one for every record type clients consume. Idempotent:
# checked against the zone dump before adding. If these change
# in Cloudflare, update here too (slow-moving).
#
# Caveats:
# - Presence checks are substring matches against the WHOLE zone
#   dump, not per-name lookups: a marker (or an MX record) found
#   on any name in the zone counts as "present".
# - An existing record whose value has drifted is NOT updated;
#   only missing records are added.
# - Every shell "$" is written "$$" in this script; keep that
#   escaping on any expansion added here.
#
# Fetch the zone once; all presence checks below reuse this dump.
# Parameters are URL-encoded the same way as in the add calls. A
# failed fetch leaves the dump empty instead of aborting the script.
ZONE_DUMP=$$(curl -sf -G "$$TECH_API/api/zones/records/get" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$ZONE" --data-urlencode "listZone=true") || ZONE_DUMP=""

# add_txt NAME MARK VALUE
#   NAME  - fully-qualified record name to create
#   MARK  - literal substring identifying the record in the zone dump
#   VALUE - TXT record text to add when MARK is not found
# Logs one "mail-auth:" line per call; a failed add is logged, never fatal.
add_txt() {
  NAME="$$1"; MARK="$$2"; VALUE="$$3"
  # -F: match the marker as a fixed string, not as a regex.
  if printf '%s\n' "$$ZONE_DUMP" | grep -qF -e "$$MARK"; then
    echo "mail-auth: $$NAME ($$MARK) present"
    return
  fi
  R=$$(curl -sf -G "$$TECH_API/api/zones/records/add" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$NAME" --data-urlencode "type=TXT" --data-urlencode "text=$$VALUE" --data-urlencode "ttl=3600") || true
  if printf '%s\n' "$$R" | grep -qF -e '"status":"ok"'; then
    echo "mail-auth: added TXT $$NAME ($$MARK)"
  else
    echo "mail-auth: FAILED TXT $$NAME -- $$R"
  fi
}

if [ -z "$$ZONE_DUMP" ]; then
  # Without a dump every presence check would miss and all records
  # would be blindly re-added, so skip this run and retry on the next.
  echo "mail-auth: FAILED to fetch zone dump for $$ZONE -- skipping mail-auth sync"
else
  add_txt "$$ZONE" "v=spf1" "v=spf1 include:spf.brevo.com ~all"
  add_txt "$$ZONE" "brevo-code" "brevo-code:a6ef1dd91b248559900246eb4e7ceebd"
  add_txt "_dmarc.$$ZONE" "v=DMARC1" "v=DMARC1; p=quarantine; pct=100; fo=1; ri=3600; sp=quarantine; adkim=r; aspf=r; rua=mailto:dmarc@viktorbarzin.me,mailto:adb84997@inbox.ondmarc.com; ruf=mailto:dmarc@viktorbarzin.me,mailto:adb84997@inbox.ondmarc.com,mailto:postmaster@viktorbarzin.me;"
  add_txt "mail._domainkey.$$ZONE" "v=DKIM1" "v=DKIM1; h=sha256; k=rsa; p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAs9XHeFBKhUAEJSikXx+P49Q3nEBbnaSpn6h/9TqIhKaZWSVa2uGUGYQieNdon7DEJZ0VFo0Tvm3/UFsy2qF7ZmF+E/+N8EmkcPrMlxgJT281dpk5DxrZ+kbzw/DosfHH71K6vCLB4rSexzxJHaAx0AUddI3bFUJGjMgCXXCMZF+p8YCx+DDGPIXz2FOTtlJlR7aeZ2xXavwE/lBfI3MLnsq7X+GhPjQEax070nndOdZI0S8HpZkVxdGWl1N2Ec6LukYm2RiUkEMMQHSYX7WF3JBc+CGqUyd706Iy/5oeC3UGwZSM2uLkrp8YBjmw/h1rAeyv/ITt6ZXraP/cIMRiVQIDAQAB"
  # MX: added only when the zone dump contains no MX record at all.
  if printf '%s\n' "$$ZONE_DUMP" | grep -qF -e '"type":"MX"'; then
    echo "mail-auth: MX present"
  else
    R=$$(curl -sf -G "$$TECH_API/api/zones/records/add" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$ZONE" --data-urlencode "type=MX" --data-urlencode "exchange=mail.viktorbarzin.me" --data-urlencode "preference=1" --data-urlencode "ttl=3600") || true
    if printf '%s\n' "$$R" | grep -qF -e '"status":"ok"'; then
      echo "mail-auth: added MX"
    else
      echo "mail-auth: FAILED MX -- $$R"
    fi
  fi
fi
# Pin the .lan ingress anchor A record to the LIVE Traefik LB IP. # Pin the .lan ingress anchor A record to the LIVE Traefik LB IP.
# *.viktorbarzin.lan ingress hosts CNAME to ingress.viktorbarzin.lan, # *.viktorbarzin.lan ingress hosts CNAME to ingress.viktorbarzin.lan,
# so a Traefik LB IP move that misses the .lan zone silently breaks # so a Traefik LB IP move that misses the .lan zone silently breaks