From 217a54be9d17791929364cd925b40d2ca14b3cb3 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 3 Jul 2026 10:05:54 +0000 Subject: [PATCH 1/6] cloudflared: add most.viktorbarzin.me CNAME for Cloudflare Pages site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Viktor asked to host a static HTML site (the 'мост' school project, ОбУ „Отец Паисий", pulled from his Google Drive) on Cloudflare Pages with a custom domain, as a try-out of Pages hosting. The site content is deployed off-infra via wrangler to the Pages project 'most' (most-6if.pages.dev); this CNAME points most.viktorbarzin.me at it. The custom domain is already attached to the Pages project and is waiting on this DNS record to validate. Co-Authored-By: Claude Fable 5 --- docs/architecture/dns.md | 1 + stacks/cloudflared/modules/cloudflared/cloudflare.tf | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/docs/architecture/dns.md b/docs/architecture/dns.md index 6150d226..fe959b52 100644 --- a/docs/architecture/dns.md +++ b/docs/architecture/dns.md @@ -368,6 +368,7 @@ The Cloudflare tunnel uses a **wildcard rule** (`*.viktorbarzin.me → Traefik`) | TXT (MTA-STS) | 1 | `v=STSv1; id=20260412` | TLS enforcement | | TXT (TLSRPT) | 1 | `v=TLSRPTv1; rua=mailto:postmaster@...` | TLS reporting | | A (keyserver) | 1 | `130.162.165.220` (Oracle VPS) | PGP keyserver | +| CNAME (CF Pages) | 1 | `most-6if.pages.dev` (Cloudflare Pages) | `most` — static site hosted off-infra on CF Pages, content deployed via wrangler | ### Proxied vs Non-Proxied diff --git a/stacks/cloudflared/modules/cloudflared/cloudflare.tf b/stacks/cloudflared/modules/cloudflared/cloudflare.tf index ad4d9de8..58d87333 100644 --- a/stacks/cloudflared/modules/cloudflared/cloudflare.tf +++ b/stacks/cloudflared/modules/cloudflared/cloudflare.tf @@ -235,6 +235,18 @@ resource "cloudflare_record" "keyserver" { zone_id = var.cloudflare_zone_id } +# Cloudflare Pages site "мост" (ОбУ „Отец Паисий“ 
school static site). +# Content is deployed off-infra to the Pages project `most` via +# `wrangler pages deploy`; this record just points the custom domain at it. +resource "cloudflare_record" "most_pages" { + content = "most-6if.pages.dev" + name = "most" + proxied = true + ttl = 1 + type = "CNAME" + zone_id = var.cloudflare_zone_id +} + # Enable HTTP/3 (QUIC) for Cloudflare-proxied domains resource "cloudflare_zone_settings_override" "http3" { zone_id = var.cloudflare_zone_id From 7dd80b6c7c15bec8c414183f49d0d05583e3e9f2 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 3 Jul 2026 10:10:46 +0000 Subject: [PATCH 2/6] technitium: mirror most.viktorbarzin.me into the internal zone (CF Pages site) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The internal split-horizon zone is authoritative for viktorbarzin.me, so the new Cloudflare Pages site (most.viktorbarzin.me, added for Viktor's 'мост' school static site) NXDOMAINed for every internal client — LAN, VLANs and pods — while resolving fine externally. Per the superset rule, add it as a static CNAME (-> most-6if.pages.dev) in the ingress-dns-sync CronJob next to the mail-auth records, and document the off-infra-site case in dns.md. Co-Authored-By: Claude Fable 5 --- docs/architecture/dns.md | 2 +- stacks/technitium/modules/technitium/main.tf | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/architecture/dns.md b/docs/architecture/dns.md index fe959b52..106c2021 100644 --- a/docs/architecture/dns.md +++ b/docs/architecture/dns.md @@ -277,7 +277,7 @@ Technitium's **Split Horizon AddressTranslation** app post-processes DNS respons Config is synced to all 3 Technitium instances by CronJob `technitium-split-horizon-sync` (every 6h). 
-**Superset rule for the internal `viktorbarzin.me` zone**: it is authoritative for every internal client (pods included since 2026-06-10), so it must carry every record type those clients consume — not just ingress A/CNAMEs. The `technitium-ingress-dns-sync` CronJob therefore also maintains the static **mail-auth records** (apex SPF + brevo-code TXT, MX → mail.viktorbarzin.me, `_dmarc`, `mail._domainkey` DKIM), mirrored from the public Cloudflare zone. Without them, rspamd on the mailserver saw `SPF=none` for inbound `@viktorbarzin.me` mail and quarantined it (broke the Brevo email-roundtrip probe, 2026-06-10). If these records change in Cloudflare, update the sync script too. +**Superset rule for the internal `viktorbarzin.me` zone**: it is authoritative for every internal client (pods included since 2026-06-10), so it must carry every record type those clients consume — not just ingress A/CNAMEs. The `technitium-ingress-dns-sync` CronJob therefore also maintains the static **mail-auth records** (apex SPF + brevo-code TXT, MX → mail.viktorbarzin.me, `_dmarc`, `mail._domainkey` DKIM), mirrored from the public Cloudflare zone. Without them, rspamd on the mailserver saw `SPF=none` for inbound `@viktorbarzin.me` mail and quarantined it (broke the Brevo email-roundtrip probe, 2026-06-10). If these records change in Cloudflare, update the sync script too. The same applies to **off-infra sites** (e.g. `most` → CNAME `most-6if.pages.dev`, Cloudflare Pages): any public-only name with no Traefik ingress must be added as a static record in the sync script, or internal clients NXDOMAIN on it while it works fine externally. 
## NodeLocal DNSCache diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf index 966d11f3..a80c209c 100644 --- a/stacks/technitium/modules/technitium/main.tf +++ b/stacks/technitium/modules/technitium/main.tf @@ -1002,6 +1002,14 @@ resource "kubernetes_cron_job_v1" "technitium_ingress_dns_sync" { echo "mail-auth: MX present" fi + # Off-infra sites on Cloudflare Pages: the internal zone is + # authoritative (superset rule above), so public-only names + # with no Traefik ingress must be mirrored here or every + # internal client (LAN, VLANs, pods) gets NXDOMAIN for them. + # Target is the pages.dev host — resolves via upstream to CF + # edge IPs; normal egress, no hairpin involved. + add_cname "most.$$ZONE" "most-6if.pages.dev" + # Pin the .lan ingress anchor A record to the LIVE Traefik LB IP. # *.viktorbarzin.lan ingress hosts CNAME to ingress.viktorbarzin.lan, # so a Traefik LB IP move that misses the .lan zone silently breaks From e1bd1115623a6e71d14bb13ecca9a2ad87c844fe Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 3 Jul 2026 10:52:30 +0000 Subject: [PATCH 3/6] rename CF Pages site most.viktorbarzin.me -> bridge.viktorbarzin.me MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Viktor asked to rename the 'мост' school static site to 'bridge'. New Cloudflare Pages project 'bridge' (bridge-cv2.pages.dev) already deployed and the custom domain attached; this renames the public CNAME (TF resource most_pages -> bridge_pages, destroy+create swaps the record) and the internal split-horizon static CNAME in the ingress-dns-sync CronJob. The old 'most' Pages project and the stale internal 'most' record are removed out-of-band after this applies. 
Co-Authored-By: Claude Fable 5 --- docs/architecture/dns.md | 4 ++-- stacks/cloudflared/modules/cloudflared/cloudflare.tf | 8 ++++---- stacks/technitium/modules/technitium/main.tf | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/architecture/dns.md b/docs/architecture/dns.md index 106c2021..07c3b31c 100644 --- a/docs/architecture/dns.md +++ b/docs/architecture/dns.md @@ -277,7 +277,7 @@ Technitium's **Split Horizon AddressTranslation** app post-processes DNS respons Config is synced to all 3 Technitium instances by CronJob `technitium-split-horizon-sync` (every 6h). -**Superset rule for the internal `viktorbarzin.me` zone**: it is authoritative for every internal client (pods included since 2026-06-10), so it must carry every record type those clients consume — not just ingress A/CNAMEs. The `technitium-ingress-dns-sync` CronJob therefore also maintains the static **mail-auth records** (apex SPF + brevo-code TXT, MX → mail.viktorbarzin.me, `_dmarc`, `mail._domainkey` DKIM), mirrored from the public Cloudflare zone. Without them, rspamd on the mailserver saw `SPF=none` for inbound `@viktorbarzin.me` mail and quarantined it (broke the Brevo email-roundtrip probe, 2026-06-10). If these records change in Cloudflare, update the sync script too. The same applies to **off-infra sites** (e.g. `most` → CNAME `most-6if.pages.dev`, Cloudflare Pages): any public-only name with no Traefik ingress must be added as a static record in the sync script, or internal clients NXDOMAIN on it while it works fine externally. +**Superset rule for the internal `viktorbarzin.me` zone**: it is authoritative for every internal client (pods included since 2026-06-10), so it must carry every record type those clients consume — not just ingress A/CNAMEs. 
The `technitium-ingress-dns-sync` CronJob therefore also maintains the static **mail-auth records** (apex SPF + brevo-code TXT, MX → mail.viktorbarzin.me, `_dmarc`, `mail._domainkey` DKIM), mirrored from the public Cloudflare zone. Without them, rspamd on the mailserver saw `SPF=none` for inbound `@viktorbarzin.me` mail and quarantined it (broke the Brevo email-roundtrip probe, 2026-06-10). If these records change in Cloudflare, update the sync script too. The same applies to **off-infra sites** (e.g. `bridge` → CNAME `bridge-cv2.pages.dev`, Cloudflare Pages): any public-only name with no Traefik ingress must be added as a static record in the sync script, or internal clients NXDOMAIN on it while it works fine externally. ## NodeLocal DNSCache @@ -368,7 +368,7 @@ The Cloudflare tunnel uses a **wildcard rule** (`*.viktorbarzin.me → Traefik`) | TXT (MTA-STS) | 1 | `v=STSv1; id=20260412` | TLS enforcement | | TXT (TLSRPT) | 1 | `v=TLSRPTv1; rua=mailto:postmaster@...` | TLS reporting | | A (keyserver) | 1 | `130.162.165.220` (Oracle VPS) | PGP keyserver | -| CNAME (CF Pages) | 1 | `most-6if.pages.dev` (Cloudflare Pages) | `most` — static site hosted off-infra on CF Pages, content deployed via wrangler | +| CNAME (CF Pages) | 1 | `bridge-cv2.pages.dev` (Cloudflare Pages) | `bridge` — static site hosted off-infra on CF Pages, content deployed via wrangler | ### Proxied vs Non-Proxied diff --git a/stacks/cloudflared/modules/cloudflared/cloudflare.tf b/stacks/cloudflared/modules/cloudflared/cloudflare.tf index 58d87333..92554bb1 100644 --- a/stacks/cloudflared/modules/cloudflared/cloudflare.tf +++ b/stacks/cloudflared/modules/cloudflared/cloudflare.tf @@ -236,11 +236,11 @@ resource "cloudflare_record" "keyserver" { } # Cloudflare Pages site "мост" (ОбУ „Отец Паисий“ school static site). 
-# Content is deployed off-infra to the Pages project `most` via +# Content is deployed off-infra to the Pages project `bridge` via # `wrangler pages deploy`; this record just points the custom domain at it. -resource "cloudflare_record" "most_pages" { - content = "most-6if.pages.dev" - name = "most" +resource "cloudflare_record" "bridge_pages" { + content = "bridge-cv2.pages.dev" + name = "bridge" proxied = true ttl = 1 type = "CNAME" diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf index a80c209c..5dfe8cbf 100644 --- a/stacks/technitium/modules/technitium/main.tf +++ b/stacks/technitium/modules/technitium/main.tf @@ -1008,7 +1008,7 @@ resource "kubernetes_cron_job_v1" "technitium_ingress_dns_sync" { # internal client (LAN, VLANs, pods) gets NXDOMAIN for them. # Target is the pages.dev host — resolves via upstream to CF # edge IPs; normal egress, no hairpin involved. - add_cname "most.$$ZONE" "most-6if.pages.dev" + add_cname "bridge.$$ZONE" "bridge-cv2.pages.dev" # Pin the .lan ingress anchor A record to the LIVE Traefik LB IP. # *.viktorbarzin.lan ingress hosts CNAME to ingress.viktorbarzin.lan, From 5c42155b81342732badcfce4b549840d8f31100d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 3 Jul 2026 12:17:45 +0000 Subject: [PATCH 4/6] docs: Valia-sites domain language + ADR-0018 (off-infra Pages, in-cluster sync) Grill session with Viktor: his mother Valia will keep asking for 1-page site hosting, so the pattern is being made repeatable. Decisions: all Valia sites serve off-infra on Cloudflare Pages (survive homelab outages); one shared in-cluster CronJob mirrors her Drive folders every 10 min and redeploys on change; English subdomain names picked by Viktor; failed-Job-only visibility; stem95su migrates onto the pattern. CONTEXT.md gains Valia site / Content folder / Entry file; full rationale and rejected options in ADR-0018. 
Co-Authored-By: Claude Fable 5 --- CONTEXT.md | 15 ++++++ ...a-sites-off-infra-pages-in-cluster-sync.md | 47 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 docs/adr/0018-valia-sites-off-infra-pages-in-cluster-sync.md diff --git a/CONTEXT.md b/CONTEXT.md index 5cd34ebc..76b101d0 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -237,6 +237,20 @@ _Avoid_: expecting Diun to deploy; conflating with **Keel**. **Anubis**: A PoW reverse-proxy issuing a 30-day JWT cookie, used in front of public content-bearing sites without app-level auth (blog, wiki, landing pages). Never in front of Git, WebDAV, CalDAV, or API endpoints (clients can't solve PoW). +### Externally-authored sites + +**Valia site**: +A small public static site authored by Valia (Viktor's mother, external to the infra) and hosted for her under `<name>.viktorbarzin.me`. Its source of truth is a **Content folder** she owns; the live site is a mirror of that folder, fresh within ~10 minutes. Hosted **off-infra** (Cloudflare Pages) by decision: a homelab outage freezes content but never takes her sites down. Viktor picks the English subdomain name per site at registration (her folder names stay Bulgarian). Current instances: `stem95su`, `bridge`. +_Avoid_: "school site" (the family may grow beyond school projects); treating the deployed copy as editable — edits land only in the **Content folder**. + +**Content folder**: +The Google Drive folder (or subfolder) Valia shares with `vbarzin@gmail.com` holding one **Valia site**'s files. Strictly read-only from the infra side — nothing ever writes back to her Drive. Empty or half-uploaded folder states must never wipe a live site. +_Avoid_: syncing a folder root when the servable content lives in a subfolder (stem95su serves `stem claude/files/`, not the folder root). + +**Entry file**: +The HTML file a **Valia site** serves at `/`. Defaults to `index.html`; per-site override when she names it differently (stem95su: `stem_board.html`). 
The override is a registration-time setting, not a constraint on her authoring. +_Avoid_: asking Valia to rename her files to fit hosting conventions. + ## Relationships - A **Service** is defined by exactly one **Stack** — **flat** or wrapping a **Stack-local module** — which sources zero or more shared **Factory modules** and resolves to one or more K8s workloads. @@ -248,6 +262,7 @@ A PoW reverse-proxy issuing a 30-day JWT cookie, used in front of public content - A **Service**'s image reaches the cluster via **Woodpecker deploy** (push-driven, on commit) or **Keel** (poll-driven, on a new registry tag); **Diun** only notifies. Operator-managed StatefulSets are rolled by neither. - An owned **Service**'s image is built by GitHub Actions from the **Canonical repo**'s **GitHub mirror** and hosted on ghcr.io (ADR-0002); the **Forgejo registry** keeps only a frozen last-known-good tag per **Service**. - Tier-1 **State tier** state and ~12 app databases share one **CNPG** `pg-cluster`, reached through **PgBouncer**; their credentials rotate via the `vault-database` store. +- A **Valia site** mirrors exactly one **Content folder** and serves exactly one **Entry file** at `/`; the folder is hers, the subdomain name is Viktor's, the hosting is off-infra. ## Example dialogue diff --git a/docs/adr/0018-valia-sites-off-infra-pages-in-cluster-sync.md b/docs/adr/0018-valia-sites-off-infra-pages-in-cluster-sync.md new file mode 100644 index 00000000..5344382a --- /dev/null +++ b/docs/adr/0018-valia-sites-off-infra-pages-in-cluster-sync.md @@ -0,0 +1,47 @@ +# Valia sites are served off-infra (Cloudflare Pages), synced in-cluster + +Valia (Viktor's mother) authors small one-page static sites in Google Drive folders she +shares, and keeps asking for them to be hosted — two exist already (`stem95su`, `bridge`) +and more are expected. 
We decided all **Valia sites** are served **off-infra on Cloudflare +Pages** under `<name>.viktorbarzin.me`, kept fresh by **one shared in-cluster +CronJob** (`stacks/valia-sites/`) that mirrors each **Content folder** every 10 minutes +(rclone, drive.readonly) and re-deploys only on change (wrangler direct upload). The +existing in-cluster `stem95su` serving stack (nginx + NFS + ingress + per-site sync) +migrates onto this and is retired. + +Why off-infra serving: these are her sites, shown to teachers/parents — they must survive +homelab outages (cf. the 2026-06-27 egress incident that took every proxied in-cluster +site down). With Pages, a homelab outage degrades to "content frozen until we're back", +never "site down". Serving costs no cluster resources and no per-site nginx/PVC/ingress/ +Anubis. Why the syncer stays in-cluster anyway: secrets stay in Vault (no per-site GHA +secret sprawl), and the stem95su guard patterns (hard-fail on Drive auth errors, never +wipe a live site on an empty/partial folder, capped deletes) carry over wholesale. The +deliberate asymmetry — off-infra serving, on-infra syncing — is the point, not an +accident. + +## Considered options + +- **In-cluster everywhere** (generalise stem95su into a factory module): one roof, no + Cloudflare Pages dependency — but her sites share the homelab's fate and each site + spends cluster resources to serve static files a free CDN serves better. +- **Pages for new sites only**: less work now, two patterns and two runbooks forever. +- **GHA-scheduled sync** (fully off-infra pipeline): no cluster dependency at all, but + Drive + Cloudflare credentials would live as GitHub secrets per repo, outside Vault. + +## Consequences + +- Registration is one entry in the `sites` map (name, Content folder, optional Entry + file); CI applies Pages project, custom domain, public CNAME, and internal-DNS config + together. Names are English, picked by Viktor (most → bridge set the precedent). 
+- The internal split-horizon zone learns Valia sites from a ConfigMap the + `technitium-ingress-dns-sync` script consumes — declaratively, including **removal** + (the previous static-CNAME approach was add-only; a retired site left a stale record). +- Deploy-on-change is mandatory, not an optimisation: Pages caps monthly deployments on + the free tier, and a 10-minute cadence would burn ~4,300/month if unchanged runs + deployed. +- Failure visibility is **failed-Job-only** by explicit choice (no stale-sync alert, no + per-site uptime monitors, no notifications to Valia) — Viktor fields "it didn't + update" reports, consistent with the alert-noise-reduction posture. Revisit if a + silent stall actually bites. +- If the homelab is down, content updates pause; the sites keep serving last-deployed + content. Accepted degradation. From 8b80b4cc41ece1dd520f6ec4f549c3ff413a2422 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 3 Jul 2026 12:28:06 +0000 Subject: [PATCH 5/6] valia-sites: registry stack for Valia's Pages sites + declarative internal DNS (ADR-0018) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Valia keeps asking Viktor to host 1-page sites from her Drive folders; this makes it one map entry. New stacks/valia-sites: per site a CF Pages project + custom domain + proxied CNAME (bridge adopted via import{}), a ConfigMap feed (valia-sites-dns) the technitium ingress-dns-sync script now reconciles internal CNAMEs from (add/update/REMOVE — fixes the add-only stale-record gotcha), and one shared 10-min CronJob that mirrors each Content folder (rclone, drive.readonly, stem95su's guards) and wrangler-deploys ONLY on manifest change (free-tier deploy cap). Scoped CF Pages token + shared rclone conf in secret/valia-sites; the Global API Key never enters a pod. cloudflared forgets bridge's record via removed{} (no destroy). stem95su is in the map dns-parked (manage_dns=false) until its cutover commit. 
Co-Authored-By: Claude Fable 5 --- .github/workflows/build-valia-sites-sync.yml | 39 ++ .../modules/cloudflared/cloudflare.tf | 20 +- stacks/technitium/modules/technitium/main.tf | 50 ++- stacks/valia-sites/main.tf | 360 ++++++++++++++++++ stacks/valia-sites/sync-image/Dockerfile | 15 + stacks/valia-sites/terragrunt.hcl | 8 + stacks/valia-sites/variables.tf | 3 + 7 files changed, 478 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/build-valia-sites-sync.yml create mode 100644 stacks/valia-sites/main.tf create mode 100644 stacks/valia-sites/sync-image/Dockerfile create mode 100644 stacks/valia-sites/terragrunt.hcl create mode 100644 stacks/valia-sites/variables.tf diff --git a/.github/workflows/build-valia-sites-sync.yml b/.github/workflows/build-valia-sites-sync.yml new file mode 100644 index 00000000..090b7f5c --- /dev/null +++ b/.github/workflows/build-valia-sites-sync.yml @@ -0,0 +1,39 @@ +name: Build valia-sites-sync + +# ADR-0002 + ADR-0018: infra-owned image built off-infra on GHA → ghcr (public). +# Rclone + wrangler runner for the Valia-sites Content-folder mirror CronJob. +# Rebuilds are rare (tool pins only change deliberately) → dispatch + path. +# Security note: no untrusted event inputs are interpolated anywhere (only +# github.actor / github.sha / GITHUB_TOKEN — same shape as the other +# build-*.yml workflows in this repo). 
+on: + push: + branches: [master] + paths: + - 'stacks/valia-sites/sync-image/**' + workflow_dispatch: {} + +permissions: + contents: read + packages: write + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/build-push-action@v6 + with: + context: stacks/valia-sites/sync-image + platforms: linux/amd64 + provenance: false + push: true + tags: | + ghcr.io/viktorbarzin/valia-sites-sync:latest + ghcr.io/viktorbarzin/valia-sites-sync:${{ github.sha }} diff --git a/stacks/cloudflared/modules/cloudflared/cloudflare.tf b/stacks/cloudflared/modules/cloudflared/cloudflare.tf index 92554bb1..e6edc402 100644 --- a/stacks/cloudflared/modules/cloudflared/cloudflare.tf +++ b/stacks/cloudflared/modules/cloudflared/cloudflare.tf @@ -235,16 +235,16 @@ resource "cloudflare_record" "keyserver" { zone_id = var.cloudflare_zone_id } -# Cloudflare Pages site "мост" (ОбУ „Отец Паисий“ school static site). -# Content is deployed off-infra to the Pages project `bridge` via -# `wrangler pages deploy`; this record just points the custom domain at it. -resource "cloudflare_record" "bridge_pages" { - content = "bridge-cv2.pages.dev" - name = "bridge" - proxied = true - ttl = 1 - type = "CNAME" - zone_id = var.cloudflare_zone_id +# bridge.viktorbarzin.me (Cloudflare Pages, "мост" school site) moved to +# stacks/valia-sites (ADR-0018) — all Valia-site records live there now. +# Forget from this state WITHOUT destroying; valia-sites imports the live +# record by id. Delete this block once both stacks have applied. 
+removed { + from = cloudflare_record.bridge_pages + + lifecycle { + destroy = false + } } # Enable HTTP/3 (QUIC) for Cloudflare-proxied domains diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf index 5dfe8cbf..22895b7c 100644 --- a/stacks/technitium/modules/technitium/main.tf +++ b/stacks/technitium/modules/technitium/main.tf @@ -873,6 +873,14 @@ resource "kubernetes_cluster_role" "ingress_dns_sync" { resources = ["services"] verbs = ["get", "list"] } + # Read the Valia-sites internal-DNS feed (written by stacks/valia-sites, + # ADR-0018) so the sync can reconcile off-infra Pages CNAMEs declaratively. + rule { + api_groups = [""] + resources = ["configmaps"] + resource_names = ["valia-sites-dns"] + verbs = ["get"] + } } resource "kubernetes_cluster_role_binding" "ingress_dns_sync" { @@ -1002,13 +1010,41 @@ resource "kubernetes_cron_job_v1" "technitium_ingress_dns_sync" { echo "mail-auth: MX present" fi - # Off-infra sites on Cloudflare Pages: the internal zone is - # authoritative (superset rule above), so public-only names - # with no Traefik ingress must be mirrored here or every - # internal client (LAN, VLANs, pods) gets NXDOMAIN for them. - # Target is the pages.dev host — resolves via upstream to CF - # edge IPs; normal egress, no hairpin involved. - add_cname "bridge.$$ZONE" "bridge-cv2.pages.dev" + # Valia sites (ADR-0018) — off-infra Cloudflare Pages sites. + # The internal zone is authoritative (superset rule above), so + # these public-only names must exist here or every internal + # client NXDOMAINs on them. Reconciled DECLARATIVELY from the + # ConfigMap valia-sites-dns (written by stacks/valia-sites): + # ensure/update every entry, and DELETE stale records that + # left the map (site retired/renamed). Deletion is scoped to + # CNAMEs targeting *.pages.dev — nothing else is ever touched. + # Targets resolve upstream to CF edge IPs; no hairpin involved. 
+ VALIA=$$(kubectl get configmap valia-sites-dns -n technitium -o go-template='{{range $$k, $$v := .data}}{{$$k}} {{$$v}}{{"\n"}}{{end}}' 2>/dev/null || true) + if [ -n "$$VALIA" ]; then + printf '%s\n' "$$VALIA" | while read -r VNAME VTARGET; do + [ -z "$$VNAME" ] && continue + CUR=$$(curl -sf "$$TECH_API/api/zones/records/get?token=$$TOKEN&zone=$$ZONE&domain=$$VNAME.$$ZONE" | grep -o '"cname":"[^"]*"' | head -1 | cut -d'"' -f4) + if [ "$$CUR" = "$$VTARGET" ]; then + echo "valia: $$VNAME.$$ZONE ok" + continue + fi + if [ -n "$$CUR" ]; then + curl -sf -G "$$TECH_API/api/zones/records/delete" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$VNAME.$$ZONE" --data-urlencode "type=CNAME" --data-urlencode "cname=$$CUR" > /dev/null || true + fi + R=$$(curl -sf -G "$$TECH_API/api/zones/records/add" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$VNAME.$$ZONE" --data-urlencode "type=CNAME" --data-urlencode "cname=$$VTARGET" --data-urlencode "ttl=3600") || true + echo "$$R" | grep -q '"status":"ok"' && echo "valia: set $$VNAME.$$ZONE -> $$VTARGET" || echo "valia: FAILED $$VNAME.$$ZONE -- $$R" + done + # Deletion pass: zone CNAMEs targeting *.pages.dev that are + # no longer in the map. ZONE_DUMP predates this run's adds, + # but just-set names are in $VALIA so they're never deleted. 
+ printf '%s' "$$ZONE_DUMP" | tr ',' '\n' | awk -F'"' '/"name":/{n=$$4} /"cname":/{print n" "$$4}' | grep '\.pages\.dev *$$' | while read -r RNAME RTARGET; do + SHORT=$${RNAME%%.$$ZONE} + printf '%s\n' "$$VALIA" | grep -q "^$$SHORT " && continue + curl -sf -G "$$TECH_API/api/zones/records/delete" --data-urlencode "token=$$TOKEN" --data-urlencode "zone=$$ZONE" --data-urlencode "domain=$$RNAME" --data-urlencode "type=CNAME" --data-urlencode "cname=$$RTARGET" > /dev/null && echo "valia: removed stale $$RNAME -> $$RTARGET" + done + else + echo "valia: CM valia-sites-dns absent/unreadable -- skipping Pages CNAMEs this run" + fi # Pin the .lan ingress anchor A record to the LIVE Traefik LB IP. # *.viktorbarzin.lan ingress hosts CNAME to ingress.viktorbarzin.lan, diff --git a/stacks/valia-sites/main.tf b/stacks/valia-sites/main.tf new file mode 100644 index 00000000..f185aac5 --- /dev/null +++ b/stacks/valia-sites/main.tf @@ -0,0 +1,360 @@ +# Valia sites (ADR-0018): small static sites authored by Valia in Google Drive, +# served OFF-INFRA on Cloudflare Pages, mirrored by the in-cluster CronJob below +# every 10 minutes. Registering a new site = one entry in local.sites (plus +# Valia sharing the folder with vbarzin@gmail.com). Full runbook: +# docs/runbooks/valia-sites.md +# +# Per site this stack fans out: +# - cloudflare_pages_project + custom domain .viktorbarzin.me +# - public proxied CNAME -> .pages.dev (manage_dns gate) +# - internal split-horizon CNAME via ConfigMap valia-sites-dns consumed by +# the technitium-ingress-dns-sync script (declarative: add/update/REMOVE) +# - a slot in the shared sync CronJob (rclone mirror -> wrangler deploy) + +locals { + cloudflare_account_id = "02e035473cfc4834fb10c5d35470d8b4" # vbarzin@gmail.com's account (not a secret) + + # THE site registry. Keys are the public subdomain (English, Viktor picks — + # CONTEXT.md "Valia site"). 
folder_id = the Drive folder Valia shared (the + # Content folder); src_path = subfolder holding servable files ("" = root); + # entry_file = what / must serve (staged as index.html at deploy time). + # manage_dns = false parks a site's public CNAME + internal record while the + # name is still owned elsewhere (used for the stem95su ingress cutover). + sites = { + bridge = { + folder_id = "1YWwAtSTsJD9HOzckGRIFXigWqCgYSGEa" # "мост" — ОбУ „Отец Паисий“ + src_path = "" + entry_file = "index.html" + manage_dns = true + } + stem95su = { + folder_id = "1cmOI2jRyBJdnrVPgbr4kx2cx_4DY6pm_" # "claude" — 95. СУ STEM board + src_path = "stem claude/files" + entry_file = "stem_board.html" + manage_dns = false # flipped true in the cutover commit (record still owned by stacks/stem95su ingress_factory) + } + } + + dns_managed_sites = { for k, v in local.sites : k => v if v.manage_dns } +} + +# --------------------------------------------------------------------------- +# Cloudflare Pages: project + custom domain per site +# --------------------------------------------------------------------------- + +resource "cloudflare_pages_project" "site" { + for_each = local.sites + account_id = local.cloudflare_account_id + name = each.key + production_branch = "main" +} + +# bridge was created by hand (wrangler) on 2026-07-03 — adopt, don't recreate. +import { + to = cloudflare_pages_project.site["bridge"] + id = "02e035473cfc4834fb10c5d35470d8b4/bridge" +} + +resource "cloudflare_pages_domain" "site" { + for_each = local.sites + account_id = local.cloudflare_account_id + project_name = cloudflare_pages_project.site[each.key].name + domain = "${each.key}.viktorbarzin.me" +} + +import { + to = cloudflare_pages_domain.site["bridge"] + id = "02e035473cfc4834fb10c5d35470d8b4/bridge/bridge.viktorbarzin.me" +} + +# Public proxied CNAME. 
Gated on manage_dns: a site whose name is still served +# by an in-cluster ingress keeps its ingress_factory record until cutover +# (two records can't share one name). +resource "cloudflare_record" "site" { + for_each = local.dns_managed_sites + zone_id = var.cloudflare_zone_id + name = each.key + content = cloudflare_pages_project.site[each.key].subdomain + type = "CNAME" + proxied = true + ttl = 1 +} + +# bridge's record predates this stack (created 2026-07-03 in stacks/cloudflared, +# handed off via removed{} there) — adopt by id. +import { + to = cloudflare_record.site["bridge"] + id = "fd2c5dd4efe8fe38958944e74d0ced6d/ff4fb6f4900744d4b22de50d3fdd219b" +} + +# --------------------------------------------------------------------------- +# Internal split-horizon DNS feed (docs/architecture/dns.md "superset rule"): +# the technitium-ingress-dns-sync script reads this CM and reconciles internal +# CNAMEs for every entry — including deleting stale *.pages.dev records when +# an entry disappears (site retired/renamed). 
+# --------------------------------------------------------------------------- + +resource "kubernetes_config_map" "valia_sites_dns" { + metadata { + name = "valia-sites-dns" + namespace = "technitium" + labels = { "app.kubernetes.io/managed-by" = "valia-sites" } + } + data = { for k, v in local.dns_managed_sites : k => cloudflare_pages_project.site[k].subdomain } +} + +# --------------------------------------------------------------------------- +# The shared sync CronJob +# --------------------------------------------------------------------------- + +resource "kubernetes_namespace" "valia_sites" { + metadata { + name = "valia-sites" + labels = { + "istio-injection" : "disabled" + tier = local.tiers.aux + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Secrets: shared drive.readonly rclone conf + the SCOPED CF Pages token +# (Pages Read/Write only — the Global API Key never enters a pod). +resource "kubernetes_manifest" "sync_external_secret" { + field_manager { + force_conflicts = true + } + manifest = { + apiVersion = "external-secrets.io/v1" + kind = "ExternalSecret" + metadata = { + name = "valia-sites-sync" + namespace = kubernetes_namespace.valia_sites.metadata[0].name + } + spec = { + refreshInterval = "1h" + secretStoreRef = { + name = "vault-kv" + kind = "ClusterSecretStore" + } + target = { name = "valia-sites-sync" } + data = [ + { + secretKey = "rclone.conf" + remoteRef = { key = "valia-sites", property = "rclone_conf" } + }, + { + secretKey = "CLOUDFLARE_API_TOKEN" + remoteRef = { key = "valia-sites", property = "cloudflare_pages_token" } + }, + { + secretKey = "CLOUDFLARE_ACCOUNT_ID" + remoteRef = { key = "valia-sites", property = "account_id" } + }, + ] + } + } + depends_on = [kubernetes_namespace.valia_sites] +} + +# Site registry rendered for the job (folder ids aren't secrets). 
+resource "kubernetes_config_map" "sync_config" {
+  metadata {
+    name      = "valia-sites-config"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+  data = { "sites.json" = jsonencode(local.sites) }
+}
+
+# Last-deployed manifest hash per site — written by the job (merge-patch), so
+# TF must never fight it over data.
+resource "kubernetes_config_map" "sync_state" {
+  metadata {
+    name      = "valia-sites-state"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+  data = {}
+  lifecycle { ignore_changes = [data] }
+}
+
+resource "kubernetes_service_account" "sync" {
+  metadata {
+    name      = "valia-sites-sync"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+}
+
+# Least privilege: the job may only get/patch the valia-sites-state ConfigMap.
+resource "kubernetes_role" "sync_state" {
+  metadata {
+    name      = "valia-sites-sync-state"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+  rule {
+    api_groups     = [""]
+    resources      = ["configmaps"]
+    resource_names = ["valia-sites-state"]
+    verbs          = ["get", "patch"]
+  }
+}
+
+resource "kubernetes_role_binding" "sync_state" {
+  metadata {
+    name      = "valia-sites-sync-state"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "Role"
+    name      = kubernetes_role.sync_state.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.sync.metadata[0].name
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+  }
+}
+
+# Drive -> Cloudflare Pages sync, every 10 min; deploys a site only when the hash of its Drive listing changed.
+resource "kubernetes_cron_job_v1" "sync" {
+  metadata {
+    name      = "valia-sites-sync"
+    namespace = kubernetes_namespace.valia_sites.metadata[0].name
+    labels    = { app = "valia-sites", component = "sync" }
+  }
+  spec {
+    schedule                      = "*/10 * * * *"
+    concurrency_policy            = "Forbid"
+    successful_jobs_history_limit = 2
+    failed_jobs_history_limit     = 3
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 86400
+        template {
+          metadata { labels = { app = "valia-sites", component = "sync" } }
+          spec {
+            restart_policy       = "OnFailure"
+            service_account_name = kubernetes_service_account.sync.metadata[0].name
+            container {
+              name  = "sync"
+              image = "ghcr.io/viktorbarzin/valia-sites-sync:latest"
+              # Guards (mirroring stem95su's set): Drive list/auth errors and an
+              # unreadable sync state hard-fail the Job (the chosen visibility, ADR-0018);
+              # an empty folder or a missing entry file (matched literally) is skipped
+              # quietly so a live site is never wiped; deletes are capped. Deploy ONLY
+              # on remote-manifest change: CF Pages caps monthly deployments on the
+              # free tier, so 144 no-op deploys/day is not an option.
+              command = ["/bin/sh", "-c", <<-EOT
+                set -u
+                cp /config/rclone.conf /tmp/rc.conf
+                APISERVER="https://kubernetes.default.svc"
+                SA=/var/run/secrets/kubernetes.io/serviceaccount
+                KTOKEN=$$(cat $$SA/token); NS=$$(cat $$SA/namespace)
+                STATE_URL="$$APISERVER/api/v1/namespaces/$$NS/configmaps/valia-sites-state"
+                FAILED=0
+                for SITE in $$(jq -r 'keys[]' /sites/sites.json); do
+                  FOLDER=$$(jq -r --arg s "$$SITE" '.[$$s].folder_id' /sites/sites.json)
+                  SRC_PATH=$$(jq -r --arg s "$$SITE" '.[$$s].src_path' /sites/sites.json)
+                  ENTRY=$$(jq -r --arg s "$$SITE" '.[$$s].entry_file' /sites/sites.json)
+                  RC="rclone --config /tmp/rc.conf --drive-root-folder-id=$$FOLDER --drive-skip-gdocs"
+                  # 1. Remote manifest (path+size+hash) — metadata only, no download.
+                  MANIFEST=$$($$RC lsf "gdrive:$$SRC_PATH" -R --files-only --format phs 2>/tmp/lsf.err) || {
+                    echo "FATAL [$$SITE]: Drive list failed (auth/network):"; cat /tmp/lsf.err; FAILED=1; continue; }
+                  N=$$(printf '%s\n' "$$MANIFEST" | grep -c . || true)
+                  if [ "$$N" -lt 1 ] || ! printf '%s\n' "$$MANIFEST" | cut -d';' -f1 | grep -qxF -- "$$ENTRY"; then
+                    echo "GUARD [$$SITE]: N=$$N / $$ENTRY missing -- skipping, site untouched"; continue
+                  fi
+                  HASH=$$(printf '%s' "$$MANIFEST" | sha256sum | cut -d' ' -f1)
+                  # Unreadable state must hard-fail: an empty LAST would read as "changed" and redeploy on every run.
+                  STATE=$$(curl -sf --cacert $$SA/ca.crt -H "Authorization: Bearer $$KTOKEN" "$$STATE_URL") || {
+                    echo "FATAL [$$SITE]: state read failed -- skipping deploy"; FAILED=1; continue; }
+                  LAST=$$(printf '%s' "$$STATE" | jq -r --arg s "$$SITE" '.data[$$s] // ""')
+                  if [ "$$HASH" = "$$LAST" ]; then echo "OK [$$SITE]: unchanged"; continue; fi
+                  # 2. Content changed — pull and deploy.
+                  $$RC sync "gdrive:$$SRC_PATH" "/work/$$SITE" --exclude ".DS_Store" --fast-list --transfers 4 --max-delete 25 -v || {
+                    echo "FATAL [$$SITE]: rclone sync failed"; FAILED=1; continue; }
+                  if [ "$$ENTRY" != "index.html" ]; then
+                    cp "/work/$$SITE/$$ENTRY" "/work/$$SITE/index.html"
+                  fi
+                  wrangler pages deploy "/work/$$SITE" --project-name="$$SITE" --branch=main --commit-dirty=true || {
+                    echo "FATAL [$$SITE]: wrangler deploy failed"; FAILED=1; continue; }
+                  curl -sf --cacert $$SA/ca.crt -H "Authorization: Bearer $$KTOKEN" \
+                    -X PATCH -H "Content-Type: application/merge-patch+json" \
+                    -d "{\"data\":{\"$$SITE\":\"$$HASH\"}}" "$$STATE_URL" > /dev/null || {
+                    echo "WARN [$$SITE]: state patch failed (will redeploy next run)"; FAILED=1; }
+                  echo "DEPLOYED [$$SITE]: $$HASH"
+                done
+                exit $$FAILED
+              EOT
+              ]
+              env {
+                name = "CLOUDFLARE_API_TOKEN"
+                value_from {
+                  secret_key_ref {
+                    name = "valia-sites-sync"
+                    key  = "CLOUDFLARE_API_TOKEN"
+                  }
+                }
+              }
+              env {
+                name = "CLOUDFLARE_ACCOUNT_ID"
+                value_from {
+                  secret_key_ref {
+                    name = "valia-sites-sync"
+                    key  = "CLOUDFLARE_ACCOUNT_ID"
+                  }
+                }
+              }
+              resources {
+                requests = { cpu = "25m", memory = "128Mi" }
+                limits   = { memory = "512Mi" }
+              }
+              volume_mount {
+                name       = "rclone-config"
+                mount_path = "/config"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "sites-config"
+                mount_path = "/sites"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "work"
+                mount_path = "/work"
+              }
+            }
+            volume {
+              name = "rclone-config"
+              secret {
+                secret_name = "valia-sites-sync"
+                items {
+                  key  = "rclone.conf"
+                  path = "rclone.conf"
+                }
+              }
+            }
+            volume {
+              name = "sites-config"
+              config_map { name = kubernetes_config_map.sync_config.metadata[0].name }
+            }
+            volume {
+              name = "work"
+              empty_dir {}
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+  depends_on = [kubernetes_manifest.sync_external_secret]
+}
diff --git a/stacks/valia-sites/sync-image/Dockerfile b/stacks/valia-sites/sync-image/Dockerfile
new file mode 100644
index 00000000..abe93ef3
--- /dev/null
+++ b/stacks/valia-sites/sync-image/Dockerfile
@@ -0,0 +1,15 @@
+# valia-sites-sync: everything the 10-min Content-folder mirror needs, baked in
+# (no runtime installs — CronJob pods must not apk/npm on every start).
+# rclone pinned to match the proven stem95su version; wrangler pinned to major 4.
+FROM node:22-alpine
+
+RUN apk add --no-cache curl unzip ca-certificates jq \
+    && curl -fsSL https://downloads.rclone.org/v1.74.3/rclone-v1.74.3-linux-amd64.zip -o /tmp/rclone.zip \
+    && unzip -j /tmp/rclone.zip '*/rclone' -d /usr/local/bin \
+    && chmod +x /usr/local/bin/rclone \
+    && rm /tmp/rclone.zip \
+    && npm install -g wrangler@4 \
+    && npm cache clean --force
+
+# wrangler writes config/cache under $HOME; the CronJob runs as non-root node (uid 1000)
+ENV HOME=/tmp
diff --git a/stacks/valia-sites/terragrunt.hcl b/stacks/valia-sites/terragrunt.hcl
new file mode 100644
index 00000000..0d1c8e53
--- /dev/null
+++ b/stacks/valia-sites/terragrunt.hcl
@@ -0,0 +1,8 @@
+include "root" {
+  path = find_in_parent_folders()
+}
+
+dependency "platform" {
+  config_path  = "../platform"
+  skip_outputs = true
+}
diff --git a/stacks/valia-sites/variables.tf b/stacks/valia-sites/variables.tf
new file mode 100644
index 00000000..3d4b8d3e
--- /dev/null
+++ b/stacks/valia-sites/variables.tf
@@ -0,0 +1,3 @@
+variable "cloudflare_zone_id" {
+  type = string
+}
From 695e020111d51abd91f23e242cf3c45d40685bc4 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Fri, 3 Jul 2026 12:31:53 +0000
Subject: [PATCH 6/6] =?UTF-8?q?cloudflared:=20move=20bridge=20removed{}=20?=
 =?UTF-8?q?to=20stack=20root=20=E2=80=94=20removed=20blocks=20are=20root-m?=
 =?UTF-8?q?odule-only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pipeline 461 failed terraform init: the removed{} handoff block sat in the
stack-local module, but Terraform only allows removed blocks in the root
module. Same intent, correct position (from =
module.cloudflared.cloudflare_record.bridge_pages, destroy=false). Without
this the stale state entry would make the next cloudflared apply destroy the
record valia-sites now owns.

Co-Authored-By: Claude Fable 5
---
 .../cloudflared/modules/cloudflared/cloudflare.tf | 13 +++----------
 stacks/cloudflared/removed-bridge.tf              | 12 ++++++++++++
 2 files changed, 15 insertions(+), 10 deletions(-)
 create mode 100644 stacks/cloudflared/removed-bridge.tf

diff --git a/stacks/cloudflared/modules/cloudflared/cloudflare.tf b/stacks/cloudflared/modules/cloudflared/cloudflare.tf
index e6edc402..bb4f8759 100644
--- a/stacks/cloudflared/modules/cloudflared/cloudflare.tf
+++ b/stacks/cloudflared/modules/cloudflared/cloudflare.tf
@@ -236,16 +236,9 @@ resource "cloudflare_record" "keyserver" {
 }
 
 # bridge.viktorbarzin.me (Cloudflare Pages, "мост" school site) moved to
-# stacks/valia-sites (ADR-0018) — all Valia-site records live there now.
-# Forget from this state WITHOUT destroying; valia-sites imports the live
-# record by id. Delete this block once both stacks have applied.
-removed {
-  from = cloudflare_record.bridge_pages
-
-  lifecycle {
-    destroy = false
-  }
-}
+# stacks/valia-sites (ADR-0018) — all Valia-site records live there now. The
+# state forget lives in the STACK ROOT (../../removed-bridge.tf): removed{}
+# blocks are root-module-only.
 
# Enable HTTP/3 (QUIC) for Cloudflare-proxied domains
 resource "cloudflare_zone_settings_override" "http3" {
diff --git a/stacks/cloudflared/removed-bridge.tf b/stacks/cloudflared/removed-bridge.tf
new file mode 100644
index 00000000..f4186c91
--- /dev/null
+++ b/stacks/cloudflared/removed-bridge.tf
@@ -0,0 +1,12 @@
+# bridge.viktorbarzin.me (Cloudflare Pages) now belongs to stacks/valia-sites
+# (ADR-0018), which has already imported the live record.
+#
+# Drop the record from this stack's state WITHOUT destroying it. The
+# removed{} block has to live in the root module — the earlier module-level
+# attempt broke init (pipeline 461).
+# Delete this file once the apply has run.
+removed {
+  from = module.cloudflared.cloudflare_record.bridge_pages
+
+  lifecycle { destroy = false }
+}