diff --git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index 82e11c61..6b49b94e 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -120,8 +120,8 @@ | status-page | Status page | status-page | | plotting-book | Book plotting/world-building app | plotting-book | | tripit | Self-hosted TripIt-clone travel-itinerary PWA (FastAPI + SvelteKit SPA, same-origin). CNPG (`tripit` db, Vault static role `pg-tripit`) + RWX NFS trip-doc vault (`/srv/nfs/tripit-documents`) + RWO `proxmox-lvm-encrypted` personal-document vault `tripit-personal-documents` (passports/IDs — AES-256-GCM app-layer envelope, master key `DOCUMENT_ENCRYPTION_KEY` in `secret/tripit`). `auth=required` (Authentik forward-auth, reads `X-authentik-email`); second `auth=none` ingress on `/api/calendar` for HMAC-token-gated `.ics` feed. Email-ingest CronJob `tripit-ingest-plans` (`*/15`) is the SOLE inbound path — forward a booking to plans@viktorbarzin.me (catch-all → spam@), polled read-only and routed ONLY to a registered user / verified linked address (no default-owner fallback; strangers ignored), parsed by local LLM (`qwen3vl-4b`), and the sender is emailed the outcome (Added to trip / Couldn't import). Plus `tripit-poll-flights`, `tripit-run-reminders`, `tripit-transport-nudge`, `tripit-weather-brief`. (The old Gmail-scrape `tripit-ingest-mail` CronJob was removed 2026-06-05.) App secrets in Vault `secret/tripit`. | tripit | -| stem95su | STEM educational platform for **95. СУ „Проф. Иван Шишманов"** (Sofia school) at stem95su.viktorbarzin.me. Public **open** static site (`auth=none` — CrowdSec + ai-bot-block, no login). Stock `nginx:1.28-alpine` serving content **straight off PVE host NFS** `/srv/nfs/stem-site` (RWX `nfs_volume`, mounted read-only) — **NOT** image-baked, so the externally-authored (Gemini-exported) HTML/media updates with no rebuild; auto-backed-up offsite by `nfs-mirror`. 
**Content source = Google Drive folder "claude"** (id `1cmOI2jRyBJdnrVPgbr4kx2cx_4DY6pm_`, shared Valentina→vbarzin@gmail.com). **Deploy = scheduled mirror** (since 2026-06-09, reversed the earlier on-demand-only call once content went active): CronJob `stem95su-gdrive-sync` (`*/10`, `stacks/stem95su/gdrive-sync.tf`) mounts the content PVC RW and `rclone sync`s the Drive folder onto it (`docker.io/rclone/rclone:1.74.3`, `scope=drive.readonly` — Drive is READ-ONLY; empty-source guard + `--max-delete 25` so a partial listing can't wipe the site). rclone creds (OAuth refresh-token) in Vault `secret/stem95su` (`rclone_conf`) → ESO secret `stem95su-rclone`. **Requires the GCP OAuth app (project home-lab-1700868541205) published to "Production"** or the refresh token expires ~weekly (re-mint + `vault kv put secret/stem95su rclone_conf=…` after publishing); a dead token surfaces as a failed Job. Manual on-demand sync still possible (throwaway rclone container from devvm; recipe in claude-memory). Nextcloud "PVE NFS Pool"/rsync is a manual fallback. Dashboard `stem_board.html` served at `/` via a small nginx ConfigMap (`index`). No DB, no in-cluster secrets. Reference impl for the NFS-backed static-site pattern (see patterns.md). **Pages cutover PARKED** (ADR-0018): `stem_board.html` embeds the 42.9MB `stem_video.mp4` > the 25MB CF Pages per-file cap — stays on this stack until the video shrinks (parked as `manage_dns=false` in stacks/valia-sites; see docs/runbooks/valia-sites.md). | stem95su | -| valia-sites | **Valia-site registry + sync** (ADR-0018): all sites authored by Valia serve OFF-INFRA on Cloudflare Pages (`bridge` live; `stem95su` parked, see above). One map entry in `stacks/valia-sites/main.tf` per site fans out Pages project + custom domain + public CNAME + internal split-horizon CNAME (ConfigMap `valia-sites-dns` → technitium sync, declarative incl. removal). 
CronJob `valia-sites-sync` (`*/10`, image ghcr `valia-sites-sync`) mirrors each Drive Content folder (rclone `drive.readonly`, stem95su-style guards + 25MB Pages-cap guard) and wrangler-deploys ONLY on manifest change (free-tier deploy cap). Secrets `secret/valia-sites` (shared rclone conf + SCOPED CF Pages token — Global API Key never in pods). Failed-Job-only visibility by choice. Runbook: docs/runbooks/valia-sites.md. | valia-sites | +| stem95su | STEM educational platform for **95. СУ „Проф. Иван Шишманов"** (Sofia school) at stem95su.viktorbarzin.me — **a Valia site on Cloudflare Pages since 2026-07-03** (ADR-0018): registry entry in `stacks/valia-sites`, synced from Drive folder "claude" every 10 min, deploy-on-change. The old in-cluster stack (nginx off PVE NFS + per-site rclone CronJob) is RETIRED — stacks/stem95su is a tombstone; `secret/stem95su` superseded by `secret/valia-sites`; `stem_video.mp4` was compressed 42.9→21.4MB (25MB Pages cap) with Viktor's OK. See docs/runbooks/valia-sites.md. | — | +| valia-sites | **Valia-site registry + sync** (ADR-0018): all sites authored by Valia serve OFF-INFRA on Cloudflare Pages (`bridge` + `stem95su` live). One map entry in `stacks/valia-sites/main.tf` per site fans out Pages project + custom domain + public CNAME + internal split-horizon CNAME (ConfigMap `valia-sites-dns` → technitium sync, declarative incl. removal). CronJob `valia-sites-sync` (`*/10`, image ghcr `valia-sites-sync`) mirrors each Drive Content folder (rclone `drive.readonly`, empty-source guard + `--max-delete` cap carried over from the retired stem95su sync, plus a 25MB Pages-cap guard) and wrangler-deploys ONLY on manifest change (free-tier deploy cap). Secrets `secret/valia-sites` (shared rclone conf + SCOPED CF Pages token — Global API Key never in pods). Failed-Job-only visibility by choice. Runbook: docs/runbooks/valia-sites.md. | valia-sites | | trek | **TRIAL (2026-06-05)** — self-hosted group-trip planner (upstream [TREK](https://github.com/mauriceboe/TREK), `mauriceboe/trek:3.0.22`, AGPL-3.0). 
Solo evaluation behind Authentik forward-auth (`auth=required`) before deciding build-vs-adopt; covers collaborative trip planning + accommodation records + activities + per-person budget splitting on free OpenStreetMap (no paid maps key). SQLite + uploads on `proxmox-lvm-encrypted` (`trek-data-encrypted` 2Gi, `trek-uploads-encrypted` 5Gi). For the trial only: `ENCRYPTION_KEY` is TREK-auto-generated onto the data PVC and the bootstrap admin (`admin@trek.local`) is printed to pod logs — NO Vault/ESO wiring (graduation TODO: move key to `secret/trek` + ESO, add an app-level SQLite backup CronJob since host file-backup can't read the LUKS PVC, wire TREK↔Authentik OIDC). Pinned image, TF-managed (no CI/Keel). Availability-poll companion (Rallly) deferred. Teardown: `tg destroy` in `stacks/trek`. | trek | ## Cloudflare Domains diff --git a/docs/runbooks/valia-sites.md b/docs/runbooks/valia-sites.md index 0f075d8d..ee10a866 100644 --- a/docs/runbooks/valia-sites.md +++ b/docs/runbooks/valia-sites.md @@ -87,13 +87,12 @@ Rename = retire + add (Pages projects can't be renamed). Retire: ## History -- stem95su still serves from its ORIGINAL in-cluster stack (nginx + NFS + - its own rclone CronJob): its Pages cutover is **parked** (`manage_dns = - false`) because `stem_board.html` embeds the 42.9 MB `stem_video.mp4`, - over the 25 MB Pages per-file cap — the sync guard-skips it until the - video shrinks below 25 MB (or the site is deliberately kept in-cluster - and removed from the map). Once cut over: flip `manage_dns = true`, - set `dns_type = "none"` in `stacks/stem95su`, then retire that stack; - `secret/stem95su` becomes superseded by `secret/valia-sites`. +- stem95su served in-cluster (nginx + NFS + its own rclone CronJob) until + 2026-07-03, when it was cut over to this pattern and the old stack retired + (ADR-0018). 
The blocking 42.9 MB `stem_video.mp4` was compressed to 21.4 MB + (same 1080p, ~2.5 Mbps H.264) and replaced in Valia's folder with Viktor's + explicit one-time OK. `secret/stem95su` is superseded by + `secret/valia-sites`; `/srv/nfs/stem-site` on the PVE host is a harmless + leftover. - bridge started as a hand-deployed wrangler experiment (2026-07-03, memory id 7085) and was adopted into the stack the same day. diff --git a/stacks/stem95su/gdrive-sync.tf b/stacks/stem95su/gdrive-sync.tf deleted file mode 100644 index 9142c15c..00000000 --- a/stacks/stem95su/gdrive-sync.tf +++ /dev/null @@ -1,122 +0,0 @@ -# Automatic Google Drive -> site sync (added 2026-06-09; supersedes the -# earlier on-demand-only model now that content is actively maintained). -# -# A CronJob mirrors the READ-ONLY Drive folder "claude" (servable content in -# subfolder "stem claude/files/") onto the NFS content volume every 10 min via -# rclone. rclone is delta-aware: an unchanged run lists ~33 files' metadata and -# transfers nothing, so the schedule is cheap (not a 24MB re-download). nginx -# keeps serving the same volume read-only; updates appear within ~5s (actimeo). -# -# Drive is treated strictly READ-ONLY: scope=drive.readonly and rclone only ever -# reads the remote (sync gdrive: -> /data), never writes back. -# -# TOKEN LONGEVITY: the GCP OAuth app (project home-lab-1700868541205) MUST be -# published to "Production" or its refresh token expires ~weekly and this job -# fails. After publishing, re-mint the token and refresh -# `secret/stem95su.rclone_conf`. A failed run surfaces as a failed Job. 
- -resource "kubernetes_manifest" "rclone_external_secret" { - field_manager { - force_conflicts = true - } - manifest = { - apiVersion = "external-secrets.io/v1" - kind = "ExternalSecret" - metadata = { - name = "stem95su-rclone" - namespace = kubernetes_namespace.stem95su.metadata[0].name - } - spec = { - refreshInterval = "1h" - secretStoreRef = { - name = "vault-kv" - kind = "ClusterSecretStore" - } - target = { name = "stem95su-rclone" } - data = [{ - secretKey = "rclone.conf" - remoteRef = { - key = "stem95su" - property = "rclone_conf" - } - }] - } - } - depends_on = [kubernetes_namespace.stem95su] -} - -resource "kubernetes_cron_job_v1" "gdrive_sync" { - metadata { - name = "stem95su-gdrive-sync" - namespace = kubernetes_namespace.stem95su.metadata[0].name - labels = { run = "stem95su", component = "gdrive-sync" } - } - spec { - schedule = "*/10 * * * *" - concurrency_policy = "Forbid" - successful_jobs_history_limit = 2 - failed_jobs_history_limit = 3 - job_template { - metadata {} - spec { - backoff_limit = 1 - ttl_seconds_after_finished = 86400 - template { - metadata { labels = { run = "stem95su", component = "gdrive-sync" } } - spec { - restart_policy = "OnFailure" - container { - name = "rclone" - image = "docker.io/rclone/rclone:1.74.3" - # Mirror Drive folder -> /data. Guard: hard-fail on auth/list error - # (so an expired token is visible); skip quietly if the source is - # empty / missing the dashboard (never wipe the live site); - # --max-delete caps catastrophic deletes from a partial listing. - command = ["/bin/sh", "-c", <<-EOT - set -eu - cp /config/rclone.conf /tmp/rc.conf - SRC="gdrive:stem claude/files" - LIST=$(rclone --config /tmp/rc.conf lsf "$SRC" --files-only) || { echo "FATAL: Drive list failed (auth/network)"; exit 1; } - N=$(printf '%s\n' "$LIST" | grep -c . || true) - if [ "$N" -lt 1 ] || ! 
printf '%s\n' "$LIST" | grep -qx "stem_board.html"; then - echo "GUARD: source N=$N / stem_board.html missing -- skipping, site untouched"; exit 0 - fi - echo "source OK ($N files) -- mirroring to /data" - rclone --config /tmp/rc.conf sync "$SRC" /data --exclude ".DS_Store" --fast-list --transfers 4 --max-delete 25 -v - EOT - ] - resources { - requests = { cpu = "10m", memory = "64Mi" } - limits = { memory = "192Mi" } - } - volume_mount { - name = "rclone-config" - mount_path = "/config" - read_only = true - } - volume_mount { - name = "content" - mount_path = "/data" - } - } - volume { - name = "rclone-config" - secret { secret_name = "stem95su-rclone" } - } - volume { - name = "content" - persistent_volume_claim { - claim_name = module.nfs_content.claim_name - } - } - } - } - } - } - } - lifecycle { - # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2 - ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] - } - depends_on = [kubernetes_manifest.rclone_external_secret] -} diff --git a/stacks/stem95su/main.tf b/stacks/stem95su/main.tf index 1fe68f1a..b4741f38 100644 --- a/stacks/stem95su/main.tf +++ b/stacks/stem95su/main.tf @@ -1,176 +1,9 @@ -# STEM educational platform for 95. СУ „Проф. Иван Шишманов" (Sofia). -# Public, open static site at stem95su.viktorbarzin.me. Self-contained HTML -# pages + media authored externally (Gemini exports), served by a stock nginx -# straight off the PVE host NFS — NOT baked into an image, so content can be -# updated out-of-band (Nextcloud "PVE NFS Pool" or rsync to /srv/nfs/stem-site) -# without a rebuild. Auto-backed-up offsite by the existing nfs-mirror job. 
- -resource "kubernetes_namespace" "stem95su" { - metadata { - name = "stem95su" - labels = { - "istio-injection" : "disabled" - tier = local.tiers.aux - } - } - lifecycle { - # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace - ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] - } -} - -module "tls_secret" { - source = "../../modules/kubernetes/setup_tls_secret" - namespace = kubernetes_namespace.stem95su.metadata[0].name - tls_secret_name = var.tls_secret_name -} - -# Content lives on the PVE host NFS. NOTE: the nfs_volume module creates only -# the K8s PV+PVC — the export subdir (/srv/nfs/stem-site) must already exist on -# 192.168.1.127 or the pod fails to mount (mount.nfs exit 32). It is created -# during deploy and re-created on demand if ever lost. -module "nfs_content" { - source = "../../modules/kubernetes/nfs_volume" - name = "stem95su-content" - namespace = kubernetes_namespace.stem95su.metadata[0].name - nfs_server = var.nfs_server - nfs_path = "/srv/nfs/stem-site" - storage = "1Gi" - access_modes = ["ReadWriteMany"] -} - -# Minimal nginx server block: serve the static dir, with the dashboard -# (stem_board.html) as the directory index so "/" loads the platform home. -# All other pages/assets are reached by their exact filenames (the dashboard -# links to them by name — those must not be renamed). 
-resource "kubernetes_config_map" "nginx_conf" { - metadata { - name = "stem95su-nginx-conf" - namespace = kubernetes_namespace.stem95su.metadata[0].name - } - data = { - "default.conf" = <<-EOT - server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index stem_board.html index.html; - } - EOT - } -} - -resource "kubernetes_deployment" "stem95su" { - metadata { - name = "stem95su" - namespace = kubernetes_namespace.stem95su.metadata[0].name - labels = { - run = "stem95su" - tier = local.tiers.aux - } - } - spec { - replicas = 1 - selector { - match_labels = { - run = "stem95su" - } - } - template { - metadata { - labels = { - run = "stem95su" - } - } - spec { - container { - image = "nginx:1.28-alpine" - name = "nginx" - resources { - limits = { - memory = "64Mi" - } - requests = { - cpu = "10m" - memory = "64Mi" - } - } - port { - container_port = 80 - } - volume_mount { - name = "content" - mount_path = "/usr/share/nginx/html" - read_only = true - } - volume_mount { - name = "nginx-conf" - mount_path = "/etc/nginx/conf.d" - read_only = true - } - readiness_probe { - http_get { - path = "/" - port = 80 - } - initial_delay_seconds = 3 - period_seconds = 10 - } - } - volume { - name = "content" - persistent_volume_claim { - claim_name = module.nfs_content.claim_name - } - } - volume { - name = "nginx-conf" - config_map { - name = kubernetes_config_map.nginx_conf.metadata[0].name - } - } - } - } - } - lifecycle { - ignore_changes = [ - spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 - ] - } -} - -resource "kubernetes_service" "stem95su" { - metadata { - name = "stem95su" - namespace = kubernetes_namespace.stem95su.metadata[0].name - labels = { - run = "stem95su" - } - } - spec { - selector = { - run = "stem95su" - } - port { - name = "http" - port = "80" - target_port = "80" - } - } -} - -module "ingress" { - source = "../../modules/kubernetes/ingress_factory" - # auth = "none": public static educational site for 95. 
СУ, open to the internet by design — CrowdSec + ai-bot-block gate bots; no login. - auth = "none" - namespace = kubernetes_namespace.stem95su.metadata[0].name - name = "stem95su" - service_name = kubernetes_service.stem95su.metadata[0].name - port = "80" - host = "stem95su" - # DNS moved to stacks/valia-sites (ADR-0018 cutover): the public CNAME now - # points at Cloudflare Pages, not the tunnel. Ingress kept only until the - # serving stack is retired. - dns_type = "none" - tls_secret_name = var.tls_secret_name -} +# stem95su moved OFF-INFRA to Cloudflare Pages (ADR-0018 cutover, 2026-07-03) — +# registry entry `stem95su` in stacks/valia-sites; runbook +# docs/runbooks/valia-sites.md. This stack intentionally declares NOTHING: +# the apply that landed this file destroyed the old in-cluster serving +# (nginx + NFS content PVC + ingress + per-site gdrive-sync CronJob + +# namespace). Directory kept only so the destroy could run through CI — +# safe to delete the dir + its PG state schema in a later cleanup. +# Harmless leftovers (manual cleanup if ever wanted): /srv/nfs/stem-site on +# the PVE host, and Vault secret/stem95su (superseded by secret/valia-sites). diff --git a/stacks/stem95su/variables.tf b/stacks/stem95su/variables.tf deleted file mode 100644 index 4e8a36b5..00000000 --- a/stacks/stem95su/variables.tf +++ /dev/null @@ -1,9 +0,0 @@ -variable "tls_secret_name" { - type = string - sensitive = true -} - -variable "nfs_server" { - type = string - default = "192.168.1.127" -}