diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index bb1ce653..98dacd41 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -30,7 +30,7 @@ Violations cause state drift, which causes future applies to break or silently r - **New service**: Use `setup-project` skill for full workflow - **Ingress**: `ingress_factory` module. Auth: `protected = true`. Anti-AI: on by default. **DNS**: `dns_type = "proxied"` (Cloudflare CDN) or `"non-proxied"` (direct A/AAAA). DNS records are auto-created — no need to edit `config.tfvars`. - **Docker images**: Always build for `linux/amd64`. Use 8-char git SHA tags — `:latest` causes stale pull-through cache. -- **Private registry**: `forgejo.viktorbarzin.me/viktor/` (Forgejo packages, OAuth-style PAT auth). Use `image: forgejo.viktorbarzin.me/viktor/:` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the Secret to all namespaces. Containerd `hosts.toml` on every node redirects to in-cluster Traefik LB `10.0.20.200` to avoid hairpin NAT. Push-side: viktor PAT in Vault `secret/ci/global/forgejo_push_token` (Forgejo container packages are scoped per-user; only the package owner can push, ci-pusher cannot write to viktor/*). Pull-side: cluster-puller PAT in Vault `secret/viktor/forgejo_pull_token`. Retention CronJob (`forgejo-cleanup` in `forgejo` ns, daily 04:00) keeps newest 10 versions + always `:latest`; integrity probed every 15min by `forgejo-integrity-probe` in `monitoring` ns (catalog walk + manifest HEAD on every blob). See `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md` for the migration history. Pull-through caches for upstream registries (DockerHub, GHCR, Quay, k8s.gcr, Kyverno) stay on the registry VM at `10.0.20.10` ports 5000/5010/5020/5030/5040 — the old port-5050 R/W private registry was decommissioned 2026-05-07. +- **Private registry**: `registry.viktorbarzin.me` (htpasswd auth, credentials in Vault `secret/viktor`). 
Use `image: registry.viktorbarzin.me/<name>:<tag>` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the secret to all namespaces. Build & push from registry VM (`10.0.20.10`). Containerd `hosts.toml` redirects pulls to LAN IP directly. Web UI at `docker.viktorbarzin.me` (Authentik-protected). Engine pinned to `registry:2.8.3` (see post-mortem 2026-04-19); on-VM configs deploy via `.woodpecker/registry-config-sync.yml`; integrity probed every 15m by `registry-integrity-probe` CronJob in `monitoring` ns — the HTTP API is the authoritative integrity check, NOT `/blobs/*/data` presence (revision-link absence is the real failure mode). - **LinuxServer.io containers**: `DOCKER_MODS` runs apt-get on every start — bake slow mods into a custom image (`RUN /docker-mods || true` then `ENV DOCKER_MODS=`). Set `NO_CHOWN=true` to skip recursive chown that hangs on NFS mounts. - **Node memory changes**: When changing VM memory on any k8s node, update kubelet `systemReserved`, `kubeReserved`, and eviction thresholds accordingly. Config: `/var/lib/kubelet/config.yaml`. Template: `stacks/infra/main.tf`. Current values: systemReserved=512Mi, kubeReserved=512Mi, evictionHard=500Mi, evictionSoft=1Gi. - **Node OS disk tuning** (in `stacks/infra/main.tf`): kubelet `imageGCHighThresholdPercent=70` (was 85), `imageGCLowThresholdPercent=60` (was 80), ext4 `commit=60` in fstab (was default 5s), journald `SystemMaxUse=200M` + `MaxRetentionSec=3day`. 
diff --git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index a3619212..3d888e93 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -45,8 +45,7 @@ | nextcloud | File sync/share | nextcloud | | calibre | E-book management (may be merged into ebooks stack) | calibre | | onlyoffice | Document editing | onlyoffice | -| f1-stream | F1 streaming (uses chrome-service for hmembeds verifier) | f1-stream | -| chrome-service | Headed Chromium WebSocket pool (`ws://chrome-service.chrome-service.svc:3000/`) for sibling services driving anti-bot embeds | chrome-service | +| f1-stream | F1 streaming | f1-stream | | rybbit | Analytics | rybbit | | isponsorblocktv | SponsorBlock for TV | isponsorblocktv | | actualbudget | Budgeting (factory pattern) | actualbudget | diff --git a/.woodpecker/build-ci-image.yml b/.woodpecker/build-ci-image.yml index 796426ac..40d8e667 100644 --- a/.woodpecker/build-ci-image.yml +++ b/.woodpecker/build-ci-image.yml @@ -14,72 +14,104 @@ steps: - name: build-and-push image: woodpeckerci/plugin-docker-buildx settings: - # Phase 4 of forgejo-registry-consolidation 2026-05-07 — - # registry.viktorbarzin.me dropped, Forgejo is the only target. - repo: - - forgejo.viktorbarzin.me/viktor/infra-ci + repo: registry.viktorbarzin.me:5050/infra-ci dockerfile: ci/Dockerfile context: ci/ tags: - latest - "${CI_COMMIT_SHA:0:8}" platforms: linux/amd64 + registry: registry.viktorbarzin.me:5050 logins: - - registry: forgejo.viktorbarzin.me + - registry: registry.viktorbarzin.me:5050 username: - from_secret: forgejo_user + from_secret: registry_user password: - from_secret: forgejo_push_token + from_secret: registry_password - # Post-push integrity check is now redundant with the every-15min - # forgejo-integrity-probe in stacks/monitoring/, which walks - # /v2/_catalog + HEADs every blob across the entire Forgejo registry. 
- # If a corruption pattern emerges that the periodic probe misses, - # restore a verify step similar to the pre-Phase-4 version (see - # commit 49f4956f) but pointed at forgejo.viktorbarzin.me. - - # Break-glass tarball: save the just-pushed infra-ci image to disk on the - # registry VM (10.0.20.10) so we can `docker load` it back into a node - # when Forgejo is unreachable. Pulls from Forgejo (the only registry now). - # Best-effort — failure here doesn't fail the pipeline. - # Recovery procedure: docs/runbooks/forgejo-registry-breakglass.md. - - name: breakglass-tarball + # Post-push integrity check. Re-resolves the image we just pushed and HEADs + # every blob it references — top-level manifest (index or single), each child + # platform manifest, each config blob, each layer blob. If any returns !=200 + # the pipeline fails loudly here so we never ship a broken index downstream. + # Historical context: 2026-04-13 and 2026-04-19 incidents both shipped indexes + # whose platform/attestation children had been GC-orphaned on the registry VM. 
+ - name: verify-integrity image: alpine:3.20 - failure: ignore environment: - REGISTRY_SSH_KEY: - from_secret: registry_ssh_key - FORGEJO_USER: - from_secret: forgejo_user - FORGEJO_PASS: - from_secret: forgejo_push_token + REG_USER: + from_secret: registry_user + REG_PASS: + from_secret: registry_password commands: - - apk add --no-cache openssh-client - - mkdir -p ~/.ssh && chmod 700 ~/.ssh - - printf '%s\n' "$REGISTRY_SSH_KEY" > ~/.ssh/id_ed25519 - - chmod 600 ~/.ssh/id_ed25519 - - ssh-keyscan -t ed25519 10.0.20.10 >> ~/.ssh/known_hosts 2>/dev/null + - apk add --no-cache curl jq + - REG=registry.viktorbarzin.me:5050 + - REPO=infra-ci - SHA=${CI_COMMIT_SHA:0:8} + - AUTH="$REG_USER:$REG_PASS" - | - ssh -n -o BatchMode=yes root@10.0.20.10 " - set -e - mkdir -p /opt/registry/data/private/_breakglass - IMAGE=forgejo.viktorbarzin.me/viktor/infra-ci:$SHA - echo \$FORGEJO_PASS | docker login forgejo.viktorbarzin.me -u \$FORGEJO_USER --password-stdin - docker pull \$IMAGE - docker save \$IMAGE | gzip > /opt/registry/data/private/_breakglass/infra-ci-$SHA.tar.gz - ln -sfn infra-ci-$SHA.tar.gz /opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz - ls -t /opt/registry/data/private/_breakglass/infra-ci-*.tar.gz \ - | grep -v 'latest' | tail -n +6 | xargs -r rm -v - ls -lh /opt/registry/data/private/_breakglass/ - " + set -euo pipefail + ACCEPT='Accept: application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json' + + fetch_manifest() { + # Prints the body to $2, returns the HTTP code as stdout. 
+ curl -sk -u "$AUTH" -H "$ACCEPT" \ + -o "$2" -w '%{http_code}' \ + "https://$REG/v2/$REPO/manifests/$1" + } + head_blob() { + curl -sk -u "$AUTH" -o /dev/null -w '%{http_code}' \ + -I "https://$REG/v2/$REPO/blobs/$1" + } + + verify_single_manifest() { + local ref="$1" tmp=/tmp/m-$$.json + local rc cfg + rc=$(fetch_manifest "$ref" "$tmp") + if [ "$rc" != "200" ]; then + echo "FAIL: manifest $ref returned HTTP $rc"; return 1 + fi + cfg=$(jq -r '.config.digest // empty' "$tmp") + if [ -n "$cfg" ]; then + rc=$(head_blob "$cfg") + [ "$rc" = "200" ] || { echo "FAIL: config blob $cfg returned HTTP $rc"; return 1; } + fi + jq -r '.layers[]?.digest' "$tmp" > /tmp/layers-$$.txt + while IFS= read -r layer; do + [ -z "$layer" ] && continue + rc=$(head_blob "$layer") + [ "$rc" = "200" ] || { echo "FAIL: layer blob $layer returned HTTP $rc"; return 1; } + done < /tmp/layers-$$.txt + return 0 + } + + echo "=== Verifying push integrity for $REPO:$SHA ===" + TOP=/tmp/top-$$.json + rc=$(fetch_manifest "$SHA" "$TOP") + [ "$rc" = "200" ] || { echo "FAIL: top manifest :$SHA returned HTTP $rc"; exit 1; } + + MT=$(jq -r '.mediaType // empty' "$TOP") + echo "Top-level media type: ${MT:-}" + + if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then + jq -r '.manifests[].digest' "$TOP" > /tmp/children-$$.txt + echo "Multi-platform index: $(wc -l - │ - ┌───────────────────────────────┼───────────────────────────────┐ - │ caller pod │ chrome-service pod - │ (e.g. 
f1-stream) │ (single replica) - │ │ - │ CHROME_WS_URL ──────────────┘ - │ CHROME_WS_TOKEN ─── from `secret/chrome-service.api_bearer_token` (ESO) - │ - │ await chromium.connect(f"{ws}/{token}") - │ await ctx.add_init_script(STEALTH_JS) - │ page.goto("https://upstream.com/embed/...") - │ - └─── ←── pages render under Xvfb, headed Chromium ──── ─────────┘ -``` - -## Image pin - -Both the server image (`mcr.microsoft.com/playwright:v1.48.0-noble` in -`stacks/chrome-service/main.tf`) and the Python client -(`playwright==1.48.0` in callers' `requirements.txt`) **must match -minor-versions**. Bump in lockstep — Playwright protocol changes between -minors and the client cannot connect to a mismatched server. - -The Microsoft image ships only the browser binaries, not the `playwright` -npm SDK; the start command runs `npx -y playwright@1.48.0 launch-server` -which downloads the SDK on first start (cached under `$HOME/.npm` via the -PVC) and reuses it on subsequent restarts. - -## Storage - -- **`chrome-service-profile-encrypted`** (PVC, 2Gi → 10Gi autoresize, - `proxmox-lvm-encrypted`) — Chromium user-data dir + npm cache. - Encrypted because cookies/localStorage may include third-party auth tokens - for sites callers drive. `HOME=/profile` so npx caches there. -- **`chrome-service-backup-host`** (NFS, RWX) — destination for a 6-hourly - CronJob that `tar -czf /backup/.tar.gz -C /profile .`, - retention 30 days. - -## Auth + secrets - -- Vault KV `secret/chrome-service.api_bearer_token` — 32-byte URL-safe - random, rotated by hand: - `vault kv put secret/chrome-service api_bearer_token=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')`. -- ESO syncs into namespace-local Secret `chrome-service-secrets` - (server pod) and `chrome-service-client-secrets` (each caller pod). -- Reloader (`reloader.stakater.com/auto = "true"`) cascades token rotation - to both server and any annotated caller — no manual rollout. 
- -## Network controls - -- **`kubernetes_network_policy_v1.ws_ingress`** — two separate ingress - rules on the same policy: - - **TCP/3000** (Playwright WS): only namespaces labelled - `chrome-service.viktorbarzin.me/client = "true"` (plus an explicit - fallback for `f1-stream` by `kubernetes.io/metadata.name`). - - **TCP/6080** (noVNC HTTP+WS): only the `traefik` namespace, since - the public-facing path is `chrome.viktorbarzin.me` ingress → - Traefik → sidecar. Authentik forward-auth still gates external - access at the Traefik layer. -- **WS port 3000** is internal-only (no ingress, no Cloudflare DNS). -- **noVNC sidecar** (`forgejo.viktorbarzin.me/viktor/chrome-service-novnc`) - exposes a live HTML5 view of the headed Chromium session via - `x11vnc` (connected to Xvfb on `localhost:6099`) bridged to - `websockify` on port 6080. Service `chrome` maps :80 → :6080 and is - exposed via `ingress_factory` at `chrome.viktorbarzin.me`, - Authentik-gated. Both static page and WebSocket upgrade share the - same path — Cloudflare proxy, Cloudflared tunnel, Traefik, and - Authentik forward-auth all preserve `Upgrade: websocket`. - -## Adding a new caller - -See `stacks/chrome-service/README.md` for the four-step recipe: - -1. Label the caller's namespace. -2. Add an `ExternalSecret` pulling `secret/chrome-service`. -3. Inject `CHROME_WS_URL` + `CHROME_WS_TOKEN` env vars. -4. Vendor `stealth.js` and apply via `await context.add_init_script(...)` - after every `new_context()`. - -## Limits + risks - -- **Anti-bot vs stealth arms race** — when an upstream beats us (DRM - license check, device-fingerprint mismatch, hotlink protection that - whitelists specific parent domains), the verifier returns - `is_playable=False` and the extractor moves on. No user-visible - breakage, just empty stream lists for that source. -- **JWPlayer DRM error 102630** — observed with several hmembeds embeds - even from the headed chrome-service. 
The license check bails because - the request origin isn't on the embed's allowlist; this is upstream - policy, not an infra defect. -- **Single replica + RWO PVC** — the deployment uses `Recreate` strategy. - Brief outage on rollout, ~30s for browser warmup. -- **No `/metrics` endpoint** — the cluster's generic - `KubePodCrashLooping` rule covers basic alerting. A Prometheus scrape - exporter is day-2 work. diff --git a/docs/architecture/ci-cd.md b/docs/architecture/ci-cd.md index 4c0c020b..6a7fd2f9 100644 --- a/docs/architecture/ci-cd.md +++ b/docs/architecture/ci-cd.md @@ -19,7 +19,7 @@ graph LR I --> J[Pull from DockerHub
or Pull-Through Cache] K[Pull-Through Cache
10.0.20.10] -.-> J - L[forgejo.viktorbarzin.me
Private Registry on Forgejo] -.-> J + L[registry.viktorbarzin.me
Private Registry] -.-> J style B fill:#2088ff style F fill:#4c9e47 @@ -33,7 +33,7 @@ graph LR | GitHub Actions | Cloud | `.github/workflows/build-and-deploy.yml` | Build Docker images, push to DockerHub | | Woodpecker CI | Self-hosted | `ci.viktorbarzin.me` | Deploy to Kubernetes cluster | | DockerHub | Cloud | `viktorbarzin/*` | Public image registry | -| Private Registry | Forgejo Packages | `forgejo.viktorbarzin.me/viktor` | Private container images (PAT auth, retention CronJob) — migrated from registry.viktorbarzin.me 2026-05-07 | +| Private Registry | Custom | `registry.viktorbarzin.me` | Private images, htpasswd auth | | Pull-Through Cache | Custom | `10.0.20.10:5000` (docker.io)
`10.0.20.10:5010` (ghcr.io) | LAN cache for remote registries | | Kyverno | Cluster | `kyverno` namespace | Auto-sync registry credentials to all namespaces | | Vault | Cluster | `vault.viktorbarzin.me` | K8s auth for Woodpecker pipelines | @@ -102,7 +102,7 @@ Woodpecker API uses numeric IDs (not owner/name): 1. **Containerd hosts.toml** redirects pulls from docker.io and ghcr.io to pull-through cache at `10.0.20.10` 2. **Pull-through cache** serves cached images from LAN, fetches from upstream on cache miss 3. **Kyverno ClusterPolicy** auto-syncs `registry-credentials` Secret to all namespaces for private registry access -4. **Private registry** has been Forgejo's built-in OCI registry at `forgejo.viktorbarzin.me/viktor/` since 2026-05-07. Auth via PAT (Vault `secret/ci/global/forgejo_push_token` for push, `secret/viktor/forgejo_pull_token` for pull). The pre-migration `registry:2.8.3`-based private registry on `registry.viktorbarzin.me:5050` was the root cause of three orphan-index incidents in three weeks (2026-04-13, 2026-04-19, 2026-05-04 — see `docs/post-mortems/2026-04-19-registry-orphan-index.md` and the full migration writeup at `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md`). The five pull-through caches on `10.0.20.10` (ports 5000/5010/5020/5030/5040) stay in place for upstream registries. +4. **Private registry** (`registry.viktorbarzin.me`) uses htpasswd auth, credentials stored in Vault. Runs `registry:2.8.3` (pinned — floating `registry:2` was the root cause of the 2026-04-13 + 2026-04-19 orphan-index incidents; see `docs/post-mortems/2026-04-19-registry-orphan-index.md`). 5. **Integrity probe** (`registry-integrity-probe` CronJob in `monitoring` ns, every 15m) walks `/v2/_catalog` → tags → indexes → child manifests via HEAD and pushes `registry_manifest_integrity_failures` to Pushgateway; alerts `RegistryManifestIntegrityFailure` / `RegistryIntegrityProbeStale` / `RegistryCatalogInaccessible` page on broken state. 
Authoritative check (HTTP API, not filesystem). ### Infra Pipelines (Woodpecker-only) diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 3b9d915d..5fa3bbba 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -63,7 +63,7 @@ graph TB | External Monitor Sync | Python 3.12 | `stacks/uptime-kuma/` | CronJob (10min) syncs `[External]` monitors from `cloudflare_proxied_names` | | dcgm-exporter | Configurable resources | `stacks/monitoring/modules/monitoring/` | NVIDIA GPU metrics collection | | Email Roundtrip Probe | Python 3.12 | `stacks/mailserver/modules/mailserver/` | E2E email delivery verification via Mailgun API + IMAP | -| Forgejo Registry Integrity Probe | Alpine 3.20 + curl/jq | `stacks/monitoring/modules/monitoring/main.tf` | CronJob every 15m: walks `/v2/_catalog` on `forgejo.viktorbarzin.me` (HTTP via in-cluster service), HEADs every tagged manifest + index child; emits `registry_manifest_integrity_*` metrics to Pushgateway. Replaces the legacy `registry-integrity-probe` against `registry.viktorbarzin.me:5050` decommissioned in Phase 4 of forgejo-registry-consolidation 2026-05-07. | +| Registry Integrity Probe | Alpine 3.20 + curl/jq | `stacks/monitoring/modules/monitoring/main.tf` | CronJob every 15m: walks `/v2/_catalog` on `registry.viktorbarzin.me:5050`, HEADs every tagged manifest + index child; emits `registry_manifest_integrity_*` metrics to Pushgateway. Catches orphan OCI-index state that filesystem scans miss. 
| ## How It Works diff --git a/docs/plans/2026-05-07-forgejo-registry-consolidation-design.md b/docs/plans/2026-05-07-forgejo-registry-consolidation-design.md deleted file mode 100644 index 5e88bd36..00000000 --- a/docs/plans/2026-05-07-forgejo-registry-consolidation-design.md +++ /dev/null @@ -1,195 +0,0 @@ -# Forgejo Registry Consolidation — Design - -**Date**: 2026-05-07 -**Status**: Approved - -## Problem - -`registry-private` (the `registry:2` container on the docker-registry -VM at `10.0.20.10`) has hit `distribution#3324` corruption three -times in three weeks (2026-04-13, 2026-04-19, 2026-05-04). Each -incident required manual blob recovery and another round of -hardening to `cleanup-tags.sh` and the GC procedure. The integrity -probe catches it within 15 minutes now, but every hit still costs -~1h of cleanup, and we keep tightening the same loose screw. - -Root cause is a known race in `distribution`: tag deletes that race -with concurrent garbage collection produce orphan OCI-index children. -Upstream has not patched it; our mitigations (probe, blob -fix-up script, idempotent cleanup) reduce blast radius but don't -remove the failure mode. - -Forgejo (deployed for OAuth and personal repos at -`forgejo.viktorbarzin.me`) ships a built-in OCI registry as part of -the Packages feature, default-on in v11. Using it removes -`distribution`-the-engine from the path entirely, replaces it with -Forgejo's own implementation backed by Forgejo's DB+blob store, and -gets us source hosting + image hosting in one resource. - -The PVE host RAM upgrade from 142GB to 272GB (memory id=569) means -the cluster can absorb the resource bump Forgejo needs for the -registry workload (1Gi → 1Gi). - -## Decision - -Move every image currently on `registry.viktorbarzin.me:5050` to -Forgejo's OCI registry at `forgejo.viktorbarzin.me`. Decommission -`registry-private` after a 14-day dual-push bake. 
- -Pull-through caches for upstream registries (DockerHub, GHCR, Quay, -k8s.gcr, Kyverno) stay on the registry VM permanently — Forgejo -won't serve as a pull-through, so the chicken-and-egg of "Forgejo -pulling its own image through itself" never arises. - -## Design - -### Registry hostname - -Image references become `forgejo.viktorbarzin.me/viktor/<name>:<tag>`. -The `viktor/` prefix is the Forgejo owner namespace; all current -private images ship under that single owner. - -### Auth - -Two service-account users: - -| User | Scope | Vault key | Used by | -|---|---|---|---| -| `cluster-puller` | `read:package` | `secret/viktor/forgejo_pull_token` | cluster-wide `registry-credentials` Secret, monitoring probe | -| `ci-pusher` | `write:package` | `secret/ci/global/forgejo_push_token` | Woodpecker pipelines (synced via `vault-woodpecker-sync` CronJob) | - -A third PAT (`secret/viktor/forgejo_cleanup_token`, also belongs to -`ci-pusher`) drives the retention CronJob — kept separate from the -push PAT so a leaked CI token doesn't immediately enable mass deletes. - -PATs have no expiry. Rotation policy: regenerate via Forgejo Web UI -and `vault kv patch` if a leak is suspected; ESO/sync downstream is -automatic. - -### Cluster pull path - -`registry-credentials` is a single Secret in `kyverno` ns, cloned -into every namespace by the existing -`sync-registry-credentials` ClusterPolicy. We extend its -`dockerconfigjson` `auths` map with a fourth entry for -`forgejo.viktorbarzin.me`. **No new Secret, no new ClusterPolicy, -no `imagePullSecrets =` line edits across stacks.** - -Containerd `hosts.toml` redirects `forgejo.viktorbarzin.me` → in-cluster -Traefik LB at `10.0.20.200`, the same pattern used for -`registry.viktorbarzin.me` → `10.0.20.10:5050`. Avoids hairpin NAT -through the WAN gateway for in-cluster pulls. 
- -### Push path - -Woodpecker pipelines push to BOTH targets during the bake: - -```yaml -- name: build-and-push - image: woodpeckerci/plugin-docker-buildx - settings: - repo: - - registry.viktorbarzin.me/ - - forgejo.viktorbarzin.me/viktor/ - logins: - - registry: registry.viktorbarzin.me - username: - from_secret: registry_user - password: - from_secret: registry_password - - registry: forgejo.viktorbarzin.me - username: - from_secret: forgejo_user - password: - from_secret: forgejo_push_token -``` - -The `vault-woodpecker-sync` CronJob (every 6h) propagates -`secret/ci/global` keys to every Woodpecker repo as global secrets. - -### Retention - -Forgejo's per-package "Cleanup Rules" UI is per-user runtime DB -state, not Terraform-driven. Retention runs as a CronJob in the -`forgejo` namespace, schedule `0 4 * * *`, that: - -1. Lists all container packages under the `viktor` owner. -2. Groups by package name. -3. Keeps newest 10 versions + always keeps `latest`. -4. DELETEs the rest via `/api/v1/packages/{owner}/{type}/{name}/{version}`. - -First 7 days run with `DRY_RUN=true` — script logs what it would -delete but issues no DELETE calls. After log review, flip the -`forgejo_cleanup_dry_run` local in `cleanup.tf` to false. - -### Integrity monitoring - -Mirror the existing `registry-integrity-probe` CronJob: walk -`/v2/_catalog`, walk every tag, HEAD every manifest + index child, -push `registry_manifest_integrity_*` metrics. Existing -Prometheus alerts fire on the `instance` label, so they cover both -probes automatically once the alert annotations are made -instance-aware (done in this change). - -### Source migration - -Projects currently living as plain dirs in the local-only monorepo -become standalone Forgejo repos. Two GitHub-hosted private repos -(`beadboard`, `claude-memory-mcp`) move to Forgejo and are archived -on GitHub. - -CI standardises on Woodpecker for everything in scope. 
The two -projects that used GHA (build + Woodpecker-deploy via GHA-hosted -DockerHub push) keep DockerHub for legacy compatibility but their -canonical image source becomes Forgejo. - -### Break-glass for infra-ci - -`infra-ci` is the Docker image used by all infra Woodpecker -pipelines, including `default.yml` (terragrunt apply). If Forgejo is -unreachable at the moment we need to apply, `infra-ci` is -unreachable, and we can't apply our way out. - -Mitigation: dual-push step also `docker save | gzip` the built -infra-ci image to: - -- `/opt/registry/data/private/_breakglass/infra-ci-<sha>.tar.gz` on - the registry VM disk (Copy 1) -- `/srv/nfs/forgejo-breakglass/` on the NAS (Copy 2) - -A `latest` symlink in each location points at the most recent. -Recovery procedure (`docs/runbooks/forgejo-registry-breakglass.md`): -scp tarball → `docker load` → `ctr -n k8s.io images import` → fix -Forgejo via that node. - -### Cutover style - -**Dual-push bake**: pipelines push to both registries for ≥14 days. -Pods continue pulling from `registry.viktorbarzin.me`. After bake: - -1. Per-project PR: flip `image=` lines in Terraform stacks. Pod - re-pull naturally on next rollout. -2. Phase 4: stop `registry-private` container, remove its - `auths` entry from the cluster Secret, drop containerd hosts.toml - entry. - -## Why not alternatives - -| Option | Rejected because | -|---|---| -| Stay on `registry-private` | Three corruption incidents in three weeks; mitigation cost rising | -| Run a fresh registry container alongside (no Forgejo) | Same upstream, same `distribution#3324` failure mode | -| GHCR / DockerHub for all private images | Public-by-default model + push rate limits; loses owner-owned blob storage | -| Harbor | Heavier than Forgejo registry, would need its own DB + ingress, no source-hosting integration | - -## Risks - -See plan doc § "Risk register" for the full table. Top three: - -1. 
**Forgejo registry hits the same corruption pattern.** Mitigated - by 14-day bake + integrity probe within 15 min. -2. **Forgejo down → infra-ci unreachable → can't apply.** Mitigated - by tarball break-glass on VM + NAS. -3. **Pod re-pulls fail after `image=` flip due to containerd cache - poisoning.** Mitigated by hosts.toml deployment + per-project - `kubectl rollout restart` in Phase 3. diff --git a/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md b/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md deleted file mode 100644 index 1634d48e..00000000 --- a/docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md +++ /dev/null @@ -1,152 +0,0 @@ -# Forgejo Registry Consolidation — Plan - -**Date**: 2026-05-07 -**Status**: Approved — execution in progress (Phase 0) -**Design**: `2026-05-07-forgejo-registry-consolidation-design.md` - -This is the implementation roadmap for migrating off `registry-private` -onto Forgejo's OCI registry. See the design doc for problem -statement and rationale. Execution spans 5 phases over ≥3 weeks. 
- -## Phase 0 — Prepare Forgejo (1 PR, no cutover risk) - -| Task | File / artifact | -|---|---| -| Bump Forgejo memory request+limit 384Mi → 1Gi | `infra/stacks/forgejo/main.tf` | -| Add `FORGEJO__packages__ENABLED=true` and `FORGEJO__packages__CHUNKED_UPLOAD_PATH=/data/tmp/package-upload` env vars (defensive — already default in v11) | `infra/stacks/forgejo/main.tf` | -| Bump Forgejo PVC 5Gi → 15Gi, auto-resize cap 20Gi → 50Gi | `infra/stacks/forgejo/main.tf` | -| Bump ingress `max_body_size = "5g"` (wired into ingress_factory as a Buffering middleware) | `infra/stacks/forgejo/main.tf`, `infra/modules/kubernetes/ingress_factory/main.tf` | -| Create `cluster-puller` (read:package), `ci-pusher` (write:package), and a third `cleanup` PAT on `ci-pusher`; store PATs in Vault | runbook: `docs/runbooks/forgejo-registry-setup.md` | -| Extend `registry-credentials` Secret with 4th `auths` entry for `forgejo.viktorbarzin.me` | `infra/stacks/kyverno/modules/kyverno/registry-credentials.tf` | -| Add containerd `hosts.toml` entry redirecting `forgejo.viktorbarzin.me` → in-cluster Traefik LB `10.0.20.200` | `infra/stacks/infra/main.tf` cloud-init + new `infra/scripts/setup-forgejo-containerd-mirror.sh` for existing nodes | -| Forgejo retention CronJob (`0 4 * * *`, dry-run for first 7 days) | new `infra/stacks/forgejo/cleanup.tf` + `infra/stacks/forgejo/files/cleanup.sh` | -| Forgejo integrity probe CronJob (`*/15 * * * *`) | `infra/stacks/monitoring/modules/monitoring/main.tf` | -| Make existing alerts instance-aware so they cover both registries | `infra/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl` | - -**Smoke test (must pass before declaring Phase 0 done):** - -- `docker login forgejo.viktorbarzin.me` succeeds. -- Push a hello-world image to `forgejo.viktorbarzin.me/viktor/smoketest:1` succeeds. -- `crictl pull forgejo.viktorbarzin.me/viktor/smoketest:1` from a k8s - node succeeds, using the auto-synced `registry-credentials` Secret. 
-- A fresh namespace gets the cloned Secret with 4 `auths` entries. -- Delete the smoketest package via API. -- Forgejo integrity probe completes once and pushes metrics. - -## Phase 1 — Source migration (parallel-safe, no production impact) - -For each project the recipe is identical: - -1. `git init` + push to `forgejo.viktorbarzin.me/viktor/` — - register in Woodpecker via OAuth. -2. Add `.woodpecker.yml` based on `payslip-ingest/.woodpecker.yml`. - Push step uses `woodpeckerci/plugin-docker-buildx` with TWO - `repo:` entries (dual-push). -3. Confirm first build pushes to BOTH registries. - -Projects (bake clock starts at "all dual-push"): - -| Project | Action | -|---|---| -| `claude-agent-service` | Extract from monorepo to Forgejo. New `.woodpecker.yml`. | -| `fire-planner` | Extract from monorepo to Forgejo. New `.woodpecker.yml`. | -| `wealthfolio-sync` | Extract from monorepo to Forgejo. New `.woodpecker.yml`. | -| `hmrc-sync` | Extract from monorepo to Forgejo. New `.woodpecker.yml`. | -| `freedify` | Push from monorepo to Forgejo. New `.woodpecker.yml`. (Upstream is gone.) | -| `payslip-ingest` | Already on Forgejo. Add second `repo:` entry to `.woodpecker.yml`. | -| `job-hunter` | Already on Forgejo. Add second `repo:` entry. | -| `beadboard` | Push to Forgejo. New `.woodpecker.yml`. Disable GHA workflow. **Don't archive GitHub yet** (deferred to Phase 3). | -| `claude-memory-mcp` | Push to Forgejo. New `.woodpecker.yml`. | -| `infra-ci` | Edit `.woodpecker/build-ci-image.yml` to dual-push. ALSO `docker save | gzip` to `/opt/registry/data/private/_breakglass/` on VM AND `/srv/nfs/forgejo-breakglass/` on NAS. Pin a `latest` symlink. | - -Break-glass runbook (`docs/runbooks/forgejo-registry-breakglass.md`) -documents the recovery path. - -## Phase 2 — Bake (≥14 days) - -- No `image=` lines change. Pods still pull from - `registry.viktorbarzin.me`. 
-- **Daily smoke check**: pull a recent image from Forgejo as - `cluster-puller`, verify integrity (HEAD on manifest + each blob). -- **Bake exit criteria**: - - Zero `RegistryManifestIntegrityFailure` alerts on Forgejo. - - Zero `ContainerNearOOM` for the forgejo pod. - - Retention CronJob has run ≥14 times successfully. - - At least one full Sunday GC cycle has elapsed. - - Switch retention CronJob to `DRY_RUN=false` on day 7, observe - until day 14. - -## Phase 3 — Cutover (one PR per project, single session) - -Order = lowest blast radius first. Each step: -`image=` flip → `kubectl rollout restart` → verify pull from Forgejo. - -1. `payslip-ingest` (`infra/stacks/payslip-ingest/main.tf`) -2. `job-hunter` (`infra/stacks/job-hunter/main.tf`) -3. `claude-agent-service` (`infra/stacks/claude-agent-service/main.tf`) -4. `fire-planner` (`infra/stacks/fire-planner/main.tf`) -5. `wealthfolio-sync` (`infra/stacks/wealthfolio/main.tf`) -6. `freedify` (`infra/stacks/freedify/factory/main.tf`) -7. `chrome-service` (`infra/stacks/chrome-service/main.tf`) -8. `beads-server` / `beadboard` (`infra/stacks/beads-server/main.tf`). - Then `gh repo archive ViktorBarzin/beadboard`. -9. `infra-ci` — flip `image:` references in 4 `.woodpecker/*.yml` - files in the infra repo. Verify next push to master applies cleanly. -10. `claude-memory-mcp` — update `CLAUDE.md` install instruction from - `claude plugins install github:ViktorBarzin/claude-memory-mcp` to - `claude plugins install https://forgejo.viktorbarzin.me/viktor/claude-memory-mcp.git`. - `gh repo archive ViktorBarzin/claude-memory-mcp`. - -## Phase 4 — Decommission - -| Step | File / location | -|---|---| -| Stop `registry-private` container on VM (10.0.20.10): edit `/opt/registry/docker-compose.yml`, comment out service, `docker compose up -d --remove-orphans`. (Manual SSH — cloud-init won't redeploy on TF apply per memory id=1078.) 
| live VM | -| Update cloud-init template to match the new compose file | `infra/stacks/infra/main.tf:288` | -| Delete `auths` entries for `registry.viktorbarzin.me` / `:5050` / `10.0.20.10:5050` from the dockerconfigjson | `infra/stacks/kyverno/modules/kyverno/registry-credentials.tf` | -| Drop `registry.viktorbarzin.me` and `10.0.20.10:5050` `hosts.toml` entries on each node + cloud-init template | `infra/stacks/infra/main.tf` cloud-init + ad-hoc script | -| After 1 week of no incidents, delete `/opt/registry/data/private/` blob storage on the VM (~2.6GB freed) | manual SSH | - -## Phase 5 — Docs - -In the same commit as the Phase 4 closing: - -| Doc | Update | -|---|---| -| `docs/runbooks/registry-vm.md` | Note `registry-private` is gone; pull-through caches and break-glass tarballs only | -| `docs/runbooks/registry-rebuild-image.md` | Replaced by NEW `forgejo-registry-rebuild-image.md` | -| `docs/runbooks/forgejo-registry-rebuild-image.md` (NEW) | Forgejo PVC restore procedure | -| `docs/runbooks/forgejo-registry-breakglass.md` (NEW) | infra-ci tarball recovery | -| `docs/architecture/ci-cd.md` | Image registry section flips to Forgejo | -| `docs/architecture/monitoring.md` | Integrity probe target updated | -| `infra/.claude/CLAUDE.md` | Registry references updated | -| `CLAUDE.md` (monorepo root) | claude-memory-mcp install URL updated | -| `infra/.claude/reference/service-catalog.md` | Cross-reference checked | - -## Critical files modified - -| File | Phase | What | -|---|---|---| -| `infra/stacks/forgejo/main.tf` | 0 | Memory bump, packages env vars, PVC bump, ingress max_body_size | -| `infra/stacks/forgejo/cleanup.tf` (NEW) | 0 | Retention CronJob | -| `infra/stacks/forgejo/files/cleanup.sh` (NEW) | 0 | Retention script (mounted via ConfigMap) | -| `infra/modules/kubernetes/ingress_factory/main.tf` | 0 | Wire `max_body_size` into a Traefik Buffering middleware | -| `infra/stacks/kyverno/modules/kyverno/registry-credentials.tf` | 0 | Add 4th `auths` entry 
| -| `infra/stacks/infra/main.tf` | 0 + 4 | Containerd hosts.toml block (add Forgejo, later remove registry-private); compose template update | -| `infra/scripts/setup-forgejo-containerd-mirror.sh` (NEW) | 0 | One-shot rollout for existing nodes | -| `infra/stacks/monitoring/modules/monitoring/main.tf` | 0 | Forgejo integrity probe CronJob | -| `infra/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl` | 0 | Make alerts instance-aware | -| `infra/stacks/monitoring/main.tf` | 0 | Plumb `forgejo_pull_token` into module | -| `infra/.woodpecker/build-ci-image.yml` | 1 | Dual-push to add Forgejo target + tarball break-glass | -| `/.woodpecker.yml` | 1 | Dual-push (NEW for fire-planner, wealthfolio-sync, hmrc-sync, freedify, beadboard, claude-memory-mcp; EDIT for payslip-ingest, job-hunter, claude-agent-service) | -| `infra/.woodpecker/{default,drift-detection,build-cli}.yml` | 3 | Flip `image:` to Forgejo for infra-ci | -| `infra/stacks/{beads-server,chrome-service,claude-agent-service,fire-planner,freedify/factory,job-hunter,payslip-ingest,wealthfolio}/main.tf` | 3 | Flip `image =` to Forgejo | - -## Verification - -- **Push** (Phase 0/1): `docker push forgejo.viktorbarzin.me/viktor/` visible in Forgejo Web UI under viktor/. -- **Pull** (Phase 0): `crictl pull forgejo.viktorbarzin.me/viktor/smoketest:1` succeeds with auto-synced Secret. -- **Dual-push** (Phase 1): every Woodpecker pipeline run pushes to BOTH endpoints — confirmed via HEAD checks on `:` for both. -- **Bake** (Phase 2): existing daily Forgejo `/api/healthz` external monitor stays green; integrity probe stays green; no `ContainerNearOOM` for forgejo pod. -- **Cutover** (Phase 3): `kubectl rollout status deploy/ -n ` succeeds. `kubectl describe pod` shows the image was pulled from `forgejo.viktorbarzin.me`. -- **Decommission** (Phase 4): `docker ps` on registry VM no longer shows `registry-private`. Brand-new namespace gets the Secret with only the Forgejo `auths` entry. Pull still works. 
diff --git a/docs/runbooks/forgejo-registry-breakglass.md b/docs/runbooks/forgejo-registry-breakglass.md deleted file mode 100644 index 664893d5..00000000 --- a/docs/runbooks/forgejo-registry-breakglass.md +++ /dev/null @@ -1,126 +0,0 @@ -# Runbook: Forgejo registry break-glass — recovering infra-ci - -Last updated: 2026-05-07 - -## When to use this runbook - -When **all** of the following are true: - -1. Forgejo (`forgejo.viktorbarzin.me`) is unreachable. -2. `registry-private` is also gone (post-Phase 4 of the consolidation), - so you can't fall back to `registry.viktorbarzin.me:5050/infra-ci`. -3. You need to run an infra Woodpecker pipeline (apply, build-cli, - drift-detection, etc.) — but those pipelines pull `infra-ci` and - crash because the registry is down. - -If only Forgejo is down but `registry-private` is still alive, the -pipelines work — `image:` references in `infra/.woodpecker/*.yml` -still hit `registry.viktorbarzin.me:5050/infra-ci` until Phase 3 -flips them. Skip this runbook entirely. - -## What's available - -The `build-ci-image.yml` Woodpecker pipeline saves a tarball after -each successful push: - -| Location | Path | -|---|---| -| Registry VM disk (10.0.20.10) | `/opt/registry/data/private/_breakglass/infra-ci-.tar.gz` | -| Registry VM disk (latest symlink) | `/opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz` | -| Synology NAS (offsite copy via daily-backup sync) | `/volume1/Backup/Viki/pve-backup/_forgejo-breakglass/` | - -The registry VM keeps the last 5 tarballs. Synology mirrors them -through the existing offsite-sync-backup job (`/usr/local/bin/ -offsite-sync-backup`). - -## Recovery procedure - -The goal is to get a working `infra-ci` image onto a k8s node so -Woodpecker pods can run it. Then run a Woodpecker pipeline that -restores Forgejo from PVC backup or rebuilds it. 
- -### Step 1 — copy the tarball to a node - -From your workstation (the registry VM is reachable but Forgejo is -not — the rest of the cluster might be in a similar partial state): - -```bash -ssh wizard@10.0.20.103 # any responsive k8s node -sudo mkdir -p /var/breakglass -sudo scp root@10.0.20.10:/opt/registry/data/private/_breakglass/infra-ci-latest.tar.gz \ - /var/breakglass/ -``` - -If the registry VM is also down, fall back to Synology: - -```bash -sudo scp 192.168.1.13:/volume1/Backup/Viki/pve-backup/_forgejo-breakglass/infra-ci-latest.tar.gz \ - /var/breakglass/ -``` - -### Step 2 — load into containerd - -`docker load` won't help on a k8s node — it loads into the docker -daemon, which kubelet/containerd doesn't see. Use `ctr`: - -```bash -sudo ctr -n k8s.io images import /var/breakglass/infra-ci-latest.tar.gz -sudo ctr -n k8s.io images list | grep infra-ci -``` - -Confirm the image is tagged with the original repository name -(`registry.viktorbarzin.me:5050/infra-ci:` — the tarball was -saved with that tag, NOT the Forgejo name). - -### Step 3 — pin pods to this node - -Add a node selector or taint-toleration to whatever pipeline you -need to run. Simplest: cordon the other nodes briefly so Woodpecker -schedules onto this one. - -```bash -for n in $(kubectl get nodes -o name | grep -v $(hostname)); do - kubectl cordon ${n#node/} -done -``` - -Run the pipeline. After it completes: - -```bash -for n in $(kubectl get nodes -o name); do - kubectl uncordon ${n#node/} -done -``` - -### Step 4 — fix the underlying problem - -The pipeline you just ran was meant to restore Forgejo. Common -options: - -- **Forgejo PVC corrupt** — `docs/runbooks/forgejo-registry-rebuild-image.md` - walks through PVC restore from LVM snapshot or PVE backup. -- **Forgejo OOM-loop** — bump memory request+limit in - `infra/stacks/forgejo/main.tf` and apply. -- **Forgejo unreachable due to network** — check Traefik, MetalLB, - pfSense. 
- -Once Forgejo is back, run `build-ci-image.yml` manually so the -tarball regenerates with the latest commit. - -## Why this exists - -The 2026-04-19 post-mortem on the registry-orphan-index incident -showed that a single registry going corrupt could block ALL infra -pipelines (because every pipeline pulls `infra-ci` from that -registry). The dual-push to Forgejo + registry-private removes that -single-point-of-failure during the bake. After Phase 4 -decommissions registry-private, the tarball is the last line of -defense. - -## Why on the registry VM and not in-cluster - -The Forgejo pod and registry-private pod both depend on cluster -networking + storage. The registry VM is an independent -non-clustered VM with local storage. If the cluster is in a bad -state, the VM's disk is still readable from any other host on the -LAN. diff --git a/docs/runbooks/forgejo-registry-rebuild-image.md b/docs/runbooks/forgejo-registry-rebuild-image.md deleted file mode 100644 index fc917b8c..00000000 --- a/docs/runbooks/forgejo-registry-rebuild-image.md +++ /dev/null @@ -1,128 +0,0 @@ -# Runbook: Rebuild an Image on the Forgejo OCI Registry - -Last updated: 2026-05-07 - -## When to use this - -Pipelines pulling from `forgejo.viktorbarzin.me/viktor/` fail with: - -- `failed to resolve reference … : not found` -- `manifest unknown` -- HEAD on a manifest/blob digest returns 404 -- `forgejo-integrity-probe` CronJob in `monitoring` reports - `registry_manifest_integrity_failures > 0` for - `instance="forgejo.viktorbarzin.me"` - -This is the Forgejo equivalent of the registry-private orphan-index -failure mode (`docs/post-mortems/2026-04-19-registry-orphan-index.md`). -Cause is usually package-version delete races with an in-flight pull, -or PVC corruption. Fix is to rebuild the image from source and -re-push, so Forgejo receives a complete, fresh upload. 
- -If the symptom is different (Forgejo unreachable, PVC OOM, -authentication failure), use: -- `docs/runbooks/forgejo-registry-setup.md` for auth + token issues -- `docs/runbooks/forgejo-registry-breakglass.md` if Forgejo + the - cluster are both unreachable -- `docs/runbooks/restore-pvc-from-backup.md` for PVC corruption - -## Phase 1 — Confirm the diagnosis - -From any host: - -```sh -REG=forgejo.viktorbarzin.me -USER=cluster-puller -PASS="$(vault kv get -field=forgejo_pull_token secret/viktor)" -IMAGE=viktor/payslip-ingest -TAG=latest - -# 1. Confirm the manifest exists at all. -curl -sk -u "$USER:$PASS" \ - -H 'Accept: application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json' \ - "https://$REG/v2/$IMAGE/manifests/$TAG" | jq '.mediaType, .manifests[].digest // .config.digest' - -# 2. HEAD each child / config / layer digest. Any non-200 = confirmed. -for d in $(curl -sk -u "$USER:$PASS" -H 'Accept: application/vnd.oci.image.index.v1+json' \ - "https://$REG/v2/$IMAGE/manifests/$TAG" | jq -r '.manifests[].digest // empty'); do - code=$(curl -sk -u "$USER:$PASS" -o /dev/null -w '%{http_code}' \ - -I "https://$REG/v2/$IMAGE/manifests/$d") - echo "$d → $code" -done -``` - -The probe's last log run is also a fast way to see what's affected: - -```sh -kubectl -n monitoring logs \ - $(kubectl -n monitoring get pods -l job-name -o name \ - | grep forgejo-integrity-probe | head -1) -``` - -## Phase 2 — Rebuild and re-push - -Forgejo lets you delete a specific package version through the API. -Doing this **before** the rebuild ensures the new push doesn't -collide with the half-broken existing entry. - -```sh -# Delete the broken version (replace TAG with the actual tag). -curl -X DELETE -H "Authorization: token $(vault kv get -field=forgejo_cleanup_token secret/viktor)" \ - "https://$REG/api/v1/packages/viktor/container/$(basename $IMAGE)/$TAG" -``` - -Rebuild via Woodpecker (manual run if the pipeline isn't triggered -by a code change): - -1. 
Open `https://ci.viktorbarzin.me/repos//manual` for the - project. -2. Click **Run pipeline** with `branch=master`. -3. Wait for the build-and-push step to complete. -4. Confirm the new version is visible in Forgejo Web UI under - `viktor/` → Packages → Container. - -## Phase 3 — Restart consumers - -Pods that already cached the broken digest may continue using it. -Force a fresh pull: - -```sh -kubectl rollout restart deploy/ -n -``` - -If the pod still fails, the new manifest digest may not have -propagated through containerd's cache. Drain + restart containerd on -the affected node: - -```sh -kubectl drain --ignore-daemonsets --delete-emptydir-data -ssh wizard@ sudo systemctl restart containerd -kubectl uncordon -``` - -## Phase 4 — Verify integrity recovery - -The next probe run (every 15 min) will report: - -``` -registry_manifest_integrity_failures{instance="forgejo.viktorbarzin.me"} 0 -``` - -The `RegistryManifestIntegrityFailure` alert resolves automatically -30 minutes after the metric goes back to 0. - -## Why this happens - -Forgejo's OCI registry stores blobs in its own DB+filesystem. Unlike -`registry:2` + `distribution`, it doesn't have the -[`distribution#3324`](https://github.com/distribution/distribution/issues/3324) -GC-vs-tag-delete race. But it can still reach a broken state if: - -- The retention CronJob deletes a version while a pull is in flight - on the same digest. -- The PVC fills up mid-push (`docs/runbooks/restore-pvc-from-backup.md`). -- A Forgejo upgrade migrates the package schema and a row is dropped. - -In all cases the recovery procedure is identical: delete the broken -version through the API, rebuild from source, force consumers to -re-pull. 
diff --git a/docs/runbooks/forgejo-registry-setup.md b/docs/runbooks/forgejo-registry-setup.md deleted file mode 100644 index 16637c6d..00000000 --- a/docs/runbooks/forgejo-registry-setup.md +++ /dev/null @@ -1,163 +0,0 @@ -# Runbook: Forgejo OCI registry — initial setup - -Last updated: 2026-05-07 - -This runbook covers the **one-time** bootstrap of Forgejo's container -registry, executed during Phase 0 of the registry consolidation plan -(`docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md`). - -After this runbook is complete, the Forgejo OCI registry at -`forgejo.viktorbarzin.me` accepts pushes from CI and pulls from the -cluster, with retention and integrity monitoring in place. - -## Order of operations - -The Terraform stacks reference Vault keys that don't exist on a fresh -cluster. Create the keys **before** running `scripts/tg apply`. - -1. Apply the resource bumps (memory, PVC, ingress body size, - packages env vars) — these don't depend on the new Vault keys. -2. Create the service-account users + PATs in Forgejo. -3. Push the PATs to Vault. -4. Apply the rest of Phase 0 (registry-credentials extension, - monitoring probe, retention CronJob). - -### Step 1 — apply Forgejo deployment bumps - -```bash -cd infra/stacks/forgejo -scripts/tg apply -``` - -Wait for the new pod to come up at the bumped 1Gi memory request and -the resized 15Gi PVC. Verify packages are enabled: - -```bash -kubectl exec -n forgejo deploy/forgejo -- forgejo manager flush-queues -kubectl exec -n forgejo deploy/forgejo -- env | grep PACKAGES -``` - -### Step 2 — create service-account users - -`forgejo admin user create` is idempotent only with -`--must-change-password=false`. Re-running it on an existing user -errors out — that's fine; skip on rerun. - -```bash -# cluster-puller — read:package PAT for in-cluster pulls. 
-kubectl exec -n forgejo deploy/forgejo -- \ - forgejo admin user create \ - --username cluster-puller \ - --email cluster-puller@viktorbarzin.me \ - --password "$(openssl rand -base64 24)" \ - --must-change-password=false - -# ci-pusher — write:package PAT for CI dual-push, also reused as the -# cleanup CronJob credential (write:package includes delete). -kubectl exec -n forgejo deploy/forgejo -- \ - forgejo admin user create \ - --username ci-pusher \ - --email ci-pusher@viktorbarzin.me \ - --password "$(openssl rand -base64 24)" \ - --must-change-password=false -``` - -The user passwords are throwaway — we only ever auth via PAT. Forgejo -admin can reset them at any time from the Web UI. - -### Step 3 — generate the PATs - -PATs **must** be generated through the Web UI logged in as the -respective user (the CLI doesn't expose token creation). To log in -without OAuth (registration is disabled for everyone except `viktor`, -the admin), use the per-user temporary password from step 2. - -For each of `cluster-puller` and `ci-pusher`: - -1. Sign out of `viktor`. -2. Go to `https://forgejo.viktorbarzin.me/user/login` and sign in - with the throwaway password. -3. Settings → Applications → Generate new token. -4. Name: `cluster-pull` / `ci-push`. **Expiration: never.** -5. Scopes: - - `cluster-puller`: `read:package` - - `ci-pusher`: `write:package` (covers read+write+delete) -6. Save the token shown on the next page — it is **not** displayed again. - -For the cleanup CronJob, generate a third PAT on `ci-pusher`: - -7. Repeat steps 4-6 with name `cleanup`, scope `write:package`. - -### Step 4 — push PATs to Vault - -```bash -vault login -method=oidc - -# Read-only, used by the cluster-wide registry-credentials Secret and -# by the Forgejo integrity probe. -vault kv patch secret/viktor \ - forgejo_pull_token= - -# Write+delete, used by the retention CronJob inside Forgejo's -# namespace. 
-vault kv patch secret/viktor \ - forgejo_cleanup_token= - -# Write, propagated by vault-woodpecker-sync to all Woodpecker repos. -vault kv patch secret/ci/global \ - forgejo_user=ci-pusher \ - forgejo_push_token= -``` - -### Step 5 — apply the rest of Phase 0 - -```bash -# Registry credential Secret (now reads forgejo_pull_token). -cd infra/stacks/kyverno && scripts/tg apply - -# Monitoring probe + retention CronJob. -cd infra/stacks/monitoring && scripts/tg apply -cd infra/stacks/forgejo && scripts/tg apply - -# Containerd hosts.toml on each existing k8s node — VM cloud-init -# only fires on first boot. -infra/scripts/setup-forgejo-containerd-mirror.sh -``` - -## Verification - -```bash -# Login from a workstation with docker. -echo "" | docker login forgejo.viktorbarzin.me -u ci-pusher --password-stdin - -# Push a smoketest image. -docker pull alpine:3.20 -docker tag alpine:3.20 forgejo.viktorbarzin.me/viktor/smoketest:1 -docker push forgejo.viktorbarzin.me/viktor/smoketest:1 - -# Pull from a k8s node. -ssh wizard@ sudo crictl pull forgejo.viktorbarzin.me/viktor/smoketest:1 - -# Confirm the cluster-wide Secret was synced into a fresh namespace. -kubectl create namespace forgejo-smoketest -kubectl get secret -n forgejo-smoketest registry-credentials \ - -o jsonpath='{.data.\.dockerconfigjson}' | base64 -d | jq '.auths | keys' -# Expect: ["10.0.20.10:5050", "forgejo.viktorbarzin.me", -# "registry.viktorbarzin.me", "registry.viktorbarzin.me:5050"] -kubectl delete namespace forgejo-smoketest - -# Delete the smoketest package via API. -curl -X DELETE -H "Authorization: token " \ - https://forgejo.viktorbarzin.me/api/v1/packages/viktor/container/smoketest/1 -``` - -## When to revisit - -- **PAT rotation**: PATs created here have no expiry by design. 
If a - PAT leaks, regenerate via the Web UI and `vault kv patch` the new - value into the same key — the next `terragrunt apply` will sync it - to all consumers within minutes (Kyverno ClusterPolicy clones the - Secret, vault-woodpecker-sync runs every 6h). -- **New service account**: if a future workload needs different - scopes, add a parallel user/PAT here rather than expanding existing - PAT scope. Principle of least privilege. diff --git a/docs/runbooks/registry-vm.md b/docs/runbooks/registry-vm.md index 95f7b637..b5fed938 100644 --- a/docs/runbooks/registry-vm.md +++ b/docs/runbooks/registry-vm.md @@ -1,30 +1,12 @@ # Runbook: Registry VM (docker-registry, 10.0.20.10) -Last updated: 2026-05-07 +Last updated: 2026-04-19 -The registry VM is an Ubuntu 24.04 VM on the cluster LAN subnet -`10.0.20.0/24`, with a static netplan config (no DHCP). Because it -sits on a subnet that only has pfSense as its gateway, its DNS must -be statically configured. - -**As of Phase 4 of forgejo-registry-consolidation 2026-05-07** the VM -no longer hosts the private R/W registry. It hosts pull-through -caches only: - -| Port | Upstream | -|---|---| -| 5000 | docker.io (Docker Hub) — auth via dockerhub_registry_password | -| 5010 | ghcr.io | -| 5020 | quay.io | -| 5030 | registry.k8s.io | -| 5040 | reg.kyverno.io | - -The decommissioned private registry (port 5050) is now hosted on -Forgejo at `forgejo.viktorbarzin.me/viktor/`. See -`docs/plans/2026-05-07-forgejo-registry-consolidation-plan.md` for the -migration. Break-glass tarballs of `infra-ci` are still produced on -each build to `/opt/registry/data/private/_breakglass/` — see -`docs/runbooks/forgejo-registry-breakglass.md`. +The registry VM hosts `registry.viktorbarzin.me` (private Docker +registry, htpasswd-auth, NGINX → registry:2). It is an Ubuntu 24.04 +VM on the cluster LAN subnet `10.0.20.0/24`, with a static netplan +config (no DHCP). 
Because it sits on a subnet that only has pfSense +as its gateway, its DNS must be statically configured. ## DNS configuration diff --git a/docs/runbooks/woodpecker-onboard-forgejo-repo.md b/docs/runbooks/woodpecker-onboard-forgejo-repo.md deleted file mode 100644 index 0a4de682..00000000 --- a/docs/runbooks/woodpecker-onboard-forgejo-repo.md +++ /dev/null @@ -1,73 +0,0 @@ -# Runbook: Onboarding a new Forgejo repo to Woodpecker - -Last updated: 2026-05-07 - -When you create a new repo on `forgejo.viktorbarzin.me`, Woodpecker -does NOT auto-discover it via the cluster's existing OAuth session. -The `forgejo` user inside Woodpecker (Forgejo-OAuth'd) needs to: - -1. Open `https://ci.viktorbarzin.me/` in a browser. -2. Log in via Forgejo OAuth (the "Sign in with Forgejo" button). -3. Click "Add Repository" — your new repo should appear. -4. Click the toggle to activate it. Woodpecker will: - - Add a webhook on the Forgejo repo (push, PR, release events). - - Register the repo's `forge_remote_id` in its DB so subsequent - hooks deserialize correctly. -5. Push a commit (or hit "Run pipeline" in Woodpecker UI) — first - build fires. - -## Why API-only doesn't work - -The webhook URL contains a JWT signed with a per-server key that's -stored in the DB and only accessible at OAuth-flow time. POST'ing -`/api/repos` as the admin (`ViktorBarzin` GitHub user) returns 500 -because the lookup queries forge-side OAuth state for THAT user, -which doesn't exist for the Forgejo `viktor` user. We confirmed: - -- Direct `POST /api/repos?forge_remote_id=N` → HTTP 500 server-side. -- Generating a JWT with the agent secret → "token is unverifiable" - on hook delivery (the signing key is repo-specific, not the - global agent secret). - -There's no admin endpoint that side-steps the OAuth flow. - -## Bootstrap when UI access isn't available - -If you absolutely need to bootstrap a new image without UI access -(e.g., during an outage), the workaround is: - -1. 
Build locally: - ```bash - docker build -t forgejo.viktorbarzin.me/viktor/: /path/to/source - docker push forgejo.viktorbarzin.me/viktor/: - ``` -2. Or pull from another already-built source and retag: - ```bash - docker pull viktorbarzin/: # DockerHub - docker tag viktorbarzin/: forgejo.viktorbarzin.me/viktor/: - docker push forgejo.viktorbarzin.me/viktor/: - ``` -3. Flip the cluster `image=` reference and restart deployments. - -Document the bootstrap in the relevant stack so future maintainers -know the image was put there by hand. After Woodpecker UI onboarding, -the next pipeline run replaces the bootstrap image with a CI-built one. - -## Repos onboarded in flight 2026-05-07 - -These were created during the forgejo-registry-consolidation but the -UI step above hasn't been done yet — their `.woodpecker.yml` / -`.woodpecker/build.yml` exists on Forgejo but no pipeline fires: - -- `viktor/broker-sync` — image bootstrapped via DockerHub (see - `infra/stacks/wealthfolio/main.tf` comment). -- `viktor/fire-planner` — image bootstrapped via local docker build. -- `viktor/hmrc-sync` -- `viktor/freedify` -- `viktor/claude-agent-service` -- `viktor/beadboard` — image bootstrapped via local docker build. -- `viktor/claude-memory-mcp` - -Walk through each in the Woodpecker UI to enable. Pipelines for -already-onboarded repos (payslip-ingest, job-hunter, infra) fired -correctly after the v3.13 → v3.14 upgrade. diff --git a/modules/docker-registry/docker-compose.yml b/modules/docker-registry/docker-compose.yml index 2a2b88c4..083e6bba 100644 --- a/modules/docker-registry/docker-compose.yml +++ b/modules/docker-registry/docker-compose.yml @@ -89,26 +89,35 @@ services: retries: 3 start_period: 10s - # registry-private decommissioned in Phase 4 of - # forgejo-registry-consolidation 2026-05-07 — image migration completed, - # cluster flipped to forgejo.viktorbarzin.me/viktor/. The remaining - # five services on this VM are pull-through caches for upstream registries. 
- # After 1 week of no incidents, `rm -rf /opt/registry/data/private/` on the - # VM frees ~2.6 GB. The tarball break-glass under - # /opt/registry/data/private/_breakglass/ stays — it's how we recover - # infra-ci if Forgejo ever goes fully down. + registry-private: + image: registry:2.8.3 + container_name: registry-private + restart: always + volumes: + - /opt/registry/data/private:/var/lib/registry + - /opt/registry/config-private.yml:/etc/docker/registry/config.yml:ro + - /opt/registry/htpasswd:/auth/htpasswd:ro + networks: + - registry + healthcheck: + # 401 is expected (auth required) — any HTTP response means the registry is healthy + test: ["CMD", "sh", "-c", "wget -qS -O /dev/null http://127.0.0.1:5000/v2/ 2>&1 | grep -q 'HTTP/'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s nginx: image: nginx:alpine container_name: registry-nginx restart: always - # 5050 dropped Phase 4 of forgejo-registry-consolidation 2026-05-07. ports: - "5000:5000" - "5010:5010" - "5020:5020" - "5030:5030" - "5040:5040" + - "5050:5050" volumes: - /opt/registry/nginx.conf:/etc/nginx/nginx.conf:ro - /opt/registry/tls:/etc/nginx/tls:ro @@ -126,6 +135,8 @@ services: condition: service_healthy registry-kyverno: condition: service_healthy + registry-private: + condition: service_healthy healthcheck: test: ["CMD", "sh", "-c", "wget -qO- http://127.0.0.1:5000/v2/ >/dev/null 2>&1"] interval: 30s diff --git a/modules/docker-registry/nginx_registry.conf b/modules/docker-registry/nginx_registry.conf index 46d7c16a..ec433340 100644 --- a/modules/docker-registry/nginx_registry.conf +++ b/modules/docker-registry/nginx_registry.conf @@ -33,9 +33,10 @@ http { keepalive 32; } - # `upstream private` removed in Phase 4 of forgejo-registry-consolidation - # 2026-05-07. The /v2/ private registry is now Forgejo at - # forgejo.viktorbarzin.me/viktor/. 
+ upstream private { + server registry-private:5000; + keepalive 32; + } # --- Docker Hub (port 5000) --- @@ -167,8 +168,37 @@ http { } } - # --- Private R/W Registry (port 5050) decommissioned Phase 4 2026-05-07 --- - # The TLS port 5050 server block previously fronted `registry-private`. - # Migrated to Forgejo at forgejo.viktorbarzin.me/viktor/. Both - # docker-compose.yml and this nginx config no longer reference port 5050. + # --- Private R/W Registry (port 5050, TLS) --- + + server { + listen 5050 ssl; + server_name registry.viktorbarzin.me; + + ssl_certificate /etc/nginx/tls/fullchain.pem; + ssl_certificate_key /etc/nginx/tls/privkey.pem; + ssl_protocols TLSv1.2 TLSv1.3; + + client_max_body_size 0; + proxy_request_buffering off; + proxy_buffering off; + chunked_transfer_encoding on; + + location /v2/ { + proxy_pass http://private; + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header Connection ""; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_read_timeout 900; + proxy_send_timeout 900; + } + + location / { + return 200 'ok'; + add_header Content-Type text/plain; + } + } } diff --git a/modules/kubernetes/ingress_factory/main.tf b/modules/kubernetes/ingress_factory/main.tf index 1975658e..8e893dca 100644 --- a/modules/kubernetes/ingress_factory/main.tf +++ b/modules/kubernetes/ingress_factory/main.tf @@ -40,9 +40,8 @@ variable "ingress_path" { default = ["/"] } variable "max_body_size" { - type = string - default = null - description = "Maximum request body size, e.g. '5g'. null = no limit (Traefik default). When set, a per-ingress Buffering middleware is created and attached." 
+ type = string + default = "50m" } variable "extra_annotations" { default = {} @@ -204,17 +203,6 @@ locals { "gethomepage.dev/href" = "https://${local.effective_host}" "gethomepage.dev/icon" = "${replace(var.name, "-", "")}.png" } : {} - - # Parse "5g"/"50m"/"1024k"/"42" into bytes. Traefik's Buffering middleware - # takes maxRequestBodyBytes as an integer. Empty unit = bytes. - body_size_match = var.max_body_size == null ? null : regex("^([0-9]+)([kmgKMG]?)$", var.max_body_size) - body_size_unit_multiplier = var.max_body_size == null ? 0 : ( - lower(local.body_size_match[1]) == "g" ? 1073741824 : - lower(local.body_size_match[1]) == "m" ? 1048576 : - lower(local.body_size_match[1]) == "k" ? 1024 : - 1 - ) - max_body_size_bytes = var.max_body_size == null ? 0 : tonumber(local.body_size_match[0]) * local.body_size_unit_multiplier } @@ -257,7 +245,6 @@ resource "kubernetes_ingress_v1" "proxied-ingress" { var.protected ? "traefik-authentik-forward-auth@kubernetescrd" : null, var.allow_local_access_only ? "traefik-local-only@kubernetescrd" : null, var.custom_content_security_policy != null ? "${var.namespace}-custom-csp-${var.name}@kubernetescrd" : null, - var.max_body_size != null ? "${var.namespace}-buffering-${var.name}@kubernetescrd" : null, ], var.extra_middlewares))) "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure" }, local.homepage_defaults, var.extra_annotations, @@ -315,27 +302,6 @@ resource "kubernetes_manifest" "custom_csp" { } } -# Buffering middleware - created per service when max_body_size is set. -# Traefik default is unlimited; setting maxRequestBodyBytes enforces a limit -# (e.g. Forgejo container pushes can ship multi-GB layer blobs). -resource "kubernetes_manifest" "buffering" { - count = var.max_body_size != null ? 
1 : 0 - - manifest = { - apiVersion = "traefik.io/v1alpha1" - kind = "Middleware" - metadata = { - name = "buffering-${var.name}" - namespace = var.namespace - } - spec = { - buffering = { - maxRequestBodyBytes = local.max_body_size_bytes - } - } - } -} - # Cloudflare DNS records — created automatically when dns_type is set. # Proxied: CNAME to Cloudflare tunnel. Non-proxied: A + AAAA to public IP. resource "cloudflare_record" "proxied" { diff --git a/scripts/forgejo-migrate-orphan-images.sh b/scripts/forgejo-migrate-orphan-images.sh deleted file mode 100755 index 2bd77e35..00000000 --- a/scripts/forgejo-migrate-orphan-images.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash -# One-shot migration of every private image on registry.viktorbarzin.me to -# Forgejo. Used as a stop-gap when the dual-push CI pipelines aren't -# producing Forgejo images on their own (Forgejo-Woodpecker forge driver -# context-deadline-exceeded issue, see bd code-d3y / 2026-05-07). -# -# Pulls each image from registry.viktorbarzin.me, retags, pushes to -# forgejo.viktorbarzin.me/viktor/: — preserving the blob bytes -# verbatim so the cluster can flip image= without a rebuild. -# -# Run from any host with docker + network reach to BOTH registries. Auth -# from `docker login` (~/.docker/config.json) — make sure both registries -# are logged in: -# docker login registry.viktorbarzin.me -u viktorbarzin -# docker login forgejo.viktorbarzin.me -u viktor # use viktor PAT, not ci-pusher -# -# (ci-pusher CANNOT push to viktor/ — Forgejo container packages -# are scoped to the pushing user. Only viktor's PAT can write to viktor/*.) -# -# After the script, the new image lives at -# forgejo.viktorbarzin.me/viktor/: -# Phase 3 of the consolidation flips infra/stacks//main.tf image= -# to that path. - -set -euo pipefail - -OLD_REG=registry.viktorbarzin.me -NEW_REG=forgejo.viktorbarzin.me/viktor - -# Image list: :. 
Generated 2026-05-07 from `grep -rEn 'image\s*=\s* -# "registry\.viktorbarzin\.me'` across infra/stacks/. -# -# Excluded: -# - wealthfolio-sync: registry repo exists but has 0 tags (CronJob has been -# broken for 36+ days, separate decision needed). User to triage before -# migration. -# - fire-planner: registry repo exists but has 0 tags. Dockerfile + CI added -# in this session (commit 8b53d99e); rebuild via Woodpecker before flipping. -IMAGES=( - "chrome-service-novnc:v4" - "chrome-service-novnc:latest" - "payslip-ingest:latest" - "job-hunter:latest" - "claude-agent-service:latest" - "freedify:latest" - "beadboard:latest" - "infra-ci:latest" -) - -for img in "${IMAGES[@]}"; do - echo "=== $img ===" - src="$OLD_REG/$img" - dst="$NEW_REG/$img" - - if ! docker pull "$src" 2>&1 | tee /tmp/pull-$$ | grep -q 'Status: '; then - if grep -q 'not found' /tmp/pull-$$; then - echo " SKIP — image not present in source registry" - rm -f /tmp/pull-$$ - continue - fi - fi - rm -f /tmp/pull-$$ - - echo " tag → $dst" - docker tag "$src" "$dst" - - echo " push $dst" - docker push "$dst" 2>&1 | tail -2 - - echo " cleanup local copy" - docker rmi "$src" "$dst" 2>&1 | tail -1 || true -done - -echo "" -echo "Done. Verify in Forgejo Web UI: https://forgejo.viktorbarzin.me/viktor/-/packages?type=container" -echo "Phase 3 of the plan flips infra/stacks/{wealthfolio,fire-planner}/main.tf image= references." diff --git a/scripts/setup-forgejo-containerd-mirror.sh b/scripts/setup-forgejo-containerd-mirror.sh deleted file mode 100755 index 1e4625fd..00000000 --- a/scripts/setup-forgejo-containerd-mirror.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash -# One-shot deployment of the forgejo.viktorbarzin.me containerd hosts.toml -# entry across every k8s node. Cloud-init only fires on VM provision, so -# existing nodes need this manual rollout. -# -# What it does, per node: -# 1. drain (ignore-daemonsets, delete-emptydir-data) -# 2. 
ssh in: mkdir + write /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml -# 3. systemctl restart containerd -# 4. uncordon -# -# hosts.toml is documented as hot-reloaded but the post-2026-04-19 -# containerd corruption playbook calls for an explicit restart so the -# config is unambiguously in effect. Running drain/uncordon around it -# avoids pulling against an in-flight containerd restart. -# -# Re-run is safe: writes are idempotent. - -set -euo pipefail - -CERTS_DIR=/etc/containerd/certs.d/forgejo.viktorbarzin.me -HOSTS_TOML='server = "https://forgejo.viktorbarzin.me" - -[host."https://10.0.20.200"] - capabilities = ["pull", "resolve"] -' - -NODES=$(kubectl get nodes -o name | sed 's|^node/||') -if [[ -z "$NODES" ]]; then - echo "ERROR: no nodes returned from kubectl get nodes" >&2 - exit 1 -fi - -for n in $NODES; do - echo "=== $n ===" - kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data --force --grace-period=60 - - ssh -o StrictHostKeyChecking=accept-new "wizard@$n" sudo bash < "$CERTS_DIR/hosts.toml" <<'TOML' -$HOSTS_TOML -TOML -systemctl restart containerd -EOF - - kubectl uncordon "$n" - - # Wait for the node to report Ready before moving to the next one. - for i in {1..30}; do - if kubectl get node "$n" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' | grep -q True; then - echo " node Ready" - break - fi - sleep 2 - done -done - -echo "All nodes updated." diff --git a/stacks/beads-server/main.tf b/stacks/beads-server/main.tf index 006b9db6..e11b0ac7 100644 --- a/stacks/beads-server/main.tf +++ b/stacks/beads-server/main.tf @@ -567,8 +567,7 @@ resource "kubernetes_deployment" "beadboard" { container { name = "beadboard" - # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. 
- image = "forgejo.viktorbarzin.me/viktor/beadboard:${var.beadboard_image_tag}" + image = "registry.viktorbarzin.me:5050/beadboard:${var.beadboard_image_tag}" port { name = "http" @@ -726,8 +725,7 @@ resource "kubernetes_config_map" "beads_metadata" { } locals { - # Phase 3 cutover 2026-05-07 — Forgejo registry consolidation. - claude_agent_service_image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.claude_agent_service_image_tag}" + claude_agent_service_image = "registry.viktorbarzin.me/claude-agent-service:${var.claude_agent_service_image_tag}" beadboard_internal_url = "http://${kubernetes_service.beadboard.metadata[0].name}.${kubernetes_namespace.beads.metadata[0].name}.svc.cluster.local" beads_script_prelude = <<-EOT diff --git a/stacks/chrome-service/README.md b/stacks/chrome-service/README.md deleted file mode 100644 index b18e9116..00000000 --- a/stacks/chrome-service/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# chrome-service - -In-cluster headed Chromium exposed over Playwright's WebSocket protocol. -Sibling services drive it instead of running their own in-process browser -— useful when the upstream tries to detect headless mode (e.g. hmembeds' -`disable-devtool.js` redirect-to-google trap). - -## Connect - -```python -from playwright.async_api import async_playwright - -WS_URL = "ws://chrome-service.chrome-service.svc.cluster.local:3000" -WS_TOKEN = os.environ["CHROME_WS_TOKEN"] # 32-byte URL-safe random - -async with async_playwright() as p: - browser = await p.chromium.connect(f"{WS_URL}/{WS_TOKEN}", timeout=15_000) - context = await browser.new_context() - await context.add_init_script(STEALTH_JS) # see files/stealth.js - page = await context.new_page() - ... - await browser.close() -``` - -The token comes from Vault KV `secret/chrome-service.api_bearer_token`, -which ESO syncs into a per-namespace K8s Secret in each caller stack -(see f1-stream's `chrome-service-client-secrets`). - -## Add a new caller - -1. 
**Label the caller's namespace** so the chrome-service NetworkPolicy - admits it: - ```hcl - resource "kubernetes_namespace" "" { - metadata { - labels = { - "chrome-service.viktorbarzin.me/client" = "true" - } - } - } - ``` -2. **Add an ExternalSecret** in the caller stack pulling the token: - ```hcl - resource "kubernetes_manifest" "chrome_token" { - manifest = { - apiVersion = "external-secrets.io/v1beta1" - kind = "ExternalSecret" - metadata = { name = "chrome-service-client-secrets", namespace = "" } - spec = { - refreshInterval = "15m" - secretStoreRef = { name = "vault-kv", kind = "ClusterSecretStore" } - target = { name = "chrome-service-client-secrets" } - dataFrom = [{ extract = { key = "chrome-service" } }] - } - } - } - ``` -3. **Inject `CHROME_WS_URL` + `CHROME_WS_TOKEN`** into the caller's pod env. - Use `secret_key_ref` for the token; the URL is a plain value. -4. **Vendor `stealth.js`** into the caller (or just paste — it's ~40 lines) - and apply via `await context.add_init_script(STEALTH_JS)` after every - `new_context()`. Without it, hmembeds-class anti-bot still trips. - -## Image pin - -Both the server image (`mcr.microsoft.com/playwright:v1.48.0-noble` in -`main.tf`) and the client (`playwright==1.48.0` in callers' requirements) -must match minor-versions. Bump in lockstep — Playwright protocol changes -between minors. - -## Operations - -- **Storage**: encrypted PVC at `/profile` for cookies + npm cache. Ephemeral - contexts (`browser.new_context()`) bypass the profile; persistent contexts - share it. Backed up tar+gzip every 6h to `/srv/nfs/chrome-service-backup/`, - 30-day retention. -- **Probes**: TCP/3000. Playwright run-server has no HTTP `/health`; a TCP - open is the only liveness signal available without spinning a browser. -- **Health page**: visit `https://chrome.viktorbarzin.me` (Authentik-gated) - to confirm the pod is up. The WS port stays internal-only. 
-- **Token rotation**: `vault kv put secret/chrome-service api_bearer_token=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')`. - Reloader cascades the rotation to both the server pod and any caller - whose secret has the `reloader.stakater.com/auto = "true"` annotation. - -## Why headed (Xvfb) instead of headless? - -`disable-devtool.js` and similar libraries detect `navigator.webdriver`, -console-clear timing, and the `HeadlessChromium/...` user-agent suffix. -Running headed inside `Xvfb :99` reports as a normal Chromium, and the -stealth init script handles the JS-visible giveaways. diff --git a/stacks/chrome-service/files/novnc/Dockerfile b/stacks/chrome-service/files/novnc/Dockerfile deleted file mode 100644 index e447a6da..00000000 --- a/stacks/chrome-service/files/novnc/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM docker.io/library/ubuntu:24.04 - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - x11vnc \ - novnc \ - websockify \ - ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -# noVNC ships /usr/share/novnc/vnc.html; alias to index.html so / works. -RUN ln -sf /usr/share/novnc/vnc.html /usr/share/novnc/index.html - -EXPOSE 6080 - -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -CMD ["/entrypoint.sh"] diff --git a/stacks/chrome-service/files/novnc/entrypoint.sh b/stacks/chrome-service/files/novnc/entrypoint.sh deleted file mode 100644 index 1ec6657f..00000000 --- a/stacks/chrome-service/files/novnc/entrypoint.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash -# Connect to the chrome-service container's Xvfb (shared pod network, TCP) -# and serve the noVNC HTML5 client + websockify bridge on :6080. 
-set -e - -for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do - if echo > /dev/tcp/127.0.0.1/6099 2>/dev/null; then - echo "Xvfb TCP up after attempt $i" - break - fi - echo "waiting for Xvfb TCP 6099 attempt=$i" - sleep 2 -done - -# websockify runs as PID 1; x11vnc is a child so its logs land on container stdout -# `-noshm` skips MIT-SHM probes that fail across container boundaries (each -# container has its own /dev/shm); `-noxdamage` skips XDAMAGE which Xvfb -# doesn't expose; `-quiet` keeps the polling chatter out of pod logs. -echo "starting x11vnc -> :5900" -x11vnc -display localhost:99 -nopw -listen 0.0.0.0 -rfbport 5900 \ - -forever -shared -noshm -noxdamage -quiet 2>&1 & -X11VNC_PID=$! - -for i in 1 2 3 4 5 6 7 8 9 10; do - if echo > /dev/tcp/127.0.0.1/5900 2>/dev/null; then - echo "x11vnc bound 5900 after attempt $i" - break - fi - echo "waiting for x11vnc :5900 attempt=$i" - sleep 2 -done - -if ! echo > /dev/tcp/127.0.0.1/5900 2>/dev/null; then - echo "ERROR: x11vnc did not bind 5900" - exit 1 -fi - -echo "starting websockify -> :6080" -exec websockify --web=/usr/share/novnc 6080 localhost:5900 diff --git a/stacks/chrome-service/files/stealth.js b/stacks/chrome-service/files/stealth.js deleted file mode 100644 index dfae98a8..00000000 --- a/stacks/chrome-service/files/stealth.js +++ /dev/null @@ -1,54 +0,0 @@ -// Minimal stealth init script for Playwright-driven Chromium. -// Vendored from puppeteer-extra-plugin-stealth/evasions/* (MIT) — covers: -// webdriver, chrome.runtime, navigator.plugins, navigator.languages, -// Permissions.query, WebGL getParameter (vendor + renderer spoof). -// Run via context.add_init_script() so it executes before any page script. -(() => { - // navigator.webdriver — most common detection, removed entirely. - Object.defineProperty(Navigator.prototype, 'webdriver', { get: () => undefined }); - - // window.chrome.runtime — many sites check that real Chrome exposes this. 
- if (!window.chrome) window.chrome = {}; - window.chrome.runtime = window.chrome.runtime || {}; - - // navigator.plugins — headless reports zero; spoof a plausible PDF viewer. - Object.defineProperty(navigator, 'plugins', { - get: () => [{ name: 'Chrome PDF Plugin' }, { name: 'Chrome PDF Viewer' }, { name: 'Native Client' }], - }); - - // navigator.languages — headless returns empty array. - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - - // Permissions.query — headless returns 'denied' for notifications instead of 'default'. - const origQuery = window.navigator.permissions && window.navigator.permissions.query; - if (origQuery) { - window.navigator.permissions.query = (parameters) => - parameters && parameters.name === 'notifications' - ? Promise.resolve({ state: Notification.permission }) - : origQuery(parameters); - } - - // WebGL getParameter — spoof vendor + renderer strings to a real GPU. - const spoofGl = (proto) => { - if (!proto) return; - const orig = proto.getParameter; - proto.getParameter = function (parameter) { - if (parameter === 37445) return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL - if (parameter === 37446) return 'Intel Iris OpenGL Engine'; // UNMASKED_RENDERER_WEBGL - return orig.apply(this, arguments); - }; - }; - spoofGl(window.WebGLRenderingContext && window.WebGLRenderingContext.prototype); - spoofGl(window.WebGL2RenderingContext && window.WebGL2RenderingContext.prototype); - - // disable-devtool.js (theajack/disable-devtool) auto-inits via a script - // tag with `disable-devtool-auto`. Its Performance detector trips under - // Playwright (CDP adds console.log latency vs console.table) and the - // redirect URL is hard-coded — for hmembeds that's google.com. - // Hide the auto-init marker so the library's IIFE exits early. 
- const origQS = Document.prototype.querySelector; - Document.prototype.querySelector = function (sel) { - if (typeof sel === 'string' && sel.indexOf('disable-devtool-auto') !== -1) return null; - return origQS.apply(this, arguments); - }; -})(); diff --git a/stacks/chrome-service/main.tf b/stacks/chrome-service/main.tf deleted file mode 100644 index 13ab49ee..00000000 --- a/stacks/chrome-service/main.tf +++ /dev/null @@ -1,504 +0,0 @@ -variable "tls_secret_name" { - type = string - sensitive = true -} -variable "nfs_server" { type = string } - -locals { - namespace = "chrome-service" - labels = { - app = "chrome-service" - } - # Pin to the same Playwright minor that the Python client requires. - # If you bump this image, also bump `playwright==X.Y.Z` in the client - # (currently f1-stream) and re-run the connect smoke test. - image = "mcr.microsoft.com/playwright:v1.48.0-noble" -} - -# --- Namespace --- - -resource "kubernetes_namespace" "chrome_service" { - metadata { - name = local.namespace - labels = { - "istio-injection" = "disabled" - tier = local.tiers.aux - "chrome-service.viktorbarzin.me/server" = "true" - } - } - lifecycle { - # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace - ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] - } -} - -# --- Secrets (single-key extract: api_bearer_token) --- - -resource "kubernetes_manifest" "external_secret" { - manifest = { - apiVersion = "external-secrets.io/v1beta1" - kind = "ExternalSecret" - metadata = { - name = "chrome-service-secrets" - namespace = local.namespace - } - spec = { - refreshInterval = "15m" - secretStoreRef = { - name = "vault-kv" - kind = "ClusterSecretStore" - } - target = { - name = "chrome-service-secrets" - } - dataFrom = [{ - extract = { - key = "chrome-service" - } - }] - } - } - depends_on = [kubernetes_namespace.chrome_service] -} - -# tls-secret for the chrome.viktorbarzin.me ingress is auto-cloned into -# 
every namespace by Kyverno's `sync-tls-secret` ClusterPolicy — no local -# module call needed. - -# --- Encrypted profile PVC --- -# Holds Chromium user data: cookies, localStorage, IndexedDB. Sites we -# drive may set auth tokens or session cookies — encrypted is correct. -resource "kubernetes_persistent_volume_claim" "profile_encrypted" { - wait_until_bound = false - metadata { - name = "chrome-service-profile-encrypted" - namespace = kubernetes_namespace.chrome_service.metadata[0].name - annotations = { - "resize.topolvm.io/threshold" = "80%" - "resize.topolvm.io/increase" = "100%" - "resize.topolvm.io/storage_limit" = "10Gi" - } - } - spec { - access_modes = ["ReadWriteOnce"] - storage_class_name = "proxmox-lvm-encrypted" - resources { - requests = { - storage = "2Gi" - } - } - } -} - -# --- NFS backup target --- -module "nfs_chrome_service_backup_host" { - source = "../../modules/kubernetes/nfs_volume" - name = "chrome-service-backup-host" - namespace = kubernetes_namespace.chrome_service.metadata[0].name - nfs_server = "192.168.1.127" - nfs_path = "/srv/nfs/chrome-service-backup" -} - -# --- Deployment --- - -resource "kubernetes_deployment" "chrome_service" { - metadata { - name = "chrome-service" - namespace = kubernetes_namespace.chrome_service.metadata[0].name - labels = merge(local.labels, { - tier = local.tiers.aux - }) - annotations = { - "reloader.stakater.com/auto" = "true" - } - } - spec { - replicas = 1 - strategy { - type = "Recreate" - } - selector { - match_labels = local.labels - } - template { - metadata { - labels = local.labels - } - spec { - # The noVNC sidecar pulls from registry.viktorbarzin.me which needs - # auth. Kyverno's `sync-registry-credentials` ClusterPolicy syncs - # the secret into every namespace. 
- image_pull_secrets { - name = "registry-credentials" - } - security_context { - run_as_user = 1000 - run_as_group = 1000 - fs_group = 1000 - seccomp_profile { - type = "RuntimeDefault" - } - } - - # Fix profile dir ownership (PVC may have root-owned files from prior run). - init_container { - name = "fix-perms" - image = "busybox:1.37" - command = ["sh", "-c", "chown -R 1000:1000 /profile"] - security_context { - run_as_user = 0 - } - volume_mount { - name = "profile" - mount_path = "/profile" - } - resources { - requests = { memory = "32Mi" } - limits = { memory = "64Mi" } - } - } - - container { - name = "chrome-service" - image = local.image - image_pull_policy = "IfNotPresent" - - # `launch-server` (not `run-server`) lets us pin headed mode + - # specific args. `run-server` defaults to headless, which the - # disable-devtool.js Performance detector trips under Playwright - # (CDP adds latency to console.log; lib detects + redirects). - # The Microsoft image ships only the browsers, not the playwright - # npm package itself — `npx -y playwright@` downloads it on - # first start (cached under $HOME/.npm via the PVC) and pins to - # the same minor as the Python client. Bump in lockstep. - command = ["bash", "-c"] - args = [ - <<-EOT - set -e - # `-listen tcp` enables localhost:6099 so the noVNC sidecar can - # connect over the pod's shared network namespace (Ubuntu 24.04 - # defaults Xvfb to -nolisten tcp). - # `-ac` disables X access control so the noVNC sidecar can - # attach without an MIT-MAGIC-COOKIE; safe because Xvfb only - # listens on localhost (pod's lo). - Xvfb :99 -screen 0 1280x720x24 -listen tcp -ac & - sleep 1 - cat > /tmp/launch.json < regardless of upstream policy. -2. 
Inject + a frame-buster-defeat -""" - - -def _decode(encoded_url: str) -> str: - try: - return decode_url(encoded_url) - except Exception as e: - raise HTTPException(status_code=400, detail=f"Invalid encoded URL: {e}") - - -def _filter_headers(upstream_headers: httpx.Headers) -> dict[str, str]: - """Forward upstream headers minus the ones we strip.""" - out: dict[str, str] = {} - for k, v in upstream_headers.items(): - if k.lower() in STRIP_RESPONSE_HEADERS: - continue - out[k] = v - # Always allow our domain to embed and load cross-origin - out["Access-Control-Allow-Origin"] = "*" - out["X-Frame-Options-Stripped"] = "by-f1-embed-proxy" - return out - - -def _make_referer(upstream_url: str) -> str: - """Build a plausible Referer header — the upstream's own root.""" - parsed = urlparse(upstream_url) - return f"{parsed.scheme}://{parsed.netloc}/" - - -def _make_origin(upstream_url: str) -> str: - parsed = urlparse(upstream_url) - return f"{parsed.scheme}://{parsed.netloc}" - - -def _inject_into_head(html: str, upstream_url: str) -> str: - """Inject tag + frame-buster defeat script into the response HTML.""" - parsed = urlparse(upstream_url) - base_href = f"{parsed.scheme}://{parsed.netloc}/" - - # The frame-buster-defeat script. Use the upstream's own URL as the spoofed referrer. - busted = _FRAME_BUSTER_DEFEAT_TEMPLATE.format(referrer=upstream_url) - - base_tag = f'' - - injection = base_tag + busted - - # Drop any inline CSP tags first so they can't override our header strip. - html = re.sub( - r']+http-equiv=[\'"]?Content-Security-Policy[\'"]?[^>]*>', - "", - html, - flags=re.IGNORECASE, - ) - - # Strip disable-devtool.js script tags. The library runs detection heuristics - # and redirects on match. Removing it reduces attack surface even with our - # location-setter lockdown — saves redundant work and one fewer thing to - # bypass in case the lockdown misses an edge case. 
- html = re.sub( - r']+(?:disable-devtool|devtool|disabledevtool)[^<]*', - "", - html, - flags=re.IGNORECASE, - ) - html = re.sub( - r']+src=["\'][^"\']*disable-devtool[^"\']*["\'][^>]*>', - "", - html, - flags=re.IGNORECASE, - ) - - # Insert immediately after the opening (case-insensitive). - head_match = re.search(r"]*>", html, flags=re.IGNORECASE) - if head_match: - idx = head_match.end() - return html[:idx] + injection + html[idx:] - - # No — prepend at the start of the document so the script runs first. - return injection + html - - -def _looks_blocked_by_anti_bot(content: str) -> bool: - """Detect Cloudflare-style challenge interstitials in the upstream body.""" - sample = content[:4096].lower() - markers = ( - "cf-chl-bypass", - "checking your browser", - "just a moment", - "attention required", - "cf-browser-verification", - ) - return any(m in sample for m in markers) - - -async def fetch_embed(encoded_url: str) -> tuple[bytes, dict[str, str], int]: - """Fetch an upstream embed page, rewrite the HTML, and return the response. - - Returns: (body_bytes, headers_dict, status_code). - Raises HTTPException on transport errors. 
- """ - url = _decode(encoded_url) - logger.info("Embed-proxying: %s", url) - - upstream_headers = { - "User-Agent": USER_AGENT, - "Referer": _make_referer(url), - "Origin": _make_origin(url), - "Accept": ( - "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8" - ), - "Accept-Language": "en-US,en;q=0.9", - } - - try: - async with httpx.AsyncClient( - timeout=EMBED_TIMEOUT, - follow_redirects=True, - ) as client: - response = await client.get(url, headers=upstream_headers) - except httpx.TimeoutException: - raise HTTPException(status_code=504, detail="Upstream embed timeout") - except httpx.HTTPError as e: - raise HTTPException(status_code=502, detail=f"Upstream embed error: {e}") - - status_code = response.status_code - upstream_ct = response.headers.get("content-type", "") - headers_out = _filter_headers(response.headers) - - body = response.content - - # Detect Cloudflare-style challenge so the frontend can show a clear error. - if "html" in upstream_ct.lower(): - text = response.text - if _looks_blocked_by_anti_bot(text): - logger.warning("Upstream returned anti-bot challenge: %s", url) - raise HTTPException( - status_code=502, - detail="Upstream returned anti-bot challenge — proxy cannot bypass", - ) - - rewritten = _inject_into_head(text, url) - body = rewritten.encode("utf-8") - headers_out["Content-Type"] = "text/html; charset=utf-8" - - return body, headers_out, status_code - - -async def relay_asset( - encoded_url: str, range_header: str | None -) -> tuple[AsyncGenerator[bytes, None], dict[str, str], int]: - """Relay an upstream subresource (JS/CSS/image/font) as a chunked stream. - - Used as a fallback when an upstream blocks hotlinked assets via Referer - or Origin checks. The injected tag handles most of these cases - by letting the browser hit upstream directly — the relay is only for - the awkward few that need a proxied origin. 
- """ - url = _decode(encoded_url) - logger.debug("Embed-asset relay: %s", url) - - headers = { - "User-Agent": USER_AGENT, - "Referer": _make_referer(url), - "Origin": _make_origin(url), - "Accept": "*/*", - } - if range_header: - headers["Range"] = range_header - - client = httpx.AsyncClient(timeout=ASSET_TIMEOUT, follow_redirects=True) - - try: - response = await client.send( - client.build_request("GET", url, headers=headers), - stream=True, - ) - except httpx.TimeoutException: - await client.aclose() - raise HTTPException(status_code=504, detail="Upstream asset timeout") - except httpx.HTTPError as e: - await client.aclose() - raise HTTPException(status_code=502, detail=f"Upstream asset error: {e}") - - if response.status_code >= 400: - await response.aclose() - await client.aclose() - raise HTTPException( - status_code=502, - detail=f"Upstream asset returned HTTP {response.status_code}", - ) - - headers_out = _filter_headers(response.headers) - - async def _stream() -> AsyncGenerator[bytes, None]: - try: - async for chunk in response.aiter_bytes(chunk_size=RELAY_CHUNK_SIZE): - yield chunk - finally: - await response.aclose() - await client.aclose() - - return _stream(), headers_out, response.status_code diff --git a/stacks/f1-stream/files/backend/extractors/__init__.py b/stacks/f1-stream/files/backend/extractors/__init__.py index 72e4d667..49b5c4d7 100644 --- a/stacks/f1-stream/files/backend/extractors/__init__.py +++ b/stacks/f1-stream/files/backend/extractors/__init__.py @@ -12,20 +12,12 @@ Example: """ from backend.extractors.aceztrims import AceztrimsExtractor -from backend.extractors.chrome_browser import ChromeBrowserExtractor -from backend.extractors.curated import CuratedExtractor -from backend.extractors.dd12 import DD12Extractor -from backend.extractors.stremio import StremioAddonExtractor -from backend.extractors.subreddit import SubredditExtractor from backend.extractors.daddylive import DaddyLiveExtractor -from backend.extractors.discord_source 
import DiscordExtractor +from backend.extractors.demo import DemoExtractor from backend.extractors.models import ExtractedStream -from backend.extractors.pitsport import PitsportExtractor -from backend.extractors.ppv import PPVExtractor from backend.extractors.registry import ExtractorRegistry from backend.extractors.service import ExtractionService from backend.extractors.streamed import StreamedExtractor -from backend.extractors.timstreams import TimStreamsExtractor __all__ = [ "ExtractedStream", @@ -44,36 +36,10 @@ def create_registry() -> ExtractorRegistry: registry = ExtractorRegistry() # --- Register extractors below --- - # CuratedExtractor previously surfaced two hmembeds 24/7 channels (Sky - # Sports F1, DAZN F1) but their JW Player decoder produces an empty - # playlist in our environment (error 102630) regardless of headed mode, - # IP, or fingerprint we tried. The streams loaded the upstream's ad - # overlay but never produced a video element, so they confused users — - # disabled until/unless we find a working bypass. - # registry.register(CuratedExtractor()) + registry.register(DemoExtractor()) registry.register(StreamedExtractor()) - # ChromeBrowserExtractor drives the in-cluster chrome-service via the - # CHROME_WS_URL / CHROME_WS_TOKEN env vars to scrape JS-rendered - # pages whose m3u8 is computed at runtime. - registry.register(ChromeBrowserExtractor()) - # SubredditExtractor pulls live-stream posts from motorsport subreddits. - # Returns embed-type streams; the verifier will visit each via - # chrome-service to confirm playability. - registry.register(SubredditExtractor()) - # DD12Extractor scrapes DD12Streams' per-channel pages for the inline - # JW Player file URL. The site embeds the m3u8 in HTML so curl-based - # parsing is enough — no browser needed. - registry.register(DD12Extractor()) - # StremioAddonExtractor calls Stremio addon HTTP APIs (TvVoo, StremVerse) - # which already index Sky F1 / DAZN F1 / Vavoo IPTV channels. 
No - # Stremio client needed — just /stream//.json calls. - registry.register(StremioAddonExtractor()) registry.register(DaddyLiveExtractor()) registry.register(AceztrimsExtractor()) - registry.register(PitsportExtractor()) - registry.register(PPVExtractor()) - registry.register(TimStreamsExtractor()) - registry.register(DiscordExtractor()) return registry diff --git a/stacks/f1-stream/files/backend/extractors/chrome_browser.py b/stacks/f1-stream/files/backend/extractors/chrome_browser.py deleted file mode 100644 index 299790d4..00000000 --- a/stacks/f1-stream/files/backend/extractors/chrome_browser.py +++ /dev/null @@ -1,243 +0,0 @@ -"""Generic chrome-service-driven extractor. - -Drives the in-cluster headed Chromium pool (chrome-service) to load a list -of stream/aggregator pages, captures any HLS playlist URL the page fetches -at runtime, and returns one ExtractedStream per discovered playlist. - -Unlike the API-based extractors (pitsport/streamed/ppv) this one handles -sites where the m3u8 is computed by JavaScript at page load time — the -URL only exists after the page evaluates an obfuscated decoder, fetches a -token, etc. Curl can't see it; a real browser can. - -Add new targets via the `TARGETS` constant below. Each entry is a (label, -title, page_url) tuple. The extractor visits each URL with a stealthed -context, waits for the JS to settle, and yields any captured HLS URL. -""" - -import asyncio -import logging -import os -import re -import urllib.parse -from dataclasses import dataclass - -from backend.extractors.base import BaseExtractor -from backend.extractors.models import ExtractedStream - -logger = logging.getLogger(__name__) - -# Best-effort pause between navigation and capture. The decoder usually -# fires within 5s; 12s gives slow JS time to settle without dragging the -# extraction round. 
-DEFAULT_SETTLE_SECONDS = 12 - -USER_AGENT = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/605.1.15 (KHTML, like Gecko) " - "Version/17.4 Safari/605.1.15" -) - - -@dataclass(frozen=True) -class _Target: - label: str # site_name (homepage label in the UI) - title: str # human-readable stream title - url: str # page to navigate - settle: int = DEFAULT_SETTLE_SECONDS - - -# --------------------------------------------------------------------------- -# Target list. F1-relevant 24/7 channels and motorsport aggregator pages -# whose m3u8 is JS-computed. Add freely — each one takes ~12s to scrape. -# --------------------------------------------------------------------------- -TARGETS: tuple[_Target, ...] = ( - # MotoMundo embed pages — the community-curated WordPress site for - # MotoGP. Each /e/ URL is one of the iframes their "Watch Online" - # post lists for the active session (FP/Q/Race). The m3u8 is - # JS-computed at load time so a real browser is required to capture - # it. Update IDs each weekend to match the current race; subreddit.py - # discovers them from the Reddit "[Watch / Download]" thread. - _Target( - label="MotoMundo", - title="MotoGP Live (MotoMundo) — French GP / Le Mans", - url="https://motomundo.top/e/9yzn08jk9py4", - settle=15, - ), - _Target( - label="MotoMundo", - title="MotoGP Live (MotoMundo upns) — French GP / Le Mans", - url="https://motomundo.upns.xyz/#kqasde", - settle=15, - ), -) - - -# Heuristic to recognise an HLS playlist URL from network capture. Most CDNs -# use `.m3u8`; some (pushembdz/oe1.ossfeed) disguise the playlist as `.css` -# under a /out/v… or /hls/ path. Filter out obvious junk (.css for actual -# stylesheets, .ts segments — we only want the playlist). 
-_HLS_URL_RE = re.compile(r"\.m3u8(\?|$)|/out/v[0-9]+/.+\.css(\?|$)|/hls/.+/master\.css(\?|$)") -_SEGMENT_EXT_RE = re.compile(r"\.(ts|m4s|aac|key)(\?|$)") - - -def _looks_like_hls_playlist(url: str) -> bool: - if _SEGMENT_EXT_RE.search(url): - return False - return bool(_HLS_URL_RE.search(url)) - - -def _resolve_chrome_ws() -> str | None: - base = os.getenv("CHROME_WS_URL") - token = os.getenv("CHROME_WS_TOKEN") - if not base or not token: - return None - return f"{base.rstrip('/')}/{token}" - - -class ChromeBrowserExtractor(BaseExtractor): - """Drive chrome-service to capture m3u8 URLs from JS-heavy pages.""" - - @property - def site_key(self) -> str: - return "chrome-browser" - - @property - def site_name(self) -> str: - return "Chrome Browser" - - async def extract(self) -> list[ExtractedStream]: - ws_url = _resolve_chrome_ws() - if not ws_url: - logger.warning( - "[chrome-browser] CHROME_WS_URL/TOKEN not set — extractor disabled" - ) - return [] - - try: - from playwright.async_api import async_playwright - except ImportError: - logger.warning("[chrome-browser] playwright not installed — disabled") - return [] - - # One Playwright instance + one browser connection per extraction - # round. Contexts are cheap; the browser is shared. 
- async with async_playwright() as p: - try: - browser = await p.chromium.connect(ws_url, timeout=15_000) - except Exception: - logger.exception("[chrome-browser] connect to chrome-service failed") - return [] - - results: list[ExtractedStream] = [] - for target in TARGETS: - try: - stream = await self._scrape(browser, target) - if stream: - results.append(stream) - except Exception: - logger.exception( - "[chrome-browser] failed to scrape %s", target.url - ) - - try: - await browser.close() - except Exception: - pass - - logger.info("[chrome-browser] returned %d stream(s)", len(results)) - return results - - async def _scrape(self, browser, target: _Target) -> ExtractedStream | None: - ctx = await browser.new_context( - user_agent=USER_AGENT, - viewport={"width": 1280, "height": 720}, - bypass_csp=True, - ) - # Inject the same stealth script the verifier uses so anti-bot - # checks don't trip the page before its decoder runs. - try: - from backend.stealth import STEALTH_JS - await ctx.add_init_script(STEALTH_JS) - except Exception: - pass - - page = await ctx.new_page() - captured: list[str] = [] - - def on_response(resp): - try: - if _looks_like_hls_playlist(resp.url): - captured.append(resp.url) - except Exception: - pass - - page.on("response", on_response) - # Some pages (DD12 variants) load the player in a child iframe; - # frame events catch nested navigations. - page.on( - "framenavigated", - lambda fr: captured.append(fr.url) if _looks_like_hls_playlist(fr.url) else None, - ) - - try: - await page.goto(target.url, wait_until="domcontentloaded", timeout=20_000) - except Exception as e: - logger.debug("[chrome-browser] %s goto failed: %s", target.url, e) - await ctx.close() - return None - - # Let the page's JS settle. - await asyncio.sleep(target.settle) - - # Also probe child iframes — `pushembdz`, `pooembed`, `embedsports` - # all live behind one. Collect any HLS URL the iframes loaded. 
- for fr in page.frames: - if fr is page.main_frame: - continue - try: - # JW Player and Clappr both expose the playing source via - # a