diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 08b14870..e57a4aba 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -144,7 +144,7 @@ Repo IDs: infra=1, Website=2, finance=3, health=4, travel_blog=5, webhook-handle - **PDBs**: minAvailable=2 on Traefik and Authentik. - **Fallback proxies**: basicAuth when Authentik is down, fail-open when poison-fountain is down. - **CrowdSec bouncer**: graceful degradation mode (fail-open on error). -- **Rate limiting**: Return 429 (not 503). Per-service tuning: Immich/Nextcloud need higher limits. +- **Rate limiting**: Return 429 (not 503). Per-service tuning via dedicated middleware + `skip_default_rate_limit` (default 10/s burst 50): Immich 1000/20000, ActualBudget 50/300 (app boot = ~70 parallel revalidations). - **Retry middleware**: 2 attempts, 100ms — in default ingress chain. - **Entrypoint transport timeouts** (`websecure` `respondingTimeouts`): `writeTimeout=0` (unlimited download duration), `readTimeout=3600s` (uploads ≤1h), `idleTimeout=600s`. These are **HARD total-duration caps**, not nginx-style per-read idle timeouts — a finite `writeTimeout` truncates *any* large download at that wall-clock mark (a prior `writeTimeout=60s` silently cut Immich videos at 60s). **Do NOT re-tighten `writeTimeout`**; keep `readTimeout` finite (slow-loris backstop) but ≥ longest expected upload. Full rationale: `docs/architecture/networking.md` → "Entrypoint Transport Timeouts". - **HTTP/3 (QUIC)**: Enabled on Traefik. Works for **direct (non-proxied) apps** via the dedicated LB IP below (ETP=Local). Proxied apps get QUIC at the Cloudflare edge. diff --git a/docs/architecture/networking.md b/docs/architecture/networking.md index 09437069..c34e9944 100644 --- a/docs/architecture/networking.md +++ b/docs/architecture/networking.md @@ -247,7 +247,7 @@ Every ingress created by the `ingress_factory` module follows this chain: 1. **CrowdSec Bouncer**: Checks IP against threat database. 
**Fail-open** mode — if LAPI is unreachable, traffic passes through to prevent outages. 2. **Authentik Forward-Auth** (if `protected = true`): SSO authentication via OIDC. Non-authenticated users are redirected to login. Auth headers are stripped before forwarding to backend. -3. **Rate Limiting**: Per-IP throttling. Returns **429 Too Many Requests** (not 503) when limit exceeded. Default limits are generous; services like Immich and Nextcloud have higher custom limits. +3. **Rate Limiting**: Per-IP throttling. Returns **429 Too Many Requests** (not 503) when limit exceeded. Default is `rate-limit` (average 10 req/s, burst 50). Services whose clients legitimately burst harder get a dedicated middleware via `skip_default_rate_limit = true` + `extra_middlewares`: Immich (`immich-rate-limit`, 1000/20000, photo uploads) and ActualBudget (`actualbudget-rate-limit`, 50/300 — the Actual web app boots with ~70 parallel asset/migration revalidations; the default burst 429'd the tail and stalled every page load). 4. **Retry**: 2 attempts with 100ms delay on transient failures (5xx errors, connection errors). Additional middleware: @@ -515,11 +515,11 @@ Containerd on all K8s nodes uses `hosts.toml` to redirect pulls to the local cac ### Rate Limiter Blocks Legitimate Traffic -**Symptoms**: Users report 429 errors during normal usage (e.g., Immich uploads). +**Symptoms**: Users report 429 errors during normal usage (e.g., Immich uploads, ActualBudget's "Server returned an error while checking its status" boot screen). **Diagnosis**: Check Traefik middleware config for the affected IngressRoute. -**Fix**: Increase rate limit in `ingress_factory` module. Default is 100 req/min per IP. Immich and Nextcloud use 500 req/min. 
+**Fix**: Give the service a dedicated higher-limit middleware (don't loosen the shared default): define `<service>-rate-limit` in `stacks/traefik/modules/traefik/middleware.tf`, then set `skip_default_rate_limit = true` + `extra_middlewares = ["traefik-<service>-rate-limit@kubernetescrd"]` on its `ingress_factory` call. Shared default is average 10 req/s / burst 50; Immich uses 1000/20000, ActualBudget 50/300. ### Large Downloads or Uploads Truncate / Fail Partway diff --git a/stacks/actualbudget/factory/main.tf b/stacks/actualbudget/factory/main.tf index d458857d..3a7df141 100644 --- a/stacks/actualbudget/factory/main.tf +++ b/stacks/actualbudget/factory/main.tf @@ -183,6 +183,11 @@ module "ingress" { tls_secret_name = var.tls_secret_name dns_type = "proxied" extra_annotations = var.homepage_annotations + # Actual's app boot fires ~70 parallel asset/migration revalidations + # (max-age=0); the default 10/50 limiter 429s the tail and stalls every + # load. Dedicated higher-burst limiter, same pattern as Immich. + skip_default_rate_limit = true + extra_middlewares = ["traefik-actualbudget-rate-limit@kubernetescrd"] } diff --git a/stacks/traefik/modules/traefik/middleware.tf b/stacks/traefik/modules/traefik/middleware.tf index bd09b67e..ef34f991 100644 --- a/stacks/traefik/modules/traefik/middleware.tf +++ b/stacks/traefik/modules/traefik/middleware.tf @@ -294,6 +294,31 @@ resource "kubernetes_manifest" "middleware_immich_rate_limit" { depends_on = [helm_release.traefik] } +# ActualBudget-specific rate limit. The Actual web app boots with ~70 +# near-parallel requests (55 /data/migrations/*.sql + statics, all served +# max-age=0 so every load re-validates them); the default 10/50 limiter +# 429s the tail and stalls every page load with retry backoff (the +# "Server returned an error while checking its status" screen). Burst must +# absorb a few simultaneous device boots from one client IP. 
+resource "kubernetes_manifest" "middleware_actualbudget_rate_limit" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = "actualbudget-rate-limit" + namespace = kubernetes_namespace.traefik.metadata[0].name + } + spec = { + rateLimit = { + average = 50 + burst = 300 + } + } + } + + depends_on = [helm_release.traefik] +} + # Compress responses to clients at the entrypoint level (outermost). # Applied at websecure entrypoint so all responses get compressed. # Uses includedContentTypes (whitelist) instead of excludedContentTypes: