diff --git a/docs/architecture/networking.md b/docs/architecture/networking.md index b93df195..8d383d32 100644 --- a/docs/architecture/networking.md +++ b/docs/architecture/networking.md @@ -261,7 +261,7 @@ Traefik chain: 1. **Anti-AI bot-block** (`ai-bot-block` ForwardAuth, on by default via `ingress_factory`): blocks/tarpits known AI crawlers. **Fail-open** (currently a no-op `return 200` — poison-fountain scaled to 0; see `docs/architecture/security.md`). 2. **Authentik Forward-Auth** (if `protected = true`): SSO authentication via OIDC. Non-authenticated users are redirected to login. Auth headers are stripped before forwarding to backend. -3. **Rate Limiting**: Per-IP throttling. Returns **429 Too Many Requests** (not 503) when limit exceeded. Default is `rate-limit` (average 10 req/s, burst 50). Services whose clients legitimately burst harder get a dedicated middleware via `skip_default_rate_limit = true` + `extra_middlewares`: Immich (`immich-rate-limit`, 1000/20000, photo uploads), ActualBudget (`actualbudget-rate-limit`, 50/300 — the Actual web app boots with ~70 parallel asset/migration revalidations; the default burst 429'd the tail and stalled every page load), and authentik (`authentik-rate-limit`, 100/1000, on `/` and `/static` — the login SPA cold-loads ~70 flow-executor JS/CSS chunks from `/static`; the default burst 429'd the tail and a failed ES-module import left a blank login screen for cold/incognito/NAT-shared clients). +3. **Rate Limiting**: Per-IP throttling. Returns **429 Too Many Requests** (not 503) when limit exceeded. Default is `rate-limit` (average 10 req/s, burst 50). 
Services whose clients legitimately burst harder get a dedicated middleware via `skip_default_rate_limit = true` + `extra_middlewares`: Immich (`immich-rate-limit`, 1000/20000, photo uploads), ActualBudget (`actualbudget-rate-limit`, 50/300 — the Actual web app boots with ~70 parallel asset/migration revalidations; the default burst 429'd the tail and stalled every page load), authentik (`authentik-rate-limit`, 100/1000, on `/` and `/static` — the login SPA cold-loads ~70 flow-executor JS/CSS chunks from `/static`; the default burst 429'd the tail and a failed ES-module import left a blank login screen for cold/incognito/NAT-shared clients), tripit (`tripit-rate-limit`, 100/1000, photo-tab thumbnail bursts), health (`health-rate-limit`, 100/1000, SPA shell + API burst per page), and dawarich (`dawarich-rate-limit`, 100/1000 — the Rails app self-serves all fingerprinted assets and the map adds an API burst per load; the default burst 429'd the asset tail and risked dropping OwnTracks/mobile location POSTs on the same host). 4. **Retry**: 2 attempts with 100ms delay on transient failures (5xx errors, connection errors). Additional middleware: @@ -552,7 +552,7 @@ chain — a CrowdSec/LAPI outage cannot cause 503s; it only stops new bans.) Che **Diagnosis**: Check Traefik middleware config for the affected IngressRoute. -**Fix**: Give the service a dedicated higher-limit middleware (don't loosen the shared default): define `-rate-limit` in `stacks/traefik/modules/traefik/middleware.tf`, then set `skip_default_rate_limit = true` + `extra_middlewares = ["traefik--rate-limit@kubernetescrd"]` on its `ingress_factory` call. Shared default is average 10 req/s / burst 50; Immich uses 1000/20000, ActualBudget 50/300, authentik 100/1000 (login SPA `/static` chunk burst → blank screen). 
+**Fix**: Give the service a dedicated higher-limit middleware (don't loosen the shared default): define `<service>-rate-limit` in `stacks/traefik/modules/traefik/middleware.tf`, then set `skip_default_rate_limit = true` + `extra_middlewares = ["traefik-<service>-rate-limit@kubernetescrd"]` on its `ingress_factory` call. Shared default is average 10 req/s / burst 50; Immich uses 1000/20000, ActualBudget 50/300, and tripit/health/authentik/dawarich each 100/1000 (SPA or asset-heavy page loads bursting past the default from one client IP). ### Large Downloads or Uploads Truncate / Fail Partway diff --git a/stacks/dawarich/main.tf b/stacks/dawarich/main.tf index 3eeb1540..1d2d1f81 100644 --- a/stacks/dawarich/main.tf +++ b/stacks/dawarich/main.tf @@ -16,7 +16,7 @@ resource "kubernetes_namespace" "dawarich" { name = "dawarich" labels = { "istio-injection" : "disabled" - tier = local.tiers.edge + tier = local.tiers.edge "keel.sh/enrolled" = "true" } } @@ -330,7 +330,7 @@ resource "kubernetes_deployment" "dawarich" { } lifecycle { ignore_changes = [ - spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates metadata[0].annotations["keel.sh/policy"], metadata[0].annotations["keel.sh/trigger"], @@ -458,6 +458,13 @@ module "ingress" { namespace = kubernetes_namespace.dawarich.metadata[0].name name = "dawarich" tls_secret_name = var.tls_secret_name + # Rails serves all its fingerprinted assets itself and the map view adds an + # API burst per page load — the default 10/50 limiter 429s the asset tail + # from a single client IP (and risks dropping OwnTracks/mobile ingestion + # POSTs on the same host). Dedicated 100/1000 limiter defined in + # stacks/traefik/modules/traefik/middleware.tf. 
+ skip_default_rate_limit = true + extra_middlewares = ["traefik-dawarich-rate-limit@kubernetescrd"] extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "Dawarich" diff --git a/stacks/traefik/modules/traefik/middleware.tf b/stacks/traefik/modules/traefik/middleware.tf index 11f9d57b..08e7daa4 100644 --- a/stacks/traefik/modules/traefik/middleware.tf +++ b/stacks/traefik/modules/traefik/middleware.tf @@ -368,6 +368,33 @@ resource "kubernetes_manifest" "middleware_authentik_rate_limit" { depends_on = [helm_release.traefik] } +# Dawarich-specific rate limit. The Rails app serves all its fingerprinted +# assets itself (JS/CSS chunks, SVG store badges, favicons, webmanifest) and +# the map view adds a points/API burst on load — a single page load from one +# client IP blows past the default 10/50 limiter and 429s the asset tail +# (seventh instance of the burst pattern, after ha-sofia, ActualBudget, noVNC, +# tripit, health and authentik). Background location ingestion (OwnTracks +# bridge + mobile api_key POSTs) rides the same host, so 429s here also risk +# dropped pings. Burst absorbs a couple of full page loads back-to-back. +resource "kubernetes_manifest" "middleware_dawarich_rate_limit" { + manifest = { + apiVersion = "traefik.io/v1alpha1" + kind = "Middleware" + metadata = { + name = "dawarich-rate-limit" + namespace = kubernetes_namespace.traefik.metadata[0].name + } + spec = { + rateLimit = { + average = 100 + burst = 1000 + } + } + } + + depends_on = [helm_release.traefik] +} + # Compress responses to clients at the entrypoint level (outermost). # Applied at websecure entrypoint so all responses get compressed. # Uses includedContentTypes (whitelist) instead of excludedContentTypes: