anubis: pull f1 off Anubis (XHR-vs-challenge collision) + add latency alerts
f1.viktorbarzin.me is a SPA whose JS fetches /schedule, /embed, /embed-asset, … on the same path tree. With Anubis fronting `/`, those XHRs land on the challenge HTML even when the cookie *should* be valid, breaking the page with `Unexpected token '<', "<!doctype " ... is not valid JSON`. Removed Anubis from f1; re-enabling it would need a path carve-out (the way wrongmove does for /api). Added a top-of-block comment so future me remembers why.

Also adds four new Prometheus alerts in the `Slow Ingress Latency` group (stacks/monitoring/.../prometheus_chart_values.tpl):

- IngressTTFBHigh (warning, 10m, avg latency > 1s)
- IngressTTFBCritical (critical, 5m, avg latency > 3s)
- IngressErrorRate5xxHigh (critical, 5m, 5xx > 5%)
- AnubisChallengeStoreErrors (critical, 5m, any 5xx on *anubis* services via Traefik; a proxy for the in-pod challenge-store error, since Anubis itself only exposes Go-runtime metrics)

Notes from the alert author: the expressions use avg rather than p95 because the existing Prometheus scrape config drops the traefik bucket series; once those are restored, swap to histogram_quantile(0.95) (sketched below). The TraefikDown inhibit rule is extended to suppress these four during a Traefik outage.
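For reference, the p95 form would look roughly like the sketch below once the traefik job keeps the `*_bucket` series again. This is untested against the current scrape config; it just reuses the same label filters as the committed avg rules:

```promql
# Hypothetical p95 replacement for the avg-latency expression. Requires
# traefik_service_request_duration_seconds_bucket samples, which the current
# metric_relabel config drops.
histogram_quantile(0.95,
  sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service, le)
) > 1
```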
This commit is contained in:

parent 8197842646
commit a89d4a7d2a

2 changed files with 71 additions and 11 deletions
@@ -228,23 +228,18 @@ module "tls_secret" {
 }
 
-module "anubis" {
-  source     = "../../modules/kubernetes/anubis_instance"
-  name       = "f1"
-  namespace  = kubernetes_namespace.f1-stream.metadata[0].name
-  target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"
-}
-
+# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
+# /embed-asset, …) all on the same path tree, so putting Anubis in front
+# breaks XHR data fetches with "Unexpected token '<', '<!doctype '" — the
+# challenge HTML lands where JSON is expected. Anubis is removed for f1
+# until/unless we add a /api carve-out the way wrongmove does.
 module "ingress" {
   source            = "../../modules/kubernetes/ingress_factory"
   dns_type          = "non-proxied"
   namespace         = kubernetes_namespace.f1-stream.metadata[0].name
   name              = "f1"
-  service_name      = module.anubis.service_name
-  port              = module.anubis.service_port
   tls_secret_name   = var.tls_secret_name
   exclude_crowdsec  = true
   anti_ai_scraping  = false
   extra_annotations = {
     "gethomepage.dev/enabled" = "true"
     "gethomepage.dev/name"    = "F1 Stream"
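To make the NOTE above actionable: a wrongmove-style carve-out would mean a second ingress on the same host that routes the JSON endpoints straight to the app while `/` stays behind Anubis. The sketch below is purely illustrative; `ingress_factory` is not known to take a path argument today, so `paths` is an invented input and the port is assumed:

```hcl
# Hypothetical carve-out sketch only — NOT the current ingress_factory API.
module "ingress_data" {
  source          = "../../modules/kubernetes/ingress_factory"
  dns_type        = "non-proxied"
  namespace       = kubernetes_namespace.f1-stream.metadata[0].name
  name            = "f1-data"
  paths           = ["/schedule", "/embed", "/embed-asset"] # invented input: XHR endpoints bypass Anubis
  service_name    = kubernetes_service.f1-stream.metadata[0].name
  port            = 80 # assumed app port
  tls_secret_name = var.tls_secret_name
}
```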
@@ -83,7 +83,7 @@ alertmanager:
     - source_matchers:
         - alertname = TraefikDown
       target_matchers:
-        - alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections"
+        - alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections|IngressTTFBHigh|IngressTTFBCritical|IngressErrorRate5xxHigh|AnubisChallengeStoreErrors"
     # Traefik down makes ForwardAuth alerts redundant
     - source_matchers:
         - alertname = TraefikDown
@@ -1882,6 +1882,71 @@ serverFiles:
       # summary: OpenWRT high memory usage. Can cause services getting stuck.
       # MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
       # New Tailscale client moved to "Infrastructure Health" group
+      - name: "Slow Ingress Latency"
+        # Per-host slow-latency + Anubis-specific 5xx alerts. Sourced from
+        # `traefik_service_*` metrics scraped via `kubernetes-pods` (only fresh
+        # samples we have — `*_bucket` series are scraped but the `traefik`
+        # job's metric_relabel drops them, so `histogram_quantile` produces no
+        # samples). Once buckets are restored, replace the avg expressions with
+        # `histogram_quantile(0.95, ...)`. The `service` label format is
+        # `<ns>-<release>-<port>@kubernetes` and maps roughly 1:1 to a public
+        # host (e.g. `travel-blog-anubis-travel-8080@kubernetes`).
+        rules:
+          - alert: IngressTTFBHigh
+            expr: |
+              (
+                sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+                / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+              ) > 1
+              and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 1s for 10m)"
+          - alert: IngressTTFBCritical
+            expr: |
+              (
+                sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+                / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+              ) > 3
+              and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)"
+          - alert: IngressErrorRate5xxHigh
+            expr: |
+              (
+                sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service)
+                / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service)
+                * 100
+              ) > 5
+              and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)"
+          - alert: AnubisChallengeStoreErrors
+            # Anubis exposes only Go-runtime metrics on :9090 (no anubis_* /
+            # challenge_* counters), so we proxy via Traefik 5xx on services
+            # whose name contains `anubis`. Catches the "store: key not found"
+            # 500 we saw — every Anubis 5xx is suspicious because the only
+            # legitimate path through it is /.within.website/x/cmd/anubis or a
+            # redirect to the upstream, both 200/3xx in healthy operation.
+            expr: |
+              sum(rate(traefik_service_requests_total{service=~".*anubis.*",code=~"5.."}[5m])) by (service) > 0
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Anubis service {{ $labels.service }} returning 5xx ({{ $value | printf \"%.2f\" }} req/s) — likely challenge-store error"
       - name: "Networking & Access"
         rules:
           - alert: CloudflaredDown
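Two reading notes on the group above. The `and on() (time() - process_start_time_seconds{job="prometheus"}) > 900` clause in every expression simply mutes the alerts for the first 15 minutes after a Prometheus restart, while the 5m rates are still warming up. And when AnubisChallengeStoreErrors fires, splitting the same counter by status code is a quick first triage step (same Traefik metrics, just grouped differently):

```promql
# Which anubis-fronted services are erroring, and with which status codes?
sum(rate(traefik_service_requests_total{service=~".*anubis.*", code=~"5.."}[5m])) by (service, code)
```

Pairing that with the Anubis pod's logs should confirm whether it is the "store: key not found" failure mentioned above or something upstream.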