From a89d4a7d2ae31e29c8186428474f50a8fc856800 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 01:01:52 +0000 Subject: [PATCH] anubis: pull f1 off Anubis (XHR-vs-challenge collision) + add latency alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit f1.viktorbarzin.me is a SPA whose JS fetches /schedule, /embed, /embed-asset, … on the same path tree. With Anubis fronting `/`, those XHRs land on the challenge HTML even when the cookie *should* be valid, breaking the page with `Unexpected token '<', "<!DOCTYPE "... is not valid JSON` errors. Fix: drop the anubis module from f1-stream and point the ingress straight at the service. New Traefik-derived alerts: - IngressTTFBHigh (warn, 10m, avg latency >1s) - IngressTTFBCritical (crit, 5m, avg latency >3s) - IngressErrorRate5xxHigh (crit, 5m, 5xx >5%) - AnubisChallengeStoreErrors (crit, 5m, any 5xx on *anubis* services via Traefik — proxies for the in-pod challenge-store error since Anubis itself only exposes Go-runtime metrics) Notes from the alert author: avg-not-p95 because the existing Prometheus scrape config drops traefik bucket series; once those are restored, swap to histogram_quantile(0.95). TraefikDown inhibit rule extended to suppress these four during a Traefik outage. 
--- stacks/f1-stream/main.tf | 15 ++--- .../monitoring/prometheus_chart_values.tpl | 67 ++++++++++++++++++- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index c2f5edaf..042e3b51 100644 --- a/stacks/f1-stream/main.tf +++ b/stacks/f1-stream/main.tf @@ -228,23 +228,18 @@ module "tls_secret" { } -module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "f1" - namespace = kubernetes_namespace.f1-stream.metadata[0].name - target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local" -} - +# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed, +# /embed-asset, …) all on the same path tree, so putting Anubis in front +# breaks XHR data fetches with "Unexpected token '<'" JSON parse errors; +# expose the service directly instead. [NOTE(review): patch corrupted here — the remainder of this f1-stream hunk and the diff/hunk header for .../monitoring/prometheus_chart_values.tpl (including the new alert group's `- name:` line) were lost in transit; reconstruct from the repository before applying] + # Each Traefik `service` label ends in `@kubernetes` and maps roughly 1:1 to a public + # host (e.g. `travel-blog-anubis-travel-8080@kubernetes`). + rules: + - alert: IngressTTFBHigh + expr: | + ( + sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) + / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) + ) > 1 + and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05 + and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 10m + labels: + severity: warning + annotations: + summary: "Slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 1s for 10m)" + - alert: IngressTTFBCritical + expr: | + ( + sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) + / 
sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) + ) > 3 + and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05 + and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 5m + labels: + severity: critical + annotations: + summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)" + - alert: IngressErrorRate5xxHigh + expr: | + ( + sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service) + / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) + * 100 + ) > 5 + and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1 + and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 5m + labels: + severity: critical + annotations: + summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)" + - alert: AnubisChallengeStoreErrors + # Anubis exposes only Go-runtime metrics on :9090 (no anubis_* / + # challenge_* counters), so we proxy via Traefik 5xx on services + # whose name contains `anubis`. Catches the "store: key not found" + # 500 we saw — every Anubis 5xx is suspicious because the only + # legitimate path through it is /.within.website/x/cmd/anubis or a + # redirect to the upstream, both 200/3xx in healthy operation. 
+ expr: | + sum(rate(traefik_service_requests_total{service=~".*anubis.*",code=~"5.."}[5m])) by (service) > 0 + and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 5m + labels: + severity: critical + annotations: + summary: "Anubis service {{ $labels.service }} returning 5xx ({{ $value | printf \"%.2f\" }} req/s) — likely challenge-store error" - name: "Networking & Access" rules: - alert: CloudflaredDown