anubis: pull f1 off Anubis (XHR-vs-challenge collision) + add latency alerts

f1.viktorbarzin.me is a SPA whose JS fetches /schedule, /embed,
/embed-asset, … on the same path tree. With Anubis fronting `/`,
those XHRs land on the challenge HTML even when the cookie *should*
be valid, breaking the page with `Unexpected token '<', "<!doctype "
... is not valid JSON`. Removed Anubis from f1 — would need a path
carve-out (the way wrongmove does for /api) to re-enable. Added a
top-of-block comment so future me remembers why.
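
For the record, the rough shape of such a carve-out in plain Ingress
terms (a sketch only — ingress_factory would wire this differently, and
the service names/ports below are illustrative, not taken from the repo):

    # Sketch: route the JSON/data paths directly to the app so they skip
    # the Anubis challenge, and keep the catch-all behind Anubis. Most
    # controllers match the longest prefix first, so /schedule wins over /
    # for the XHR requests.
    apiVersion: networking.k8s.io/v1
    kind: Ingress
    metadata:
      name: f1                       # illustrative
      namespace: f1-stream
    spec:
      rules:
        - host: f1.viktorbarzin.me
          http:
            paths:
              - path: /schedule      # repeat for /embed, /embed-asset, ...
                pathType: Prefix
                backend:
                  service:
                    name: f1-stream  # upstream SPA service (name is a guess)
                    port:
                      number: 8080   # port is a guess
              - path: /
                pathType: Prefix
                backend:
                  service:
                    name: f1-anubis  # Anubis proxy service (name is a guess)
                    port:
                      number: 8080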

Plus four new Prometheus alerts in the `Slow Ingress Latency` group
(stacks/monitoring/.../prometheus_chart_values.tpl):

- IngressTTFBHigh         (warn, 10m, avg latency >1s)
- IngressTTFBCritical     (crit, 5m,  avg latency >3s)
- IngressErrorRate5xxHigh (crit, 5m,  5xx >5%)
- AnubisChallengeStoreErrors (crit, 5m, any 5xx on *anubis* services
  via Traefik — proxies for the in-pod challenge-store error since
  Anubis itself only exposes Go-runtime metrics)

Notes from the alert author: the expressions use avg rather than p95
because the existing Prometheus scrape config drops the traefik bucket
series; once those are restored, swap to histogram_quantile(0.95). The
TraefikDown inhibit rule is extended to suppress these four during a
Traefik outage.
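
For reference, the p95 swap would look roughly like this once the bucket
series are scraped again (a sketch with the label filters elided, not
validated against live data):

    # Hypothetical replacement for the avg(sum/count) expression in
    # IngressTTFBHigh, once traefik_service_request_duration_seconds_bucket
    # survives the scrape config again.
    - alert: IngressTTFBHigh
      expr: |
        histogram_quantile(0.95,
          sum(rate(traefik_service_request_duration_seconds_bucket{protocol!="websocket"}[5m])) by (service, le)
        ) > 1
      for: 10m
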
Viktor Barzin 2026-05-10 01:01:52 +00:00
parent 8197842646
commit a89d4a7d2a
2 changed files with 71 additions and 11 deletions

@@ -228,23 +228,18 @@ module "tls_secret" {
}
module "anubis" {
source = "../../modules/kubernetes/anubis_instance"
name = "f1"
namespace = kubernetes_namespace.f1-stream.metadata[0].name
target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"
}
# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
# /embed-asset, …) all on the same path tree, so putting Anubis in front
# breaks XHR data fetches with "Unexpected token '<', '<!doctype '": the
# challenge HTML lands where JSON is expected. Anubis is removed for f1
# until/unless we add a /api carve-out the way wrongmove does.
module "ingress" {
source = "../../modules/kubernetes/ingress_factory"
dns_type = "non-proxied"
namespace = kubernetes_namespace.f1-stream.metadata[0].name
name = "f1"
service_name = module.anubis.service_name
port = module.anubis.service_port
tls_secret_name = var.tls_secret_name
exclude_crowdsec = true
anti_ai_scraping = false
extra_annotations = {
"gethomepage.dev/enabled" = "true"
"gethomepage.dev/name" = "F1 Stream"

@@ -83,7 +83,7 @@ alertmanager:
- source_matchers:
- alertname = TraefikDown
target_matchers:
- alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections"
- alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections|IngressTTFBHigh|IngressTTFBCritical|IngressErrorRate5xxHigh|AnubisChallengeStoreErrors"
# Traefik down makes ForwardAuth alerts redundant
- source_matchers:
- alertname = TraefikDown
@@ -1882,6 +1882,71 @@ serverFiles:
# summary: OpenWRT high memory usage. Can cause services getting stuck.
# MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
# New Tailscale client moved to "Infrastructure Health" group
- name: "Slow Ingress Latency"
# Per-host slow-latency + Anubis-specific 5xx alerts. Sourced from
# `traefik_service_*` metrics scraped via `kubernetes-pods` (only fresh
# samples we have — `*_bucket` series are scraped but the `traefik`
# job's metric_relabel drops them, so `histogram_quantile` produces no
# samples). Once buckets are restored, replace the avg expressions with
# `histogram_quantile(0.95, ...)`. The `service` label format is
# `<ns>-<release>-<port>@kubernetes` and maps roughly 1:1 to a public
# host (e.g. `travel-blog-anubis-travel-8080@kubernetes`).
rules:
- alert: IngressTTFBHigh
expr: |
(
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
) > 1
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 10m
labels:
severity: warning
annotations:
summary: "Slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 1s for 10m)"
- alert: IngressTTFBCritical
expr: |
(
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
) > 3
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m
labels:
severity: critical
annotations:
summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)"
- alert: IngressErrorRate5xxHigh
expr: |
(
sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service)
/ sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service)
* 100
) > 5
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m
labels:
severity: critical
annotations:
summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)"
- alert: AnubisChallengeStoreErrors
# Anubis exposes only Go-runtime metrics on :9090 (no anubis_* /
# challenge_* counters), so we proxy via Traefik 5xx on services
# whose name contains `anubis`. Catches the "store: key not found"
# 500 we saw — every Anubis 5xx is suspicious because the only
# legitimate path through it is /.within.website/x/cmd/anubis or a
# redirect to the upstream, both 200/3xx in healthy operation.
expr: |
sum(rate(traefik_service_requests_total{service=~".*anubis.*",code=~"5.."}[5m])) by (service) > 0
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m
labels:
severity: critical
annotations:
summary: "Anubis service {{ $labels.service }} returning 5xx ({{ $value | printf \"%.2f\" }} req/s) — likely challenge-store error"
- name: "Networking & Access"
rules:
- alert: CloudflaredDown