anubis: pull f1 off Anubis (XHR-vs-challenge collision) + add latency alerts
f1.viktorbarzin.me is a SPA whose JS fetches /schedule, /embed, /embed-asset, … on the same path tree. With Anubis fronting `/`, those XHRs land on the challenge HTML even when the cookie *should* be valid, breaking the page with `Unexpected token '<', "<!doctype " ... is not valid JSON`. Removed Anubis from f1; re-enabling it would need a path carve-out (the way wrongmove does for /api). Added a top-of-block comment so future me remembers why.

Also adds four new Prometheus alerts in the `Slow Ingress Latency` group (stacks/monitoring/.../prometheus_chart_values.tpl):

- IngressTTFBHigh (warning, 10m, avg latency > 1s)
- IngressTTFBCritical (critical, 5m, avg latency > 3s)
- IngressErrorRate5xxHigh (critical, 5m, 5xx > 5%)
- AnubisChallengeStoreErrors (critical, 5m, any 5xx on *anubis* services via Traefik; a proxy for the in-pod challenge-store error, since Anubis itself only exposes Go-runtime metrics)

Notes from the alert author: the expressions use avg rather than p95 because the existing Prometheus scrape config drops the traefik bucket series; once those are restored, swap to histogram_quantile(0.95) (sketched below). The TraefikDown inhibit rule is extended to suppress these four during a Traefik outage.
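For reference, the p95 form would look roughly like the sketch below once the traefik job keeps the `*_bucket` series again. This is untested against the current scrape config; it just reuses the same label filters as the committed avg rules:

```promql
# Hypothetical p95 replacement for the avg-latency expression. Requires
# traefik_service_request_duration_seconds_bucket samples, which the current
# metric_relabel config drops.
histogram_quantile(0.95,
  sum(rate(traefik_service_request_duration_seconds_bucket{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service, le)
) > 1
```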
This commit is contained in:

parent 8197842646
commit a89d4a7d2a

2 changed files with 71 additions and 11 deletions
@@ -228,23 +228,18 @@ module "tls_secret" {
 }
 
-module "anubis" {
-  source     = "../../modules/kubernetes/anubis_instance"
-  name       = "f1"
-  namespace  = kubernetes_namespace.f1-stream.metadata[0].name
-  target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"
-}
-
+# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
+# /embed-asset, …) all on the same path tree, so putting Anubis in front
+# breaks XHR data fetches with "Unexpected token '<', '<!doctype '" — the
+# challenge HTML lands where JSON is expected. Anubis is removed for f1
+# until/unless we add a /api carve-out the way wrongmove does.
 module "ingress" {
   source            = "../../modules/kubernetes/ingress_factory"
   dns_type          = "non-proxied"
   namespace         = kubernetes_namespace.f1-stream.metadata[0].name
   name              = "f1"
-  service_name      = module.anubis.service_name
-  port              = module.anubis.service_port
   tls_secret_name   = var.tls_secret_name
   exclude_crowdsec  = true
   anti_ai_scraping  = false
   extra_annotations = {
     "gethomepage.dev/enabled" = "true"
     "gethomepage.dev/name"    = "F1 Stream"
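To make the NOTE above actionable: a wrongmove-style carve-out would mean a second ingress on the same host that routes the JSON endpoints straight to the app while `/` stays behind Anubis. The sketch below is purely illustrative; `ingress_factory` is not known to take a path argument today, so `paths` is an invented input and the port is assumed:

```hcl
# Hypothetical carve-out sketch only — NOT the current ingress_factory API.
module "ingress_data" {
  source          = "../../modules/kubernetes/ingress_factory"
  dns_type        = "non-proxied"
  namespace       = kubernetes_namespace.f1-stream.metadata[0].name
  name            = "f1-data"
  paths           = ["/schedule", "/embed", "/embed-asset"] # invented input: XHR endpoints bypass Anubis
  service_name    = kubernetes_service.f1-stream.metadata[0].name
  port            = 80 # assumed app port
  tls_secret_name = var.tls_secret_name
}
```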
@@ -83,7 +83,7 @@ alertmanager:
     - source_matchers:
         - alertname = TraefikDown
       target_matchers:
-        - alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections"
+        - alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections|IngressTTFBHigh|IngressTTFBCritical|IngressErrorRate5xxHigh|AnubisChallengeStoreErrors"
     # Traefik down makes ForwardAuth alerts redundant
     - source_matchers:
         - alertname = TraefikDown
@@ -1882,6 +1882,71 @@ serverFiles:
       # summary: OpenWRT high memory usage. Can cause services getting stuck.
       # MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
       # New Tailscale client moved to "Infrastructure Health" group
+      - name: "Slow Ingress Latency"
+        # Per-host slow-latency + Anubis-specific 5xx alerts. Sourced from
+        # `traefik_service_*` metrics scraped via `kubernetes-pods` (only fresh
+        # samples we have — `*_bucket` series are scraped but the `traefik`
+        # job's metric_relabel drops them, so `histogram_quantile` produces no
+        # samples). Once buckets are restored, replace the avg expressions with
+        # `histogram_quantile(0.95, ...)`. The `service` label format is
+        # `<ns>-<release>-<port>@kubernetes` and maps roughly 1:1 to a public
+        # host (e.g. `travel-blog-anubis-travel-8080@kubernetes`).
+        rules:
+          - alert: IngressTTFBHigh
+            expr: |
+              (
+                sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+                / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+              ) > 1
+              and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 1s for 10m)"
+          - alert: IngressTTFBCritical
+            expr: |
+              (
+                sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+                / sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
+              ) > 3
+              and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)"
+          - alert: IngressErrorRate5xxHigh
+            expr: |
+              (
+                sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service)
+                / sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service)
+                * 100
+              ) > 5
+              and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)"
+          - alert: AnubisChallengeStoreErrors
+            # Anubis exposes only Go-runtime metrics on :9090 (no anubis_* /
+            # challenge_* counters), so we proxy via Traefik 5xx on services
+            # whose name contains `anubis`. Catches the "store: key not found"
+            # 500 we saw — every Anubis 5xx is suspicious because the only
+            # legitimate path through it is /.within.website/x/cmd/anubis or a
+            # redirect to the upstream, both 200/3xx in healthy operation.
+            expr: |
+              sum(rate(traefik_service_requests_total{service=~".*anubis.*",code=~"5.."}[5m])) by (service) > 0
+              and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Anubis service {{ $labels.service }} returning 5xx ({{ $value | printf \"%.2f\" }} req/s) — likely challenge-store error"
       - name: "Networking & Access"
         rules:
           - alert: CloudflaredDown
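Two reading notes on the group above. The `and on() (time() - process_start_time_seconds{job="prometheus"}) > 900` clause in every expression simply mutes the alerts for the first 15 minutes after a Prometheus restart, while the 5m rates are still warming up. And when AnubisChallengeStoreErrors fires, splitting the same counter by status code is a quick first triage step (same Traefik metrics, just grouped differently):

```promql
# Which anubis-fronted services are erroring, and with which status codes?
sum(rate(traefik_service_requests_total{service=~".*anubis.*", code=~"5.."}[5m])) by (service, code)
```

Pairing that with the Anubis pod's logs should confirm whether it is the "store: key not found" failure mentioned above or something upstream.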