anubis: pull f1 off Anubis (XHR-vs-challenge collision) + add latency alerts
f1.viktorbarzin.me is a SPA whose JS fetches /schedule, /embed, /embed-asset, … on the same path tree. With Anubis fronting `/`, those XHRs land on the challenge HTML even when the cookie *should* be valid, breaking the page with `Unexpected token '<', "<!doctype " ... is not valid JSON`. Removed Anubis from f1 — it would need a path carve-out (the way wrongmove does for /api) to be re-enabled. Added a top-of-block comment so future me remembers why.

Plus four new Prometheus alerts in the `Slow Ingress Latency` group (stacks/monitoring/.../prometheus_chart_values.tpl):
- IngressTTFBHigh (warn, 10m, avg latency >1s)
- IngressTTFBCritical (crit, 5m, avg latency >3s)
- IngressErrorRate5xxHigh (crit, 5m, 5xx >5%)
- AnubisChallengeStoreErrors (crit, 5m, any 5xx on *anubis* services via Traefik — a proxy for the in-pod challenge-store error, since Anubis itself only exposes Go-runtime metrics)

Notes from the alert author: avg-not-p95 because the existing Prometheus scrape config drops traefik bucket series; once those are restored, swap to histogram_quantile(0.95). TraefikDown inhibit rule extended to suppress these four during a Traefik outage.
This commit is contained in:
parent
efd28ccce5
commit
b5f48e7b99
2 changed files with 71 additions and 11 deletions
|
|
@ -228,23 +228,18 @@ module "tls_secret" {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
module "anubis" {
|
# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
|
||||||
source = "../../modules/kubernetes/anubis_instance"
|
# /embed-asset, …) all on the same path tree, so putting Anubis in front
|
||||||
name = "f1"
|
# breaks XHR data fetches with "Unexpected token '<', '<!doctype '" — the
|
||||||
namespace = kubernetes_namespace.f1-stream.metadata[0].name
|
# challenge HTML lands where JSON is expected. Anubis is removed for f1
|
||||||
target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"
|
# until/unless we add a /api carve-out the way wrongmove does.
|
||||||
}
|
|
||||||
|
|
||||||
module "ingress" {
|
module "ingress" {
|
||||||
source = "../../modules/kubernetes/ingress_factory"
|
source = "../../modules/kubernetes/ingress_factory"
|
||||||
dns_type = "non-proxied"
|
dns_type = "non-proxied"
|
||||||
namespace = kubernetes_namespace.f1-stream.metadata[0].name
|
namespace = kubernetes_namespace.f1-stream.metadata[0].name
|
||||||
name = "f1"
|
name = "f1"
|
||||||
service_name = module.anubis.service_name
|
|
||||||
port = module.anubis.service_port
|
|
||||||
tls_secret_name = var.tls_secret_name
|
tls_secret_name = var.tls_secret_name
|
||||||
exclude_crowdsec = true
|
exclude_crowdsec = true
|
||||||
anti_ai_scraping = false
|
|
||||||
extra_annotations = {
|
extra_annotations = {
|
||||||
"gethomepage.dev/enabled" = "true"
|
"gethomepage.dev/enabled" = "true"
|
||||||
"gethomepage.dev/name" = "F1 Stream"
|
"gethomepage.dev/name" = "F1 Stream"
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,7 @@ alertmanager:
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = TraefikDown
|
- alertname = TraefikDown
|
||||||
target_matchers:
|
target_matchers:
|
||||||
- alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections"
|
- alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections|IngressTTFBHigh|IngressTTFBCritical|IngressErrorRate5xxHigh|AnubisChallengeStoreErrors"
|
||||||
# Traefik down makes ForwardAuth alerts redundant
|
# Traefik down makes ForwardAuth alerts redundant
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = TraefikDown
|
- alertname = TraefikDown
|
||||||
|
|
@ -1882,6 +1882,71 @@ serverFiles:
|
||||||
# summary: OpenWRT high memory usage. Can cause services getting stuck.
|
# summary: OpenWRT high memory usage. Can cause services getting stuck.
|
||||||
# MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
|
# MailServerDown, HackmdDown, PrivatebinDown moved to "Application Health" group
|
||||||
# New Tailscale client moved to "Infrastructure Health" group
|
# New Tailscale client moved to "Infrastructure Health" group
|
||||||
|
- name: "Slow Ingress Latency"
|
||||||
|
# Per-host slow-latency + Anubis-specific 5xx alerts. Sourced from
|
||||||
|
# `traefik_service_*` metrics scraped via `kubernetes-pods` (only fresh
|
||||||
|
# samples we have — `*_bucket` series are scraped but the `traefik`
|
||||||
|
# job's metric_relabel drops them, so `histogram_quantile` produces no
|
||||||
|
# samples). Once buckets are restored, replace the avg expressions with
|
||||||
|
# `histogram_quantile(0.95, ...)`. The `service` label format is
|
||||||
|
# `<ns>-<release>-<port>@kubernetes` and maps roughly 1:1 to a public
|
||||||
|
# host (e.g. `travel-blog-anubis-travel-8080@kubernetes`).
|
||||||
|
rules:
|
||||||
|
- alert: IngressTTFBHigh
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
|
||||||
|
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
|
||||||
|
) > 1
|
||||||
|
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
|
||||||
|
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 1s for 10m)"
|
||||||
|
- alert: IngressTTFBCritical
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
|
||||||
|
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service)
|
||||||
|
) > 3
|
||||||
|
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*nextcloud.*|.*immich.*",protocol!="websocket"}[5m])) by (service) > 0.05
|
||||||
|
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)"
|
||||||
|
- alert: IngressErrorRate5xxHigh
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service)
|
||||||
|
/ sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service)
|
||||||
|
* 100
|
||||||
|
) > 5
|
||||||
|
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
|
||||||
|
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)"
|
||||||
|
- alert: AnubisChallengeStoreErrors
|
||||||
|
# Anubis exposes only Go-runtime metrics on :9090 (no anubis_* /
|
||||||
|
# challenge_* counters), so we proxy via Traefik 5xx on services
|
||||||
|
# whose name contains `anubis`. Catches the "store: key not found"
|
||||||
|
# 500 we saw — every Anubis 5xx is suspicious because the only
|
||||||
|
# legitimate path through it is /.within.website/x/cmd/anubis or a
|
||||||
|
# redirect to the upstream, both 200/3xx in healthy operation.
|
||||||
|
expr: |
|
||||||
|
sum(rate(traefik_service_requests_total{service=~".*anubis.*",code=~"5.."}[5m])) by (service) > 0
|
||||||
|
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Anubis service {{ $labels.service }} returning 5xx ({{ $value | printf \"%.2f\" }} req/s) — likely challenge-store error"
|
||||||
- name: "Networking & Access"
|
- name: "Networking & Access"
|
||||||
rules:
|
rules:
|
||||||
- alert: CloudflaredDown
|
- alert: CloudflaredDown
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue