aiostreams: 1h stream cache + canary stream-count probe + 3 alerts

Hardening pass following the empty-stream-list incident:

1. STREAM_CACHE_TTL=3600 — re-enables stream payload cache (was -1 /
   disabled). Default behaviour hit all 5 upstream addons on every
   Stremio request; with a 1h TTL repeat requests for the same title
   are instant, while RD cache invalidations still propagate quickly.

2. aiostreams-stream-probe CronJob (every 5 min): fetches the user's
   encryptedPassword via the internal ClusterIP, runs a canary stream
   search for Breaking Bad S01E01, pushes streams_count + probe_success
   to Pushgateway. Uses an ExternalSecret pulling UUID + password from
   Vault secret/viktor. Same pattern as email-roundtrip-monitor.

3. Three alerts in monitoring's prometheus_chart_values.tpl:
   - AIOStreamsStreamCountLow  (< 50 streams for 30m)
   - AIOStreamsProbeFailing    (probe_success == 0 for 30m)
   - AIOStreamsProbeStale      (last probe run older than 30 min, sustained for 10m)

Verified: probe returned streams=411 success=1 on first run; all 3
alerts loaded into Prometheus with state=inactive health=ok.
This commit is contained in:
Viktor Barzin 2026-05-15 21:38:50 +00:00
parent fba5ee2df4
commit 95b9f7bc89
2 changed files with 153 additions and 0 deletions

View file

@ -2298,6 +2298,30 @@ serverFiles:
severity: warning
annotations:
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
# Canary stream-count alert. aiostreams_stream_count is pushed to Pushgateway
# by the aiostreams-stream-probe CronJob; the 50 here mirrors THRESHOLD in the
# probe script. The probe also pushes a count of 0 when it errors out, so this
# alert fires on probe errors as well as on genuinely short stream lists.
- alert: AIOStreamsStreamCountLow
  expr: aiostreams_stream_count{job="aiostreams-stream-probe"} < 50
  # 30m = several consecutive 5-minute probe runs must all be low.
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams returning <50 streams for the canary title for 30m"
    description: "Probe for Breaking Bad S01E01 returned {{ $value }} streams. Could indicate an upstream addon outage, RD filter expansion, or a regression in the user's preset filters. Check `kubectl -n aiostreams get cronjob aiostreams-stream-probe` and the most recent job pod logs."
# Probe health alert. The probe pushes aiostreams_probe_success = 1 only when
# the canary search completes and returns at least 50 streams; it pushes 0 on
# any exception (user fetch or stream search) or on a lower stream count.
- alert: AIOStreamsProbeFailing
  expr: aiostreams_probe_success{job="aiostreams-stream-probe"} == 0
  # 30m = several consecutive 5-minute probe runs must all have failed.
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams stream-probe failing for 30m"
    description: "The /api/v1/user fetch or stream search is returning errors, or stream count is below threshold. Check probe logs."
# Staleness alert for the probe itself. Fires when the last pushed run
# timestamp is older than 30 minutes (1800 s), or when the series does not
# exist at all. The absent() arm matters because `time() - <metric>` yields no
# samples when the metric is missing (probe never managed to push, or
# Pushgateway lost its state), which would otherwise leave all three
# AIOStreams alerts silent.
- alert: AIOStreamsProbeStale
  expr: (time() - aiostreams_probe_last_run_timestamp{job="aiostreams-stream-probe"}) > 1800 or absent(aiostreams_probe_last_run_timestamp{job="aiostreams-stream-probe"})
  # 10m grace: a Pushgateway restart is healed by the next 5-minute probe run
  # well before this fires.
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams stream-probe hasn't run in >30 min or has never reported"
    description: "CronJob may be unschedulable or failing before pushgateway POST."
- alert: ClaudeOAuthTokenExpiringSoon
expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (30 * 86400)
for: 1h

View file

@ -93,6 +93,15 @@ resource "kubernetes_deployment" "aiostreams" {
name = "DATABASE_URI"
value = var.aiostreams_database_connection_string
}
env {
  # Cache stream-response payloads for 1h (3600 s). Default is -1 (disabled),
  # which made every Stremio request hit all 5 upstream addons live. That
  # was slow, and it contributed to the perceived empty-list issue when an
  # upstream was slow/erroring. 1h is short enough that RD cache
  # invalidations are picked up quickly.
  name = "STREAM_CACHE_TTL"
  value = "3600"
}
volume_mount {
name = "data"
mount_path = "/app/data"
@ -143,6 +152,126 @@ resource "kubernetes_service" "aiostreams" {
}
}
# ExternalSecret that materialises the probe's credentials as the Kubernetes
# Secret "aiostreams-probe-secrets". Both values are read from the "viktor"
# entry of the vault-kv ClusterSecretStore and re-synced every 15 minutes.
resource "kubernetes_manifest" "probe_secrets" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "aiostreams-probe-secrets"
      namespace = kubernetes_namespace.aiostreams.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      target = {
        name = "aiostreams-probe-secrets"
      }

      # Secret keys double as the env var names the probe container reads.
      data = [
        {
          secretKey = "AIOSTREAMS_UUID"
          remoteRef = {
            key      = "viktor"
            property = "aiostreams_uuid"
          }
        },
        {
          secretKey = "AIOSTREAMS_PASSWORD"
          remoteRef = {
            key      = "viktor"
            property = "aiostreams_password"
          }
        },
      ]
    }
  }

  depends_on = [kubernetes_namespace.aiostreams]
}
# Canary probe for AIOStreams. Every 5 minutes it:
#   1. fetches the user's encryptedPassword from the in-cluster service,
#   2. runs a stream search for a fixed canary title,
#   3. pushes stream count / success / duration / last-run timestamp to
#      Pushgateway under job "aiostreams-stream-probe".
# The job exits non-zero when the probe is unsuccessful so failures are also
# visible as failed Jobs.
#
# The script uses only the Python standard library. The previous version ran
# an unpinned `pip install requests` on every invocation, which (a) made the
# probe depend on PyPI/egress being reachable, (b) executed unpinned
# third-party code in a pod holding the user's credentials, and (c) printed
# requests' exception text, which embeds the full request URL (including the
# password query string) into the pod logs. urllib's error messages do not
# include the URL.
resource "kubernetes_cron_job_v1" "stream_probe" {
  metadata {
    name      = "aiostreams-stream-probe"
    namespace = kubernetes_namespace.aiostreams.metadata[0].name
  }
  spec {
    schedule                      = "*/5 * * * *"
    concurrency_policy            = "Replace"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            restart_policy = "Never"
            container {
              name  = "probe"
              image = "docker.io/library/python:3.12-alpine"
              # Invoked without a shell so the script needs no shell quoting.
              # NOTE: this is a Terraform heredoc; keep dollar-brace and
              # percent-brace sequences out of the script.
              command = ["python3", "-c", <<-EOT
                import json, os, sys, time, urllib.parse, urllib.request

                BASE = "http://aiostreams.aiostreams.svc.cluster.local"
                PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/aiostreams-stream-probe"
                UUID = os.environ["AIOSTREAMS_UUID"]
                PW = os.environ["AIOSTREAMS_PASSWORD"]
                TEST_ID = "tt0903747:1:1"  # Breaking Bad S01E01 - stable, always has many streams
                THRESHOLD = 50  # keep in sync with the AIOStreamsStreamCountLow alert


                def get_json(url, timeout):
                    # urlopen raises HTTPError on 4xx/5xx responses, so a bad
                    # status lands in the except branch below.
                    req = urllib.request.Request(url, headers={"User-Agent": "AIOStreams/probe"})
                    with urllib.request.urlopen(req, timeout=timeout) as resp:
                        return json.load(resp)


                count = 0
                success = 0
                start = time.time()
                try:
                    query = urllib.parse.urlencode({"uuid": UUID, "password": PW})
                    user = get_json(f"{BASE}/api/v1/user/?{query}", 10)
                    enc_url = urllib.parse.quote(user["data"]["encryptedPassword"], safe="")
                    result = get_json(f"{BASE}/stremio/{UUID}/{enc_url}/stream/series/{TEST_ID}.json", 60)
                    count = len(result.get("streams", []))
                    success = 1 if count >= THRESHOLD else 0
                    print(f"streams={count} success={success}")
                except Exception as e:
                    # Any failure is reported as success=0 with count left at 0.
                    print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr)
                    success = 0
                duration = time.time() - start

                body = (
                    "# TYPE aiostreams_stream_count gauge\n"
                    f"aiostreams_stream_count {count}\n"
                    "# TYPE aiostreams_probe_success gauge\n"
                    f"aiostreams_probe_success {success}\n"
                    "# TYPE aiostreams_probe_duration_seconds gauge\n"
                    f"aiostreams_probe_duration_seconds {duration:.3f}\n"
                    "# TYPE aiostreams_probe_last_run_timestamp gauge\n"
                    f"aiostreams_probe_last_run_timestamp {int(time.time())}\n"
                )
                try:
                    push = urllib.request.Request(
                        PUSHGATEWAY,
                        data=body.encode("utf-8"),
                        method="POST",
                        headers={"Content-Type": "text/plain; version=0.0.4"},
                    )
                    urllib.request.urlopen(push, timeout=10).close()
                except Exception as e:
                    # Best effort: a failed push is surfaced by the staleness alert.
                    print(f"WARN: pushgateway POST failed: {type(e).__name__}: {e}", file=sys.stderr)

                sys.exit(0 if success else 1)
              EOT
              ]
              # AIOSTREAMS_UUID / AIOSTREAMS_PASSWORD come from the ExternalSecret.
              env_from {
                secret_ref { name = "aiostreams-probe-secrets" }
              }
              resources {
                requests = { memory = "64Mi", cpu = "10m" }
                limits   = { memory = "128Mi" }
              }
            }
          }
        }
      }
    }
  }
  depends_on = [kubernetes_manifest.probe_secrets, kubernetes_deployment.aiostreams]
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
module "ingress" {
source = "../../../modules/kubernetes/ingress_factory"
# auth = "app": AIOStreams enforces its own UUID + password gate on /configure