aiostreams: 1h stream cache + canary stream-count probe + 3 alerts
Hardening pass following the empty-stream-list incident: 1. STREAM_CACHE_TTL=3600 — re-enables stream payload cache (was -1 / disabled). Default behaviour hit all 5 upstream addons on every Stremio request; with a 1h TTL repeat requests for the same title are instant, while RD cache invalidations still propagate quickly. 2. aiostreams-stream-probe CronJob (every 5 min): fetches the user's encryptedPassword via the internal ClusterIP, runs a canary stream search for Breaking Bad S01E01, pushes streams_count + probe_success to Pushgateway. Uses an ExternalSecret pulling UUID + password from Vault secret/viktor. Same pattern as email-roundtrip-monitor. 3. Three alerts in monitoring's prometheus_chart_values.tpl: - AIOStreamsStreamCountLow (< 50 streams for 30m) - AIOStreamsProbeFailing (probe_success == 0 for 30m) - AIOStreamsProbeStale (last_run_timestamp > 30min for 10m) Verified: probe returned streams=411 success=1 on first run; all 3 alerts loaded into Prometheus with state=inactive health=ok.
This commit is contained in:
parent
e037b160d0
commit
b6e334daab
2 changed files with 153 additions and 0 deletions
|
|
@ -2298,6 +2298,30 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
|
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
|
||||||
|
- alert: AIOStreamsStreamCountLow
|
||||||
|
expr: aiostreams_stream_count{job="aiostreams-stream-probe"} < 50
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "AIOStreams returning <50 streams for the canary title for 30m"
|
||||||
|
description: "Probe for Breaking Bad S01E01 returned {{ $value }} streams. Could indicate an upstream addon outage, RD filter expansion, or a regression in the user's preset filters. Check `kubectl -n aiostreams get cronjob aiostreams-stream-probe` and the most recent job pod logs."
|
||||||
|
- alert: AIOStreamsProbeFailing
|
||||||
|
expr: aiostreams_probe_success{job="aiostreams-stream-probe"} == 0
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "AIOStreams stream-probe failing for 30m"
|
||||||
|
description: "The /api/v1/user fetch or stream search is returning errors, or stream count is below threshold. Check probe logs."
|
||||||
|
- alert: AIOStreamsProbeStale
|
||||||
|
expr: (time() - aiostreams_probe_last_run_timestamp{job="aiostreams-stream-probe"}) > 1800
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "AIOStreams stream-probe hasn't run in >30 min"
|
||||||
|
description: "CronJob may be unschedulable or failing before pushgateway POST."
|
||||||
- alert: ClaudeOAuthTokenExpiringSoon
|
- alert: ClaudeOAuthTokenExpiringSoon
|
||||||
expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (30 * 86400)
|
expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (30 * 86400)
|
||||||
for: 1h
|
for: 1h
|
||||||
|
|
|
||||||
|
|
@ -93,6 +93,15 @@ resource "kubernetes_deployment" "aiostreams" {
|
||||||
name = "DATABASE_URI"
|
name = "DATABASE_URI"
|
||||||
value = var.aiostreams_database_connection_string
|
value = var.aiostreams_database_connection_string
|
||||||
}
|
}
|
||||||
|
env {
|
||||||
|
# Cache stream-response payloads for 1h. Default is -1 (disabled),
|
||||||
|
# which made every Stremio request hit all 5 upstream addons live —
|
||||||
|
# slow, and contributed to the perceived empty-list issue when an
|
||||||
|
# upstream was slow/erroring. 1h is short enough that RD cache
|
||||||
|
# invalidations are picked up quickly.
|
||||||
|
name = "STREAM_CACHE_TTL"
|
||||||
|
value = "3600"
|
||||||
|
}
|
||||||
volume_mount {
|
volume_mount {
|
||||||
name = "data"
|
name = "data"
|
||||||
mount_path = "/app/data"
|
mount_path = "/app/data"
|
||||||
|
|
@ -143,6 +152,126 @@ resource "kubernetes_service" "aiostreams" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resource "kubernetes_manifest" "probe_secrets" {
|
||||||
|
manifest = {
|
||||||
|
apiVersion = "external-secrets.io/v1beta1"
|
||||||
|
kind = "ExternalSecret"
|
||||||
|
metadata = {
|
||||||
|
name = "aiostreams-probe-secrets"
|
||||||
|
namespace = kubernetes_namespace.aiostreams.metadata[0].name
|
||||||
|
}
|
||||||
|
spec = {
|
||||||
|
refreshInterval = "15m"
|
||||||
|
secretStoreRef = {
|
||||||
|
name = "vault-kv"
|
||||||
|
kind = "ClusterSecretStore"
|
||||||
|
}
|
||||||
|
target = { name = "aiostreams-probe-secrets" }
|
||||||
|
data = [
|
||||||
|
{ secretKey = "AIOSTREAMS_UUID", remoteRef = { key = "viktor", property = "aiostreams_uuid" } },
|
||||||
|
{ secretKey = "AIOSTREAMS_PASSWORD", remoteRef = { key = "viktor", property = "aiostreams_password" } },
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
depends_on = [kubernetes_namespace.aiostreams]
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "kubernetes_cron_job_v1" "stream_probe" {
|
||||||
|
metadata {
|
||||||
|
name = "aiostreams-stream-probe"
|
||||||
|
namespace = kubernetes_namespace.aiostreams.metadata[0].name
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
schedule = "*/5 * * * *"
|
||||||
|
concurrency_policy = "Replace"
|
||||||
|
successful_jobs_history_limit = 3
|
||||||
|
failed_jobs_history_limit = 3
|
||||||
|
job_template {
|
||||||
|
metadata {}
|
||||||
|
spec {
|
||||||
|
backoff_limit = 1
|
||||||
|
ttl_seconds_after_finished = 300
|
||||||
|
template {
|
||||||
|
metadata {}
|
||||||
|
spec {
|
||||||
|
restart_policy = "Never"
|
||||||
|
container {
|
||||||
|
name = "probe"
|
||||||
|
image = "docker.io/library/python:3.12-alpine"
|
||||||
|
command = ["/bin/sh", "-c", <<-EOT
|
||||||
|
pip install --quiet --disable-pip-version-check requests && python3 -c '
|
||||||
|
import requests, os, time, urllib.parse, sys
|
||||||
|
|
||||||
|
BASE = "http://aiostreams.aiostreams.svc.cluster.local"
|
||||||
|
PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/aiostreams-stream-probe"
|
||||||
|
UUID = os.environ["AIOSTREAMS_UUID"]
|
||||||
|
PW = os.environ["AIOSTREAMS_PASSWORD"]
|
||||||
|
TEST_ID = "tt0903747:1:1" # Breaking Bad S01E01 - stable, always has many streams
|
||||||
|
THRESHOLD = 50
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
success = 0
|
||||||
|
duration = 0
|
||||||
|
start = time.time()
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(f"{BASE}/api/v1/user/", params={"uuid": UUID, "password": PW}, timeout=10)
|
||||||
|
r.raise_for_status()
|
||||||
|
enc = r.json()["data"]["encryptedPassword"]
|
||||||
|
enc_url = urllib.parse.quote(enc, safe="")
|
||||||
|
r2 = requests.get(
|
||||||
|
f"{BASE}/stremio/{UUID}/{enc_url}/stream/series/{TEST_ID}.json",
|
||||||
|
headers={"User-Agent": "AIOStreams/probe"}, timeout=60,
|
||||||
|
)
|
||||||
|
r2.raise_for_status()
|
||||||
|
count = len(r2.json().get("streams", []))
|
||||||
|
success = 1 if count >= THRESHOLD else 0
|
||||||
|
print(f"streams={count} success={success}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"ERROR: {e}", file=sys.stderr)
|
||||||
|
success = 0
|
||||||
|
|
||||||
|
duration = time.time() - start
|
||||||
|
|
||||||
|
body = (
|
||||||
|
"# TYPE aiostreams_stream_count gauge\n"
|
||||||
|
f"aiostreams_stream_count {count}\n"
|
||||||
|
"# TYPE aiostreams_probe_success gauge\n"
|
||||||
|
f"aiostreams_probe_success {success}\n"
|
||||||
|
"# TYPE aiostreams_probe_duration_seconds gauge\n"
|
||||||
|
f"aiostreams_probe_duration_seconds {duration:.3f}\n"
|
||||||
|
"# TYPE aiostreams_probe_last_run_timestamp gauge\n"
|
||||||
|
f"aiostreams_probe_last_run_timestamp {int(time.time())}\n"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
requests.post(PUSHGATEWAY, data=body, timeout=10).raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
print(f"WARN: pushgateway POST failed: {e}", file=sys.stderr)
|
||||||
|
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
|
'
|
||||||
|
EOT
|
||||||
|
]
|
||||||
|
env_from {
|
||||||
|
secret_ref { name = "aiostreams-probe-secrets" }
|
||||||
|
}
|
||||||
|
resources {
|
||||||
|
requests = { memory = "64Mi", cpu = "10m" }
|
||||||
|
limits = { memory = "128Mi" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
depends_on = [kubernetes_manifest.probe_secrets, kubernetes_deployment.aiostreams]
|
||||||
|
lifecycle {
|
||||||
|
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||||
|
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
module "ingress" {
|
module "ingress" {
|
||||||
source = "../../../modules/kubernetes/ingress_factory"
|
source = "../../../modules/kubernetes/ingress_factory"
|
||||||
# auth = "app": AIOStreams enforces its own UUID + password gate on /configure
|
# auth = "app": AIOStreams enforces its own UUID + password gate on /configure
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue