aiostreams: 1h stream cache + canary stream-count probe + 3 alerts
Hardening pass following the empty-stream-list incident:

1. STREAM_CACHE_TTL=3600 — re-enables the stream payload cache (was -1 / disabled). The default behaviour hit all 5 upstream addons on every Stremio request; with a 1h TTL, repeat requests for the same title are instant, while RD cache invalidations still propagate quickly.
2. aiostreams-stream-probe CronJob (every 5 min): fetches the user's encryptedPassword via the internal ClusterIP, runs a canary stream search for Breaking Bad S01E01, and pushes aiostreams_stream_count + aiostreams_probe_success to Pushgateway. Uses an ExternalSecret pulling the UUID + password from Vault secret/viktor. Same pattern as email-roundtrip-monitor.
3. Three alerts in monitoring's prometheus_chart_values.tpl:
   - AIOStreamsStreamCountLow (< 50 streams for 30m)
   - AIOStreamsProbeFailing (probe_success == 0 for 30m)
   - AIOStreamsProbeStale (last run older than 30 min, for 10m)

Verified: the probe returned streams=411 success=1 on its first run; all 3 alerts loaded into Prometheus with state=inactive health=ok.
This commit is contained in:
parent
fba5ee2df4
commit
95b9f7bc89
2 changed files with 153 additions and 0 deletions
|
|
@ -2298,6 +2298,30 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
|
||||
# Canary stream count has stayed below the 50-stream floor for 30 minutes.
# The probe initialises its count to 0 and still pushes it when a request
# fails, so an outright probe error trips this rule as well.
- alert: AIOStreamsStreamCountLow
  expr: aiostreams_stream_count{job="aiostreams-stream-probe"} < 50
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams returning <50 streams for the canary title for 30m"
    description: "Probe for Breaking Bad S01E01 returned {{ $value }} streams. Could indicate an upstream addon outage, RD filter expansion, or a regression in the user's preset filters. Check `kubectl -n aiostreams get cronjob aiostreams-stream-probe` and the most recent job pod logs."
|
||||
# The probe has reported success == 0 for 30 minutes. The probe sets success
# to 0 both when a request raises and when the stream count is under its
# threshold, so this rule covers errors and low counts alike.
- alert: AIOStreamsProbeFailing
  expr: aiostreams_probe_success{job="aiostreams-stream-probe"} == 0
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams stream-probe failing for 30m"
    description: "The /api/v1/user fetch or stream search is returning errors, or stream count is below threshold. Check probe logs."
|
||||
# Dead-man's switch for the probe itself.
# Left operand: the last pushed run timestamp is more than 30 minutes old.
# Right operand: the timestamp series does not exist at all (probe never
# pushed, or Pushgateway lost its state). Without it the comparison returns
# no samples and this rule, like the two value-based rules, stays silent.
- alert: AIOStreamsProbeStale
  expr: >-
    (time() - aiostreams_probe_last_run_timestamp{job="aiostreams-stream-probe"}) > 1800
    or absent(aiostreams_probe_last_run_timestamp{job="aiostreams-stream-probe"})
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "AIOStreams stream-probe hasn't run in >30 min"
    description: "CronJob may be unschedulable or failing before pushgateway POST. Also fires when the probe's timestamp metric is missing from Pushgateway entirely (never pushed, or Pushgateway state was lost)."
|
||||
- alert: ClaudeOAuthTokenExpiringSoon
|
||||
expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (30 * 86400)
|
||||
for: 1h
|
||||
|
|
|
|||
|
|
@ -93,6 +93,15 @@ resource "kubernetes_deployment" "aiostreams" {
|
|||
name = "DATABASE_URI"
|
||||
value = var.aiostreams_database_connection_string
|
||||
}
|
||||
env {
  # Keep stream-response payloads in cache for one hour (3600 s).
  # With the default of -1 the cache is disabled, so every Stremio request
  # went out live to all 5 upstream addons. That was slow, and a slow or
  # erroring upstream contributed to the perceived empty-list issue.
  # One hour is still short enough for RD cache invalidations to be picked
  # up quickly.
  name  = "STREAM_CACHE_TTL"
  value = "3600"
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/app/data"
|
||||
|
|
@ -143,6 +152,126 @@ resource "kubernetes_service" "aiostreams" {
|
|||
}
|
||||
}
|
||||
|
||||
# ExternalSecret that produces the `aiostreams-probe-secrets` Secret in the
# aiostreams namespace. Both values are read from the `viktor` entry of the
# `vault-kv` ClusterSecretStore and re-synced every 15 minutes. The stream
# probe CronJob loads the resulting Secret as environment variables.
resource "kubernetes_manifest" "probe_secrets" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "aiostreams-probe-secrets"
      namespace = kubernetes_namespace.aiostreams.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      # Name of the Kubernetes Secret to create.
      target = {
        name = "aiostreams-probe-secrets"
      }

      # secretKey is the key inside the generated Secret (and therefore the
      # env var name seen by the probe); remoteRef selects the source field.
      data = [
        {
          secretKey = "AIOSTREAMS_UUID"
          remoteRef = {
            key      = "viktor"
            property = "aiostreams_uuid"
          }
        },
        {
          secretKey = "AIOSTREAMS_PASSWORD"
          remoteRef = {
            key      = "viktor"
            property = "aiostreams_password"
          }
        },
      ]
    }
  }

  depends_on = [kubernetes_namespace.aiostreams]
}
|
||||
|
||||
# Canary probe for AIOStreams.
#
# Every 5 minutes a short-lived pod:
#   1. looks up the user's encryptedPassword via /api/v1/user on the
#      in-cluster service,
#   2. runs a stream search for a fixed, known-good title,
#   3. pushes stream count, success flag, duration and a last-run timestamp
#      to Pushgateway under job "aiostreams-stream-probe".
#
# The pod exits non-zero when the probe is unsuccessful, so failed runs are
# visible as failed Jobs as well as through the pushed metrics.
resource "kubernetes_cron_job_v1" "stream_probe" {
  metadata {
    name      = "aiostreams-stream-probe"
    namespace = kubernetes_namespace.aiostreams.metadata[0].name
  }

  spec {
    schedule                      = "*/5 * * * *"
    concurrency_policy            = "Replace"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {}

      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 300

        template {
          metadata {}

          spec {
            restart_policy = "Never"

            container {
              name  = "probe"
              image = "docker.io/library/python:3.12-alpine"

              # The script uses only the Python standard library:
              #  - no `pip install` at start-up, so a PyPI or egress outage
              #    can no longer turn into a false probe failure;
              #  - urllib's exception text does not include the request URL,
              #    so the `password=` query parameter and the encrypted
              #    password path segment are not written to pod logs when a
              #    request fails.
              # python3 is exec'd directly, so the script is not wrapped in
              # shell quoting.
              command = ["python3", "-c", <<-EOT
                import json
                import os
                import sys
                import time
                import urllib.parse
                import urllib.request

                BASE = "http://aiostreams.aiostreams.svc.cluster.local"
                PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/aiostreams-stream-probe"
                UUID = os.environ["AIOSTREAMS_UUID"]
                PW = os.environ["AIOSTREAMS_PASSWORD"]
                TEST_ID = "tt0903747:1:1"  # Breaking Bad S01E01 - stable, always has many streams
                THRESHOLD = 50


                def get_json(url, timeout, headers=None):
                    # GET url and decode the JSON body. urlopen raises HTTPError
                    # for 4xx/5xx responses, so no explicit status check is needed.
                    req = urllib.request.Request(url, headers=headers or {})
                    with urllib.request.urlopen(req, timeout=timeout) as resp:
                        return json.load(resp)


                # count stays 0 if anything below raises, so
                # aiostreams_stream_count reads 0 when the probe itself fails.
                count = 0
                success = 0
                stage = "user lookup"
                start = time.time()

                try:
                    query = urllib.parse.urlencode({"uuid": UUID, "password": PW})
                    user = get_json(f"{BASE}/api/v1/user/?{query}", timeout=10)
                    # The encrypted password becomes a path segment: escape everything.
                    enc_url = urllib.parse.quote(user["data"]["encryptedPassword"], safe="")

                    stage = "stream search"
                    result = get_json(
                        f"{BASE}/stremio/{UUID}/{enc_url}/stream/series/{TEST_ID}.json",
                        timeout=60,
                        headers={"User-Agent": "AIOStreams/probe"},
                    )
                    count = len(result.get("streams", []))
                    success = 1 if count >= THRESHOLD else 0
                    print(f"streams={count} success={success}")
                except Exception as e:
                    # Log the stage instead of the URL; the URL carries credentials.
                    print(f"ERROR ({stage}): {e}", file=sys.stderr)
                    success = 0

                duration = time.time() - start

                body = (
                    "# TYPE aiostreams_stream_count gauge\n"
                    f"aiostreams_stream_count {count}\n"
                    "# TYPE aiostreams_probe_success gauge\n"
                    f"aiostreams_probe_success {success}\n"
                    "# TYPE aiostreams_probe_duration_seconds gauge\n"
                    f"aiostreams_probe_duration_seconds {duration:.3f}\n"
                    "# TYPE aiostreams_probe_last_run_timestamp gauge\n"
                    f"aiostreams_probe_last_run_timestamp {int(time.time())}\n"
                )

                # Best effort: a failed push is logged but does not change the exit code.
                try:
                    push = urllib.request.Request(
                        PUSHGATEWAY,
                        data=body.encode("utf-8"),
                        method="POST",
                        headers={"Content-Type": "text/plain; version=0.0.4"},
                    )
                    with urllib.request.urlopen(push, timeout=10):
                        pass
                except Exception as e:
                    print(f"WARN: pushgateway POST failed: {e}", file=sys.stderr)

                sys.exit(0 if success else 1)
              EOT
              ]

              # AIOSTREAMS_UUID / AIOSTREAMS_PASSWORD come from this Secret.
              env_from {
                secret_ref { name = "aiostreams-probe-secrets" }
              }

              resources {
                requests = { memory = "64Mi", cpu = "10m" }
                limits   = { memory = "128Mi" }
              }
            }
          }
        }
      }
    }
  }

  depends_on = [kubernetes_manifest.probe_secrets, kubernetes_deployment.aiostreams]

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
|
||||
module "ingress" {
|
||||
source = "../../../modules/kubernetes/ingress_factory"
|
||||
# auth = "app": AIOStreams enforces its own UUID + password gate on /configure
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue