infra/stacks/servarr/aiostreams/main.tf
Viktor Barzin 95b9f7bc89 aiostreams: 1h stream cache + canary stream-count probe + 3 alerts
Hardening pass following the empty-stream-list incident:

1. STREAM_CACHE_TTL=3600 — re-enables stream payload cache (was -1 /
   disabled). Default behaviour hit all 5 upstream addons on every
   Stremio request; with a 1h TTL repeat requests for the same title
   are instant, while RD cache invalidations still propagate quickly.

2. aiostreams-stream-probe CronJob (every 5 min): fetches the user's
   encryptedPassword via the internal ClusterIP, runs a canary stream
   search for Breaking Bad S01E01, pushes streams_count + probe_success
   to Pushgateway. Uses an ExternalSecret pulling UUID + password from
   Vault secret/viktor. Same pattern as email-roundtrip-monitor.

3. Three alerts in monitoring's prometheus_chart_values.tpl:
   - AIOStreamsStreamCountLow  (< 50 streams for 30m)
   - AIOStreamsProbeFailing    (probe_success == 0 for 30m)
   - AIOStreamsProbeStale      (last_run_timestamp > 30min for 10m)

Verified: probe returned streams=411 success=1 on first run; all 3
alerts loaded into Prometheus with state=inactive health=ok.
2026-05-22 14:16:46 +00:00

295 lines
8.8 KiB
HCL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

variable "tls_secret_name" {}
variable "tier" { type = string }
variable "aiostreams_database_connection_string" { type = string }
variable "nfs_server" { type = string }
resource "kubernetes_namespace" "aiostreams" {
metadata {
name = "aiostreams"
labels = {
"istio-injection" : "disabled"
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
}
}
resource "random_id" "secret_key" {
byte_length = 32 # 32 bytes × 2 hex chars = 64 hex characters
}
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
wait_until_bound = false
metadata {
name = "aiostreams-data-proxmox"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
annotations = {
"resize.topolvm.io/threshold" = "10%"
"resize.topolvm.io/increase" = "100%"
"resize.topolvm.io/storage_limit" = "5Gi"
}
}
spec {
access_modes = ["ReadWriteOnce"]
storage_class_name = "proxmox-lvm"
resources {
requests = {
storage = "1Gi"
}
}
}
lifecycle {
# The autoresizer expands requests.storage up to storage_limit and
# PVCs can't shrink. Without this, every TF apply tries to revert
# to the spec value, K8s rejects the shrink, and the PVC ends up
# in Terminating-but-in-use limbo.
ignore_changes = [spec[0].resources[0].requests]
}
}
resource "kubernetes_deployment" "aiostreams" {
metadata {
name = "aiostreams"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
labels = {
app = "aiostreams"
tier = var.tier
}
}
spec {
replicas = 1
strategy {
type = "Recreate"
}
selector {
match_labels = {
app = "aiostreams"
}
}
template {
metadata {
labels = {
app = "aiostreams"
}
}
spec {
container {
image = "viren070/aiostreams:2026.05.14.1326-nightly"
name = "aiostreams"
port {
container_port = 3000
}
env {
name = "BASE_URL"
value = "https://aiostreams.viktorbarzin.me"
}
env {
name = "SECRET_KEY"
value = random_id.secret_key.hex
}
env {
name = "DATABASE_URI"
value = var.aiostreams_database_connection_string
}
env {
# Cache stream-response payloads for 1h. Default is -1 (disabled),
# which made every Stremio request hit all 5 upstream addons live —
# slow, and contributed to the perceived empty-list issue when an
# upstream was slow/erroring. 1h is short enough that RD cache
# invalidations are picked up quickly.
name = "STREAM_CACHE_TTL"
value = "3600"
}
volume_mount {
name = "data"
mount_path = "/app/data"
}
resources {
requests = {
cpu = "25m"
memory = "768Mi"
}
limits = {
memory = "768Mi"
}
}
}
volume {
name = "data"
persistent_volume_claim {
claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
}
}
}
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
ignore_changes = [spec[0].template[0].spec[0].dns_config]
}
}
resource "kubernetes_service" "aiostreams" {
metadata {
name = "aiostreams"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
labels = {
"app" = "aiostreams"
}
}
spec {
selector = {
app = "aiostreams"
}
port {
name = "http"
port = 80
target_port = 3000
}
}
}
resource "kubernetes_manifest" "probe_secrets" {
manifest = {
apiVersion = "external-secrets.io/v1beta1"
kind = "ExternalSecret"
metadata = {
name = "aiostreams-probe-secrets"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
}
spec = {
refreshInterval = "15m"
secretStoreRef = {
name = "vault-kv"
kind = "ClusterSecretStore"
}
target = { name = "aiostreams-probe-secrets" }
data = [
{ secretKey = "AIOSTREAMS_UUID", remoteRef = { key = "viktor", property = "aiostreams_uuid" } },
{ secretKey = "AIOSTREAMS_PASSWORD", remoteRef = { key = "viktor", property = "aiostreams_password" } },
]
}
}
depends_on = [kubernetes_namespace.aiostreams]
}
resource "kubernetes_cron_job_v1" "stream_probe" {
metadata {
name = "aiostreams-stream-probe"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
}
spec {
schedule = "*/5 * * * *"
concurrency_policy = "Replace"
successful_jobs_history_limit = 3
failed_jobs_history_limit = 3
job_template {
metadata {}
spec {
backoff_limit = 1
ttl_seconds_after_finished = 300
template {
metadata {}
spec {
restart_policy = "Never"
container {
name = "probe"
image = "docker.io/library/python:3.12-alpine"
command = ["/bin/sh", "-c", <<-EOT
pip install --quiet --disable-pip-version-check requests && python3 -c '
import requests, os, time, urllib.parse, sys
BASE = "http://aiostreams.aiostreams.svc.cluster.local"
PUSHGATEWAY = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/aiostreams-stream-probe"
UUID = os.environ["AIOSTREAMS_UUID"]
PW = os.environ["AIOSTREAMS_PASSWORD"]
TEST_ID = "tt0903747:1:1" # Breaking Bad S01E01 - stable, always has many streams
THRESHOLD = 50
count = 0
success = 0
duration = 0
start = time.time()
try:
r = requests.get(f"{BASE}/api/v1/user/", params={"uuid": UUID, "password": PW}, timeout=10)
r.raise_for_status()
enc = r.json()["data"]["encryptedPassword"]
enc_url = urllib.parse.quote(enc, safe="")
r2 = requests.get(
f"{BASE}/stremio/{UUID}/{enc_url}/stream/series/{TEST_ID}.json",
headers={"User-Agent": "AIOStreams/probe"}, timeout=60,
)
r2.raise_for_status()
count = len(r2.json().get("streams", []))
success = 1 if count >= THRESHOLD else 0
print(f"streams={count} success={success}")
except Exception as e:
print(f"ERROR: {e}", file=sys.stderr)
success = 0
duration = time.time() - start
body = (
"# TYPE aiostreams_stream_count gauge\n"
f"aiostreams_stream_count {count}\n"
"# TYPE aiostreams_probe_success gauge\n"
f"aiostreams_probe_success {success}\n"
"# TYPE aiostreams_probe_duration_seconds gauge\n"
f"aiostreams_probe_duration_seconds {duration:.3f}\n"
"# TYPE aiostreams_probe_last_run_timestamp gauge\n"
f"aiostreams_probe_last_run_timestamp {int(time.time())}\n"
)
try:
requests.post(PUSHGATEWAY, data=body, timeout=10).raise_for_status()
except Exception as e:
print(f"WARN: pushgateway POST failed: {e}", file=sys.stderr)
sys.exit(0 if success else 1)
'
EOT
]
env_from {
secret_ref { name = "aiostreams-probe-secrets" }
}
resources {
requests = { memory = "64Mi", cpu = "10m" }
limits = { memory = "128Mi" }
}
}
}
}
}
}
}
depends_on = [kubernetes_manifest.probe_secrets, kubernetes_deployment.aiostreams]
lifecycle {
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
}
}
module "ingress" {
source = "../../../modules/kubernetes/ingress_factory"
# auth = "app": AIOStreams enforces its own UUID + password gate on /configure
# and /api/*, and Stremio addon URLs (/stremio/{uuid}/{encryptedPassword}/...)
# use the encryptedPassword path segment as a bearer token. Authentik forward-auth
# broke Stremio clients (cannot follow OAuth 302) and is redundant with the app's
# own auth. UUIDs are 128-bit random; password attempts are rate-limited.
auth = "app"
dns_type = "proxied"
namespace = kubernetes_namespace.aiostreams.metadata[0].name
name = "aiostreams"
tls_secret_name = var.tls_secret_name
extra_annotations = {
"gethomepage.dev/enabled" = "true"
"gethomepage.dev/name" = "AIOStreams"
"gethomepage.dev/description" = "Streaming addon manager"
"gethomepage.dev/icon" = "stremio.png"
"gethomepage.dev/group" = "Media & Entertainment"
"gethomepage.dev/pod-selector" = ""
}
}