From a0725ede57be066b12a25912e01382afdc4b4d2d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 16 Jun 2026 08:18:32 +0000 Subject: [PATCH 1/2] chrome-service: stop ignoring container[0].image so TF re-asserts the pinned browser image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chrome-service container (container[0]) runs the pinned Microsoft Playwright image, which ships chromium under /ms-playwright. Its image was still listed in the deployment's lifecycle ignore_changes — a leftover KEEL_IGNORE from before ADR-0002 #29 moved the novnc container to TF management. With that field ignored, a stray clobber of container[0] to ghcr chrome-service-novnc:latest (which has no chromium there) stuck permanently: the container crash-looped ~12h on "chromium binary not found under /ms-playwright" (273 restarts) and TF could not revert it. Remove container[0].image from ignore_changes so Terraform pins it to local.image and re-asserts it on every apply. Both containers are TF-managed now (novnc since ADR-0002 #29); Keel is inert (policy=never), so nothing should fight TF here. Surfaced by /cluster-health. Live state was already restored transiently via kubectl set image; this commit makes the fix durable. Co-Authored-By: Claude Opus 4.8 --- stacks/chrome-service/main.tf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/stacks/chrome-service/main.tf b/stacks/chrome-service/main.tf index a0e803c9..30210808 100644 --- a/stacks/chrome-service/main.tf +++ b/stacks/chrome-service/main.tf @@ -439,8 +439,12 @@ resource "kubernetes_deployment" "chrome_service" { metadata[0].annotations["keel.sh/trigger"], metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 metadata[0].annotations["keel.sh/match-tag"], - spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates - # container[1]=novnc now TF-managed on ghcr:latest (ADR-0002 #29) — was KEEL_IGNORE + # container[0]=chrome-service (MS Playwright, pinned via local.image) and + # container[1]=novnc (ghcr:latest, ADR-0002 #29) are BOTH TF-managed now. + # container[0].image was previously KEEL_IGNORE'd here; that let a stray + # clobber to the novnc image stick (chromium-not-found crashloop 2026-06-16) + # because TF could not revert the ignored field. Removed so TF re-asserts the + # pinned image. Keel is inert (keel.sh/policy=never) and no deploy step touches these. spec[0].template[0].spec[0].init_container[0].image, metadata[0].annotations["kubernetes.io/change-cause"], metadata[0].annotations["deployment.kubernetes.io/revision"], From 2479560fa24b6feef0bcb2e2369627de265730af Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 16 Jun 2026 08:18:33 +0000 Subject: [PATCH 2/2] mam-farming: make MAMFarmingStuck a grabber heartbeat, not a grab-count check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MAMFarmingStuck fired whenever the freeleech grabber added 0 torrents in 4h, but grabbing 0 is normal: the grabber searches a random catalogue offset each run and legitimately finds nothing when freeleech is dry (account ratio was a healthy 37.5; the alert even misreported it as "0.00" because $value was the grabbed count, not the ratio). The alert's real intent was to catch the grabber not running at all (CronJob Forbid-blocked / wedged), but increase(grabbed[4h])==0 cannot distinguish "didn't run" from "ran, nothing to grab" since Pushgateway serves the last pushed value forever. The grabber now heartbeats mam_grabber_last_run_timestamp on every completed run (main success, ratio/mouse skip, and qBittorrent-unreachable paths). The alert fires only when that heartbeat is >4h stale — the true stuck condition. Cookie expiry and qBittorrent-down keep their own dedicated alerts. Surfaced by /cluster-health as a false-firing alert. Co-Authored-By: Claude Opus 4.8 --- .../monitoring/prometheus_chart_values.tpl | 17 ++++++++++++----- .../mam-farming/files/freeleech-grabber.py | 8 +++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index ce35bfb7..a86c832f 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2840,15 +2840,22 @@ serverFiles: annotations: summary: "MAM ratio is {{ $value | printf \"%.2f\" }} for 24h (target: >= 1.0)" - alert: MAMFarmingStuck + # Heartbeat-based: fires only when the grabber CronJob has not COMPLETED + # a run in >4h (the original failure mode: Forbid-blocked / wedged in + # ContainerCreating). The grabber heartbeats mam_grabber_last_run_timestamp + # on every completed run — including legit dry runs that grab 0 (its random + # search offset lands on an empty/over-filtered page, which is normal). The + # old increase(mam_farming_grabbed[4h])==0 could not tell "didn't run" from + # "ran, nothing to grab" (Pushgateway serves the last value forever), so a + # dry freeleech period false-fired. Cookie-expiry and qBittorrent-down have + # their own alerts (MAM session cookie / QBittorrentDisconnected). expr: | - increase(mam_farming_grabbed[4h]) == 0 - and mam_farming_total_seeding < 150 - and mam_ratio >= 1.2 - for: 4h + time() - mam_grabber_last_run_timestamp > 4 * 3600 + for: 15m labels: severity: warning annotations: - summary: "Grabber has added 0 torrents in 4h despite healthy ratio ({{ $value | printf \"%.2f\" }})" + summary: "MAM freeleech grabber has not completed a run in {{ $value | humanizeDuration }} — CronJob stuck/blocked" - alert: MAMJanitorStuckBacklog expr: mam_janitor_skipped_active > 400 for: 6h diff --git a/stacks/servarr/mam-farming/files/freeleech-grabber.py b/stacks/servarr/mam-farming/files/freeleech-grabber.py index 3c7064c7..160a160e 100644 --- a/stacks/servarr/mam-farming/files/freeleech-grabber.py +++ b/stacks/servarr/mam-farming/files/freeleech-grabber.py @@ -134,6 +134,7 @@ def main(): profile_metrics + f'mam_grabber_skipped_reason{{reason="{reason}"}} 1\n' + f"mam_farming_grabbed 0\n" + + f"mam_grabber_last_run_timestamp {int(time.time())}\n" ) return @@ -153,7 +154,11 @@ def main(): ).json() except Exception as e: print(f"qBittorrent unreachable: {e}", file=sys.stderr) - push(profile_metrics + "mam_farming_grabbed 0\n") + push( + profile_metrics + + "mam_farming_grabbed 0\n" + + f"mam_grabber_last_run_timestamp {int(time.time())}\n" + ) sys.exit(1) farming = [t for t in all_torrents if t.get("category") == "mam-farming"] @@ -264,6 +269,7 @@ def main(): + f"mam_farming_grabbed {grabbed}\n" + f"mam_farming_total_seeding {len(farming) + grabbed}\n" + f"mam_farming_size_bytes {total_size}\n" + + f"mam_grabber_last_run_timestamp {int(time.time())}\n" ) push(metrics) print(f"Done: grabbed={grabbed}")