Compare commits
2 commits
1ba453c65d
...
2479560fa2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2479560fa2 | ||
|
|
a0725ede57 |
3 changed files with 25 additions and 8 deletions
|
|
@ -439,8 +439,12 @@ resource "kubernetes_deployment" "chrome_service" {
|
|||
metadata[0].annotations["keel.sh/trigger"],
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
metadata[0].annotations["keel.sh/match-tag"],
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
# container[1]=novnc now TF-managed on ghcr:latest (ADR-0002 #29) — was KEEL_IGNORE
|
||||
# container[0]=chrome-service (MS Playwright, pinned via local.image) and
|
||||
# container[1]=novnc (ghcr:latest, ADR-0002 #29) are BOTH TF-managed now.
|
||||
# container[0].image was previously KEEL_IGNORE'd here; that let a stray
|
||||
# clobber to the novnc image stick (chromium-not-found crashloop 2026-06-16)
|
||||
# because TF could not revert the ignored field. Removed so TF re-asserts the
|
||||
# pinned image. Keel is inert (keel.sh/policy=never) and no deploy step touches these.
|
||||
spec[0].template[0].spec[0].init_container[0].image,
|
||||
metadata[0].annotations["kubernetes.io/change-cause"],
|
||||
metadata[0].annotations["deployment.kubernetes.io/revision"],
|
||||
|
|
|
|||
|
|
@ -2840,15 +2840,22 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "MAM ratio is {{ $value | printf \"%.2f\" }} for 24h (target: >= 1.0)"
|
||||
- alert: MAMFarmingStuck
|
||||
# Heartbeat-based: fires only when the grabber CronJob has not COMPLETED
|
||||
# a run in >4h (the original failure mode: Forbid-blocked / wedged in
|
||||
# ContainerCreating). The grabber heartbeats mam_grabber_last_run_timestamp
|
||||
# on every completed run — including legit dry runs that grab 0 (its random
|
||||
# search offset lands on an empty/over-filtered page, which is normal). The
|
||||
# old increase(mam_farming_grabbed[4h])==0 could not tell "didn't run" from
|
||||
# "ran, nothing to grab" (Pushgateway serves the last value forever), so a
|
||||
# dry freeleech period false-fired. Cookie-expiry and qBittorrent-down have
|
||||
# their own alerts (MAM session cookie / QBittorrentDisconnected).
|
||||
expr: |
|
||||
increase(mam_farming_grabbed[4h]) == 0
|
||||
and mam_farming_total_seeding < 150
|
||||
and mam_ratio >= 1.2
|
||||
for: 4h
|
||||
time() - mam_grabber_last_run_timestamp > 4 * 3600
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Grabber has added 0 torrents in 4h despite healthy ratio ({{ $value | printf \"%.2f\" }})"
|
||||
summary: "MAM freeleech grabber has not completed a run in {{ $value | humanizeDuration }} — CronJob stuck/blocked"
|
||||
- alert: MAMJanitorStuckBacklog
|
||||
expr: mam_janitor_skipped_active > 400
|
||||
for: 6h
|
||||
|
|
|
|||
|
|
@ -134,6 +134,7 @@ def main():
|
|||
profile_metrics
|
||||
+ f'mam_grabber_skipped_reason{{reason="{reason}"}} 1\n'
|
||||
+ f"mam_farming_grabbed 0\n"
|
||||
+ f"mam_grabber_last_run_timestamp {int(time.time())}\n"
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -153,7 +154,11 @@ def main():
|
|||
).json()
|
||||
except Exception as e:
|
||||
print(f"qBittorrent unreachable: {e}", file=sys.stderr)
|
||||
push(profile_metrics + "mam_farming_grabbed 0\n")
|
||||
push(
|
||||
profile_metrics
|
||||
+ "mam_farming_grabbed 0\n"
|
||||
+ f"mam_grabber_last_run_timestamp {int(time.time())}\n"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
farming = [t for t in all_torrents if t.get("category") == "mam-farming"]
|
||||
|
|
@ -264,6 +269,7 @@ def main():
|
|||
+ f"mam_farming_grabbed {grabbed}\n"
|
||||
+ f"mam_farming_total_seeding {len(farming) + grabbed}\n"
|
||||
+ f"mam_farming_size_bytes {total_size}\n"
|
||||
+ f"mam_grabber_last_run_timestamp {int(time.time())}\n"
|
||||
)
|
||||
push(metrics)
|
||||
print(f"Done: grabbed={grabbed}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue