From 5b49634fe0d2ef3fc5d99ed24a28d04b08e80c88 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 27 Jun 2026 15:23:42 +0000 Subject: [PATCH] rybbit/crowdsec-cf-sync: stop Cloudflare Lists-API retry-storm (429 self-DoS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The edge-ban sync was failing every 2 min on Cloudflare HTTP 429 (rate-limited) and never recovering, leaving the crowdsec_ban list empty. Root cause: backoff_limit=2 made k8s run a failing pod up to 3x (1 attempt + 2 retries) within seconds, so each */2 cycle fired a burst of POSTs into Cloudflare's per-60s Lists-API write limit. That kept the throttle perpetually tripped (it stopped clearing even after minutes of quiet) — a self-inflicted DoS. Two changes make the sync gentle and self-healing: - backoff_limit 2 -> 0: one attempt per */2 cycle (the schedule IS the retry cadence), no rapid-fire burst. - lapi_kv_sync.py: treat a CF 429 as a soft-skip (exit 0, retry next cycle) like the existing LAPI fail-safe, instead of fail-loud + k8s retry. Any other CF error still fails loud. Found during a cluster health check (AIOStreams CSI + pfSense SSH issues handled separately). Co-Authored-By: Claude Opus 4.8 --- stacks/rybbit/crowdsec_edge.tf | 7 ++++++- stacks/rybbit/lapi_kv_sync.py | 23 ++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/stacks/rybbit/crowdsec_edge.tf b/stacks/rybbit/crowdsec_edge.tf index 692c3711..a24d0039 100644 --- a/stacks/rybbit/crowdsec_edge.tf +++ b/stacks/rybbit/crowdsec_edge.tf @@ -234,7 +234,12 @@ resource "kubernetes_cron_job_v1" "crowdsec_cf_sync" { job_template { metadata {} spec { - backoff_limit = 2 + # 0 retries: the */2 schedule IS the retry cadence. backoff_limit=2 made + # k8s re-run a failing pod up to 3x within seconds, hammering Cloudflare's + # Lists-API write limit inside one 60s window and escalating the throttle + # until it stopped clearing (2026-06-27 outage). 
One attempt per cycle + + # the 429-soft-skip in lapi_kv_sync.py keeps the sync gentle/self-healing. + backoff_limit = 0 ttl_seconds_after_finished = 3600 template { metadata { diff --git a/stacks/rybbit/lapi_kv_sync.py b/stacks/rybbit/lapi_kv_sync.py index 01eb1c89..627974ab 100644 --- a/stacks/rybbit/lapi_kv_sync.py +++ b/stacks/rybbit/lapi_kv_sync.py @@ -84,7 +84,12 @@ POLL_INTERVAL = 1.0 class CFError(Exception): - """Cloudflare API failure -> job should exit non-zero (fail loud).""" + """Cloudflare API failure. Carries the HTTP status so the caller can treat a + 429 rate-limit as a soft-skip (retry next run) instead of a hard failure.""" + + def __init__(self, message, status=None): + super().__init__(message) + self.status = status def _req(url, *, method="GET", headers=None, data=None, timeout=20): @@ -109,7 +114,7 @@ def _cf(url, *, method="GET", payload=None, timeout=20): detail = e.read().decode(errors="replace")[:500] except Exception: pass - raise CFError(f"{method} {url} -> HTTP {e.code} {detail}") from e + raise CFError(f"{method} {url} -> HTTP {e.code} {detail}", status=e.code) from e except urllib.error.URLError as e: raise CFError(f"{method} {url} -> {e}") from e if res is not None and not res.get("success", True): @@ -330,10 +335,22 @@ def main(): print(f"[info] LAPI desired: {len(block)} block (ban-only, ip-scope)") - # 2. Reconcile the single block list. CF errors fail loud (non-zero exit). + # 2. Reconcile the single block list. A 429 rate-limit is a SOFT-SKIP (exit + # 0, retry next */2 run) — like the LAPI fail-safe above — so a transient + # Cloudflare Lists-API throttle never marks the job Failed or triggers a k8s + # retry-storm (rapid re-attempts only deepen the throttle until it stops + # clearing). Any OTHER CF error still fails loud (non-zero exit). 
try: reconcile("block", CF_BAN_LIST_ID, block) except CFError as e: + if e.status == 429: + print( + f"[skip] Cloudflare rate-limited ({e}); leaving the list " + f"untouched this run, will retry next cycle (fail-safe).", + file=sys.stderr, + ) + push_metrics(len(block), ok=False) + return 0 print(f"[error] Cloudflare API failure: {e}", file=sys.stderr) push_metrics(len(block), ok=False) return 1