From 7cf93a05877daf992d99474c15f65bc2fa0ab8c3 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 20 Jun 2026 19:29:43 +0000 Subject: [PATCH] crowdsec+rybbit: proxied edge to single CF list (block-only) + retrigger firewall-bouncer apply CF account hard-limits to 1 Rules List, so proxied enforcement uses one crowdsec_ban list + one WAF block rule; the sync writes both ban and captcha decisions into it (captcha downgraded to block at the edge). Drops the second list + managed_challenge rule. Trivial touch to firewall_bouncer.tf to make CI re-apply crowdsec and recreate the DaemonSet (tar fix already in master; stale orphan was cleared). Co-Authored-By: Claude Opus 4.8 --- .../modules/crowdsec/firewall_bouncer.tf | 1 + stacks/rybbit/crowdsec_edge.tf | 72 ++++++---------- stacks/rybbit/lapi_kv_sync.py | 86 +++++++++---------- 3 files changed, 69 insertions(+), 90 deletions(-) diff --git a/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf b/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf index dee3d0fe..1cc5e549 100644 --- a/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf +++ b/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf @@ -33,6 +33,7 @@ # nodeSelector pins this to ONE node (k8s-node2, which runs a Traefik pod) for first validation. # !!! REMOVING THE nodeSelector ROLLS THIS DAEMONSET CLUSTER-WIDE !!! # Do that ONLY after the one-node validation checklist passes (see commit/PR). +# Validating on k8s-node2 (single node) before removing the nodeSelector to roll cluster-wide. locals { # Pin a specific stable release. Bump deliberately (re-validate on one node first). diff --git a/stacks/rybbit/crowdsec_edge.tf b/stacks/rybbit/crowdsec_edge.tf index 4c31b9d8..cf1607ea 100644 --- a/stacks/rybbit/crowdsec_edge.tf +++ b/stacks/rybbit/crowdsec_edge.tf @@ -4,28 +4,29 @@ # Proxied hosts terminate at the Cloudflare edge, so the in-cluster CrowdSec # bouncer (which keys on the real client IP seen by Traefik) never gets to # decide on them. To enforce CrowdSec bans/captchas on proxied traffic we push -# the decision INTO the Cloudflare edge as account-level IP Lists + a single +# the decision INTO the Cloudflare edge as a SINGLE account-level IP List + one # zone-scoped WAF custom rule: # -# * Two account IP Lists — `crowdsec_ban` and `crowdsec_captcha` — hold the -# banned / captcha'd source IPs (empty in TF; populated at runtime). +# * ONE account IP List — `crowdsec_ban` — holds BOTH the banned AND captcha'd +# source IPs (empty in TF; populated at runtime). The CF account hard-limits +# to ONE Rules List, so captcha decisions are downgraded to block at the +# edge and folded into this same list (block-only enforcement). # * A zone-scoped WAF ruleset in the http_request_firewall_custom phase -# blocks `(ip.src in $crowdsec_ban)` and managed-challenges -# `(ip.src in $crowdsec_captcha)`. Because it's a ZONE rule it enforces +# blocks `(ip.src in $crowdsec_ban)`. Because it's a ZONE rule it enforces # across ALL proxied hosts in the zone (~135), not just the handful a # Worker would route. (The previous Worker+KV design only covered the ~27 # hosts the rybbit Worker routed; the analytics Worker in worker/ is # unrelated and stays.) # -# This file is the CONTROL PLANE that keeps those lists in sync with LAPI: -# 1. the two empty IP Lists (list ITEMS are owned by the CronJob at runtime, +# This file is the CONTROL PLANE that keeps that list in sync with LAPI: +# 1. the single empty IP List (list ITEMS are owned by the CronJob at runtime, # NOT by Terraform — see the lifecycle ignore_changes on `item`), # 2. a LEAST-PRIVILEGE Cloudflare API token (account Filter-Lists edit only, # scoped to this account) the sync job authenticates with, # 3. a CronJob running lapi_kv_sync.py every 2 min to full-reconcile LAPI -# decisions into the two lists (mirrors monitoring/alert_digest.tf: stock -# python:3.12-alpine + pure-stdlib script from a ConfigMap, no pip/apk at -# runtime). +# decisions (ban + captcha) into the one list (mirrors +# monitoring/alert_digest.tf: stock python:3.12-alpine + pure-stdlib script +# from a ConfigMap, no pip/apk at runtime). # # Cloudflare provider is pinned v4.52.7 (~> 4) — v4 schema is used throughout # (v5 differs greatly: policy is a block here not a `policies = [...]` list; @@ -44,7 +45,7 @@ locals { } # ----------------------------------------------------------------------------- -# IP Lists — empty shells. The CronJob owns the items at runtime via the CF +# IP List — empty shell. The CronJob owns the items at runtime via the CF # Rules-Lists API; TF must NOT manage items or every 2-min sync would fight the # next `terragrunt apply` (apply would try to delete the runtime items). # @@ -54,7 +55,7 @@ locals { # comment=... }`. We declare NO `item` blocks (empty list) and # ignore_changes=[item] so runtime items don't show as drift. # NOTE: list `name` must match /^[a-zA-Z0-9_]+$/ (underscores ok, no dashes) -# — hence crowdsec_ban / crowdsec_captcha (underscore, not dash). +# — hence crowdsec_ban (underscore, not dash). # ----------------------------------------------------------------------------- resource "cloudflare_list" "crowdsec_ban" { account_id = local.cf_account_id @@ -69,29 +70,17 @@ resource "cloudflare_list" "crowdsec_ban" { } } -resource "cloudflare_list" "crowdsec_captcha" { - account_id = local.cf_account_id - name = "crowdsec_captcha" - kind = "ip" - description = "CrowdSec captcha decisions (synced from LAPI)" - - lifecycle { - ignore_changes = [item] - } -} - # ----------------------------------------------------------------------------- -# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, two -# rules, applied to EVERY proxied host in the zone. +# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, one +# block rule, applied to EVERY proxied host in the zone. # # ### VERIFY (v4.52.7): cloudflare_ruleset with zone_id + kind="zone" + # phase="http_request_firewall_custom"; `rules` is a repeatable block with -# action/expression/description/enabled. actions "block" and -# "managed_challenge" are both valid. List references in WAF expressions use -# the list NAME with a `$` prefix (NOT the list id): ($crowdsec_ban). -# Rule order matters — ban (block) is evaluated before captcha so a -# double-listed IP is blocked outright (the sync script also enforces -# ban-wins, so an IP is never in both lists, but order is belt-and-braces). +# action/expression/description/enabled. action "block" is valid. List +# references in WAF expressions use the list NAME with a `$` prefix (NOT the +# list id): ($crowdsec_ban). Both ban and captcha decisions land in this one +# list (the CF account allows only one Rules List), so a single block rule +# covers everything — captcha is enforced as block at the edge. # # zone_id is the viktorbarzin.me zone — the single zone id used repo-wide # (default of var.cloudflare_zone_id in modules/kubernetes/ingress_factory and @@ -116,24 +105,19 @@ resource "cloudflare_ruleset" "crowdsec" { kind = "zone" phase = "http_request_firewall_custom" - # The WAF rules reference the IP lists by name ($crowdsec_ban / $crowdsec_captcha), - # so the lists must exist before this ruleset is created/updated. - depends_on = [cloudflare_list.crowdsec_ban, cloudflare_list.crowdsec_captcha] + # The WAF rule references the IP list by name ($crowdsec_ban), so the list + # must exist before this ruleset is created/updated. + depends_on = [cloudflare_list.crowdsec_ban] - # CrowdSec ban — evaluated FIRST so a banned IP is blocked before anything else. + # CrowdSec ban — block every IP in the single edge list. The sync writes BOTH + # ban and captcha decisions into crowdsec_ban (captcha downgraded to block at + # the edge) because the CF account allows only ONE Rules List. rules { action = "block" expression = "(ip.src in $crowdsec_ban)" description = "CrowdSec: block banned IPs" enabled = true } - # CrowdSec captcha — managed challenge for flagged IPs. - rules { - action = "managed_challenge" - expression = "(ip.src in $crowdsec_captcha)" - description = "CrowdSec: challenge flagged IPs" - enabled = true - } # Pre-existing rule, imported and preserved verbatim (currently disabled). rules { action = "skip" @@ -279,10 +263,6 @@ resource "kubernetes_cron_job_v1" "crowdsec_cf_sync" { name = "CF_BAN_LIST_ID" value = cloudflare_list.crowdsec_ban.id } - env { - name = "CF_CAPTCHA_LIST_ID" - value = cloudflare_list.crowdsec_captcha.id - } env { name = "PUSHGATEWAY_URL" value = "http://prometheus-prometheus-pushgateway.monitoring:9091" diff --git a/stacks/rybbit/lapi_kv_sync.py b/stacks/rybbit/lapi_kv_sync.py index 942912c9..8a0231d5 100644 --- a/stacks/rybbit/lapi_kv_sync.py +++ b/stacks/rybbit/lapi_kv_sync.py @@ -1,30 +1,33 @@ #!/usr/bin/env python3 -"""Sync CrowdSec LAPI decisions -> two Cloudflare account IP Lists. +"""Sync CrowdSec LAPI decisions -> ONE Cloudflare account IP List (block-only). Cloudflare-PROXIED hosts terminate at the CF edge, so the in-cluster CrowdSec bouncer (which keys on the client IP Traefik sees) never decides on them. We push the decisions into the edge instead: a zone-scoped WAF custom rule blocks -`(ip.src in $crowdsec_ban)` and managed-challenges `(ip.src in $crowdsec_captcha)` -across EVERY proxied host in the zone. This job is the control plane that keeps -those two IP Lists in sync with LAPI. +`(ip.src in $crowdsec_ban)` across EVERY proxied host in the zone. This job is +the control plane that keeps that one IP List in sync with LAPI. + +The CF account hard-limits to ONE Rules List, so enforcement is BLOCK-ONLY: +BOTH ban AND captcha (scope=="ip") decisions are folded into the single +crowdsec_ban list and captcha is downgraded to block at the proxied edge. (Filename kept as lapi_kv_sync.py for path/ConfigMap continuity with the prior -Workers-KV design; it no longer touches KV — it reconciles CF Rules Lists.) +Workers-KV design; it no longer touches KV — it reconciles a CF Rules List.) Design notes: * Pure Python stdlib (no pip/apk at runtime) — runs on stock python:3.12-alpine mounted from a ConfigMap, the alert_digest pattern. - * FULL RECONCILE each run: read the complete decision set from LAPI, partition - into ban / captcha desired sets, then for each list compute add (desired - - existing) and remove (existing - desired) and apply both. An IP listed for - BOTH ban and captcha is placed in BAN ONLY (ban wins; the WAF rule order - also blocks-before-challenges as belt-and-braces). A `cscli decisions - delete` therefore clears from the edge within one interval (<=2 min). - * FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (lists untouched, - exit 0). A LAPI outage thus freezes the edge state rather than wiping every - ban — degrade toward the last-known-good block set, never toward all-block - or a thundering un-ban. (Decisions linger only until the next successful - sync, not their TTL — we reconcile to LAPI truth, we don't expire entries.) + * FULL RECONCILE each run: read the complete decision set from LAPI, take the + UNION of ban + captcha (scope=="ip") as the single desired set, then compute + add (desired - existing) and remove (existing - desired) against the one + crowdsec_ban list and apply both. A `cscli decisions delete` therefore + clears from the edge within one interval (<=2 min). + * FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (list untouched, + exit 0). A LAPI outage thus freezes the edge state rather than wiping the + block list — degrade toward the last-known-good block set, never toward + all-block or a thundering un-ban. (Decisions linger only until the next + successful sync, not their TTL — we reconcile to LAPI truth, we don't + expire entries.) * FAIL-LOUD on Cloudflare: any CF API error is logged and the job exits non-zero so the failure is visible (CronJob backoff + missing success metric + the next run retries). @@ -42,9 +45,9 @@ official API reference (developers.cloudflare.com, 2026): * GET /accounts/{acct}/rules/lists/bulk_operations/{op_id} -> status in {pending,running,completed,failed} (failed carries `error`). ASYNC HANDLING: Cloudflare allows only ONE pending bulk operation per ACCOUNT. - So we must NOT fire add+delete (or both lists) concurrently — we serialize and - poll each operation_id to a terminal state (short bounded timeout) before the - next mutation. If a poll times out we stop mutating for this run and report + So we must NOT fire add+delete concurrently — we serialize and poll each + operation_id to a terminal state (short bounded timeout) before the next + mutation. If a poll times out we stop mutating for this run and report partial success (the next 2-min run reconciles the rest); we never abandon an in-flight op and immediately issue another (that would 409/reject). """ @@ -63,7 +66,6 @@ LAPI_KEY = os.environ["LAPI_KEY"] # kvsync bouncer key, registered in LAPI CF_ACCOUNT_ID = os.environ["CF_ACCOUNT_ID"] CF_API_TOKEN = os.environ["CF_API_TOKEN"] # scoped: Account Filter Lists Edit CF_BAN_LIST_ID = os.environ["CF_BAN_LIST_ID"] -CF_CAPTCHA_LIST_ID = os.environ["CF_CAPTCHA_LIST_ID"] PUSHGATEWAY = os.environ.get("PUSHGATEWAY_URL", "").rstrip("/") # optional CF_API = "https://api.cloudflare.com/client/v4" @@ -115,17 +117,19 @@ def _cf(url, *, method="GET", payload=None, timeout=20): # LAPI # --------------------------------------------------------------------------- # def fetch_decisions(): - """Return (ban_set, captcha_set) of IPs from LAPI. + """Return the single desired set of IPs to BLOCK at the edge. - Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). An - IP appearing in both ban and captcha is placed in BAN only. Raises on - transport/HTTP error so the caller can SKIP the run (fail-safe). + Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). The + CF account allows only ONE Rules List, so BOTH "ban" AND "captcha" decisions + are folded into one block set (captcha is downgraded to block at the proxied + edge). Raises on transport/HTTP error so the caller can SKIP the run + (fail-safe). """ data = _req( f"{LAPI_URL}/v1/decisions", headers={"X-Api-Key": LAPI_KEY, "Accept": "application/json"}, ) - ban, captcha = set(), set() + block = set() for d in data or []: if (d.get("scope") or "").lower() != "ip": continue @@ -133,13 +137,10 @@ def fetch_decisions(): if not ip: continue dtype = (d.get("type") or "").lower() - if dtype == "ban": - ban.add(ip) - elif dtype == "captcha": - captcha.add(ip) + if dtype in ("ban", "captcha"): + block.add(ip) # other remediation types (e.g. throttle) are ignored - captcha -= ban # ban wins: never list the same IP in both - return ban, captcha + return block # --------------------------------------------------------------------------- # @@ -260,14 +261,12 @@ def reconcile(label, list_id, desired): # --------------------------------------------------------------------------- # # Metrics (best-effort) # --------------------------------------------------------------------------- # -def push_metrics(ban_n, captcha_n, ok): +def push_metrics(block_n, ok): if not PUSHGATEWAY: return payload = ( "# TYPE crowdsec_cf_list_ban_count gauge\n" - f"crowdsec_cf_list_ban_count {ban_n}\n" - "# TYPE crowdsec_cf_list_captcha_count gauge\n" - f"crowdsec_cf_list_captcha_count {captcha_n}\n" + f"crowdsec_cf_list_ban_count {block_n}\n" "# TYPE crowdsec_cf_list_sync_success gauge\n" f"crowdsec_cf_list_sync_success {1 if ok else 0}\n" "# TYPE crowdsec_cf_list_sync_last_run_seconds gauge\n" @@ -289,28 +288,27 @@ def push_metrics(ban_n, captcha_n, ok): def main(): # 1. Desired state from LAPI. Any failure here = SKIP (fail-safe). try: - ban, captcha = fetch_decisions() + block = fetch_decisions() except Exception as e: print( - f"[skip] LAPI unreadable ({e}); leaving CF lists untouched " + f"[skip] LAPI unreadable ({e}); leaving the CF list untouched " f"(fail-safe: freeze last-known edge state).", file=sys.stderr, ) - push_metrics(0, 0, ok=False) + push_metrics(0, ok=False) return 0 - print(f"[info] LAPI desired: {len(ban)} ban / {len(captcha)} captcha (ip-scope)") + print(f"[info] LAPI desired: {len(block)} block (ban+captcha, ip-scope)") - # 2. Reconcile both lists. CF errors fail loud (non-zero exit). + # 2. Reconcile the single block list. CF errors fail loud (non-zero exit). try: - reconcile("ban", CF_BAN_LIST_ID, ban) - reconcile("captcha", CF_CAPTCHA_LIST_ID, captcha) + reconcile("block", CF_BAN_LIST_ID, block) except CFError as e: print(f"[error] Cloudflare API failure: {e}", file=sys.stderr) - push_metrics(len(ban), len(captcha), ok=False) + push_metrics(len(block), ok=False) return 1 - push_metrics(len(ban), len(captcha), ok=True) + push_metrics(len(block), ok=True) return 0 -- 2.49.1