From 7cf93a05877daf992d99474c15f65bc2fa0ab8c3 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 20 Jun 2026 19:29:43 +0000
Subject: [PATCH] crowdsec+rybbit: proxied edge to single CF list (block-only)
 + retrigger firewall-bouncer apply

The Cloudflare account is hard-limited to one Rules List, so proxied
enforcement now uses a single crowdsec_ban list plus one WAF block rule; the
sync job writes both ban and captcha decisions into that list (captcha is
downgraded to block at the edge). This drops the second list and the
managed_challenge rule. Also makes a trivial comment-only touch to
firewall_bouncer.tf so that CI re-applies crowdsec and recreates the DaemonSet
(the tar fix is already in master; the stale orphan was cleared).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../modules/crowdsec/firewall_bouncer.tf      |  1 +
 stacks/rybbit/crowdsec_edge.tf                | 72 ++++++----------
 stacks/rybbit/lapi_kv_sync.py                 | 86 +++++++++----------
 3 files changed, 69 insertions(+), 90 deletions(-)

diff --git a/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf b/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf
index dee3d0fe..1cc5e549 100644
--- a/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf
+++ b/stacks/crowdsec/modules/crowdsec/firewall_bouncer.tf
@@ -33,6 +33,7 @@
 # nodeSelector pins this to ONE node (k8s-node2, which runs a Traefik pod) for first validation.
 # !!! REMOVING THE nodeSelector ROLLS THIS DAEMONSET CLUSTER-WIDE !!!
 # Do that ONLY after the one-node validation checklist passes (see commit/PR).
+# Validating on k8s-node2 (single node) before removing the nodeSelector to roll cluster-wide.
 
 locals {
   # Pin a specific stable release. Bump deliberately (re-validate on one node first).
diff --git a/stacks/rybbit/crowdsec_edge.tf b/stacks/rybbit/crowdsec_edge.tf
index 4c31b9d8..cf1607ea 100644
--- a/stacks/rybbit/crowdsec_edge.tf
+++ b/stacks/rybbit/crowdsec_edge.tf
@@ -4,28 +4,29 @@
 # Proxied hosts terminate at the Cloudflare edge, so the in-cluster CrowdSec
 # bouncer (which keys on the real client IP seen by Traefik) never gets to
 # decide on them. To enforce CrowdSec bans/captchas on proxied traffic we push
-# the decision INTO the Cloudflare edge as account-level IP Lists + a single
+# the decision INTO the Cloudflare edge as a SINGLE account-level IP List + one
 # zone-scoped WAF custom rule:
 #
-#   * Two account IP Lists — `crowdsec_ban` and `crowdsec_captcha` — hold the
-#     banned / captcha'd source IPs (empty in TF; populated at runtime).
+#   * ONE account IP List — `crowdsec_ban` — holds BOTH the banned AND captcha'd
+#     source IPs (empty in TF; populated at runtime). The CF account is
+#     hard-limited to ONE Rules List, so captcha decisions are downgraded to
+#     block at the edge and folded into this same list (block-only enforcement).
 #   * A zone-scoped WAF ruleset in the http_request_firewall_custom phase
-#     blocks `(ip.src in $crowdsec_ban)` and managed-challenges
-#     `(ip.src in $crowdsec_captcha)`. Because it's a ZONE rule it enforces
+#     blocks `(ip.src in $crowdsec_ban)`. Because it's a ZONE rule it enforces
 #     across ALL proxied hosts in the zone (~135), not just the handful a
 #     Worker would route. (The previous Worker+KV design only covered the ~27
 #     hosts the rybbit Worker routed; the analytics Worker in worker/ is
 #     unrelated and stays.)
 #
-# This file is the CONTROL PLANE that keeps those lists in sync with LAPI:
-#   1. the two empty IP Lists (list ITEMS are owned by the CronJob at runtime,
+# This file is the CONTROL PLANE that keeps that list in sync with LAPI:
+#   1. the single empty IP List (list ITEMS are owned by the CronJob at runtime,
 #      NOT by Terraform — see the lifecycle ignore_changes on `item`),
 #   2. a LEAST-PRIVILEGE Cloudflare API token (account Filter-Lists edit only,
 #      scoped to this account) the sync job authenticates with,
 #   3. a CronJob running lapi_kv_sync.py every 2 min to full-reconcile LAPI
-#      decisions into the two lists (mirrors monitoring/alert_digest.tf: stock
-#      python:3.12-alpine + pure-stdlib script from a ConfigMap, no pip/apk at
-#      runtime).
+#      decisions (ban + captcha) into the one list (mirrors
+#      monitoring/alert_digest.tf: stock python:3.12-alpine + pure-stdlib script
+#      from a ConfigMap, no pip/apk at runtime).
 #
 # Cloudflare provider is pinned v4.52.7 (~> 4) — v4 schema is used throughout
 # (v5 differs greatly: policy is a block here not a `policies = [...]` list;
@@ -44,7 +45,7 @@ locals {
 }
 
 # -----------------------------------------------------------------------------
-# IP Lists — empty shells. The CronJob owns the items at runtime via the CF
+# IP List — empty shell. The CronJob owns the items at runtime via the CF
 # Rules-Lists API; TF must NOT manage items or every 2-min sync would fight the
 # next `terragrunt apply` (apply would try to delete the runtime items).
 #
@@ -54,7 +55,7 @@ locals {
 #     comment=... }`. We declare NO `item` blocks (empty list) and
 #     ignore_changes=[item] so runtime items don't show as drift.
 #     NOTE: list `name` must match /^[a-zA-Z0-9_]+$/ (underscores ok, no dashes)
-#     — hence crowdsec_ban / crowdsec_captcha (underscore, not dash).
+#     — hence crowdsec_ban (underscore, not dash).
 # -----------------------------------------------------------------------------
 resource "cloudflare_list" "crowdsec_ban" {
   account_id  = local.cf_account_id
@@ -69,29 +70,17 @@ resource "cloudflare_list" "crowdsec_ban" {
   }
 }
 
-resource "cloudflare_list" "crowdsec_captcha" {
-  account_id  = local.cf_account_id
-  name        = "crowdsec_captcha"
-  kind        = "ip"
-  description = "CrowdSec captcha decisions (synced from LAPI)"
-
-  lifecycle {
-    ignore_changes = [item]
-  }
-}
-
 # -----------------------------------------------------------------------------
-# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, two
-# rules, applied to EVERY proxied host in the zone.
+# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, one
+# block rule, applied to EVERY proxied host in the zone.
 #
 # ### VERIFY (v4.52.7): cloudflare_ruleset with zone_id + kind="zone" +
 #     phase="http_request_firewall_custom"; `rules` is a repeatable block with
-#     action/expression/description/enabled. actions "block" and
-#     "managed_challenge" are both valid. List references in WAF expressions use
-#     the list NAME with a `$` prefix (NOT the list id): ($crowdsec_ban).
-#     Rule order matters — ban (block) is evaluated before captcha so a
-#     double-listed IP is blocked outright (the sync script also enforces
-#     ban-wins, so an IP is never in both lists, but order is belt-and-braces).
+#     action/expression/description/enabled. action "block" is valid. List
+#     references in WAF expressions use the list NAME with a `$` prefix (NOT the
+#     list id): ($crowdsec_ban). Both ban and captcha decisions land in this one
+#     list (the CF account allows only one Rules List), so a single block rule
+#     covers everything — captcha is enforced as block at the edge.
 #
 # zone_id is the viktorbarzin.me zone — the single zone id used repo-wide
 # (default of var.cloudflare_zone_id in modules/kubernetes/ingress_factory and
@@ -116,24 +105,19 @@ resource "cloudflare_ruleset" "crowdsec" {
   kind    = "zone"
   phase   = "http_request_firewall_custom"
 
-  # The WAF rules reference the IP lists by name ($crowdsec_ban / $crowdsec_captcha),
-  # so the lists must exist before this ruleset is created/updated.
-  depends_on = [cloudflare_list.crowdsec_ban, cloudflare_list.crowdsec_captcha]
+  # The WAF rule references the IP list by name ($crowdsec_ban), so the list
+  # must exist before this ruleset is created/updated.
+  depends_on = [cloudflare_list.crowdsec_ban]
 
-  # CrowdSec ban — evaluated FIRST so a banned IP is blocked before anything else.
+  # CrowdSec ban — block every IP in the single edge list. The sync writes BOTH
+  # ban and captcha decisions into crowdsec_ban (captcha downgraded to block at
+  # the edge) because the CF account allows only ONE Rules List.
   rules {
     action      = "block"
     expression  = "(ip.src in $crowdsec_ban)"
     description = "CrowdSec: block banned IPs"
     enabled     = true
   }
-  # CrowdSec captcha — managed challenge for flagged IPs.
-  rules {
-    action      = "managed_challenge"
-    expression  = "(ip.src in $crowdsec_captcha)"
-    description = "CrowdSec: challenge flagged IPs"
-    enabled     = true
-  }
   # Pre-existing rule, imported and preserved verbatim (currently disabled).
   rules {
     action      = "skip"
@@ -279,10 +263,6 @@ resource "kubernetes_cron_job_v1" "crowdsec_cf_sync" {
                 name  = "CF_BAN_LIST_ID"
                 value = cloudflare_list.crowdsec_ban.id
               }
-              env {
-                name  = "CF_CAPTCHA_LIST_ID"
-                value = cloudflare_list.crowdsec_captcha.id
-              }
               env {
                 name  = "PUSHGATEWAY_URL"
                 value = "http://prometheus-prometheus-pushgateway.monitoring:9091"
diff --git a/stacks/rybbit/lapi_kv_sync.py b/stacks/rybbit/lapi_kv_sync.py
index 942912c9..8a0231d5 100644
--- a/stacks/rybbit/lapi_kv_sync.py
+++ b/stacks/rybbit/lapi_kv_sync.py
@@ -1,30 +1,33 @@
 #!/usr/bin/env python3
-"""Sync CrowdSec LAPI decisions -> two Cloudflare account IP Lists.
+"""Sync CrowdSec LAPI decisions -> ONE Cloudflare account IP List (block-only).
 
 Cloudflare-PROXIED hosts terminate at the CF edge, so the in-cluster CrowdSec
 bouncer (which keys on the client IP Traefik sees) never decides on them. We
 push the decisions into the edge instead: a zone-scoped WAF custom rule blocks
-`(ip.src in $crowdsec_ban)` and managed-challenges `(ip.src in $crowdsec_captcha)`
-across EVERY proxied host in the zone. This job is the control plane that keeps
-those two IP Lists in sync with LAPI.
+`(ip.src in $crowdsec_ban)` across EVERY proxied host in the zone. This job is
+the control plane that keeps that one IP List in sync with LAPI.
+
+The CF account is hard-limited to ONE Rules List, so enforcement is BLOCK-ONLY:
+BOTH ban AND captcha (scope=="ip") decisions are folded into the single
+crowdsec_ban list and captcha is downgraded to block at the proxied edge.
 
 (Filename kept as lapi_kv_sync.py for path/ConfigMap continuity with the prior
-Workers-KV design; it no longer touches KV — it reconciles CF Rules Lists.)
+Workers-KV design; it no longer touches KV — it reconciles a CF Rules List.)
 
 Design notes:
   * Pure Python stdlib (no pip/apk at runtime) — runs on stock python:3.12-alpine
     mounted from a ConfigMap, the alert_digest pattern.
-  * FULL RECONCILE each run: read the complete decision set from LAPI, partition
-    into ban / captcha desired sets, then for each list compute add (desired -
-    existing) and remove (existing - desired) and apply both. An IP listed for
-    BOTH ban and captcha is placed in BAN ONLY (ban wins; the WAF rule order
-    also blocks-before-challenges as belt-and-braces). A `cscli decisions
-    delete` therefore clears from the edge within one interval (<=2 min).
-  * FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (lists untouched,
-    exit 0). A LAPI outage thus freezes the edge state rather than wiping every
-    ban — degrade toward the last-known-good block set, never toward all-block
-    or a thundering un-ban. (Decisions linger only until the next successful
-    sync, not their TTL — we reconcile to LAPI truth, we don't expire entries.)
+  * FULL RECONCILE each run: read the complete decision set from LAPI, take the
+    UNION of ban + captcha (scope=="ip") as the single desired set, then compute
+    add (desired - existing) and remove (existing - desired) against the one
+    crowdsec_ban list and apply both. A `cscli decisions delete` therefore
+    clears from the edge within one interval (<=2 min).
+  * FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (list untouched,
+    exit 0). A LAPI outage thus freezes the edge state rather than wiping the
+    block list — degrade toward the last-known-good block set, never toward
+    all-block or a thundering un-ban. (Decisions linger only until the next
+    successful sync, not their TTL — we reconcile to LAPI truth, we don't
+    expire entries.)
   * FAIL-LOUD on Cloudflare: any CF API error is logged and the job exits
     non-zero so the failure is visible (CronJob backoff + missing success
     metric + the next run retries).
@@ -42,9 +45,9 @@ official API reference (developers.cloudflare.com, 2026):
   * GET    /accounts/{acct}/rules/lists/bulk_operations/{op_id} -> status in
            {pending,running,completed,failed} (failed carries `error`).
   ASYNC HANDLING: Cloudflare allows only ONE pending bulk operation per ACCOUNT.
-  So we must NOT fire add+delete (or both lists) concurrently — we serialize and
-  poll each operation_id to a terminal state (short bounded timeout) before the
-  next mutation. If a poll times out we stop mutating for this run and report
+  So we must NOT fire add+delete concurrently — we serialize and poll each
+  operation_id to a terminal state (short bounded timeout) before the next
+  mutation. If a poll times out we stop mutating for this run and report
   partial success (the next 2-min run reconciles the rest); we never abandon an
   in-flight op and immediately issue another (that would 409/reject).
 """
@@ -63,7 +66,6 @@ LAPI_KEY = os.environ["LAPI_KEY"]  # kvsync bouncer key, registered in LAPI
 CF_ACCOUNT_ID = os.environ["CF_ACCOUNT_ID"]
 CF_API_TOKEN = os.environ["CF_API_TOKEN"]  # scoped: Account Filter Lists Edit
 CF_BAN_LIST_ID = os.environ["CF_BAN_LIST_ID"]
-CF_CAPTCHA_LIST_ID = os.environ["CF_CAPTCHA_LIST_ID"]
 PUSHGATEWAY = os.environ.get("PUSHGATEWAY_URL", "").rstrip("/")  # optional
 
 CF_API = "https://api.cloudflare.com/client/v4"
@@ -115,17 +117,19 @@ def _cf(url, *, method="GET", payload=None, timeout=20):
 # LAPI
 # --------------------------------------------------------------------------- #
 def fetch_decisions():
-    """Return (ban_set, captcha_set) of IPs from LAPI.
+    """Return the single desired set of IPs to BLOCK at the edge.
 
-    Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). An
-    IP appearing in both ban and captcha is placed in BAN only. Raises on
-    transport/HTTP error so the caller can SKIP the run (fail-safe).
+    Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). The
+    CF account allows only ONE Rules List, so BOTH "ban" AND "captcha" decisions
+    are folded into one block set (captcha is downgraded to block at the proxied
+    edge). Raises on transport/HTTP error so the caller can SKIP the run
+    (fail-safe).
     """
     data = _req(
         f"{LAPI_URL}/v1/decisions",
         headers={"X-Api-Key": LAPI_KEY, "Accept": "application/json"},
     )
-    ban, captcha = set(), set()
+    block = set()
     for d in data or []:
         if (d.get("scope") or "").lower() != "ip":
             continue
@@ -133,13 +137,10 @@ def fetch_decisions():
         if not ip:
             continue
         dtype = (d.get("type") or "").lower()
-        if dtype == "ban":
-            ban.add(ip)
-        elif dtype == "captcha":
-            captcha.add(ip)
+        if dtype in ("ban", "captcha"):
+            block.add(ip)
         # other remediation types (e.g. throttle) are ignored
-    captcha -= ban  # ban wins: never list the same IP in both
-    return ban, captcha
+    return block
 
 
 # --------------------------------------------------------------------------- #
@@ -260,14 +261,12 @@ def reconcile(label, list_id, desired):
 # --------------------------------------------------------------------------- #
 # Metrics (best-effort)
 # --------------------------------------------------------------------------- #
-def push_metrics(ban_n, captcha_n, ok):
+def push_metrics(block_n, ok):
     if not PUSHGATEWAY:
         return
     payload = (
         "# TYPE crowdsec_cf_list_ban_count gauge\n"
-        f"crowdsec_cf_list_ban_count {ban_n}\n"
-        "# TYPE crowdsec_cf_list_captcha_count gauge\n"
-        f"crowdsec_cf_list_captcha_count {captcha_n}\n"
+        f"crowdsec_cf_list_ban_count {block_n}\n"
         "# TYPE crowdsec_cf_list_sync_success gauge\n"
         f"crowdsec_cf_list_sync_success {1 if ok else 0}\n"
         "# TYPE crowdsec_cf_list_sync_last_run_seconds gauge\n"
@@ -289,28 +288,27 @@ def push_metrics(ban_n, captcha_n, ok):
 def main():
     # 1. Desired state from LAPI. Any failure here = SKIP (fail-safe).
     try:
-        ban, captcha = fetch_decisions()
+        block = fetch_decisions()
     except Exception as e:
         print(
-            f"[skip] LAPI unreadable ({e}); leaving CF lists untouched "
+            f"[skip] LAPI unreadable ({e}); leaving the CF list untouched "
             f"(fail-safe: freeze last-known edge state).",
             file=sys.stderr,
         )
-        push_metrics(0, 0, ok=False)
+        push_metrics(0, ok=False)
         return 0
 
-    print(f"[info] LAPI desired: {len(ban)} ban / {len(captcha)} captcha (ip-scope)")
+    print(f"[info] LAPI desired: {len(block)} block (ban+captcha, ip-scope)")
 
-    # 2. Reconcile both lists. CF errors fail loud (non-zero exit).
+    # 2. Reconcile the single block list. CF errors fail loud (non-zero exit).
     try:
-        reconcile("ban", CF_BAN_LIST_ID, ban)
-        reconcile("captcha", CF_CAPTCHA_LIST_ID, captcha)
+        reconcile("block", CF_BAN_LIST_ID, block)
     except CFError as e:
         print(f"[error] Cloudflare API failure: {e}", file=sys.stderr)
-        push_metrics(len(ban), len(captcha), ok=False)
+        push_metrics(len(block), ok=False)
         return 1
 
-    push_metrics(len(ban), len(captcha), ok=True)
+    push_metrics(len(block), ok=True)
     return 0
 
 
-- 
2.49.1