Merge pull request 'CrowdSec proxied: single CF list (block-only) + firewall-bouncer re-apply' (#5) from wizard/crowdsec-1list into master
Some checks failed
ci/woodpecker/push/default Pipeline failed
Some checks failed
ci/woodpecker/push/default Pipeline failed
This commit is contained in:
commit
834c5e6a2a
3 changed files with 69 additions and 90 deletions
|
|
@ -33,6 +33,7 @@
|
|||
# nodeSelector pins this to ONE node (k8s-node2, which runs a Traefik pod) for first validation.
|
||||
# !!! REMOVING THE nodeSelector ROLLS THIS DAEMONSET CLUSTER-WIDE !!!
|
||||
# Do that ONLY after the one-node validation checklist passes (see commit/PR).
|
||||
# Validating on k8s-node2 (single node) before removing the nodeSelector to roll cluster-wide.
|
||||
|
||||
locals {
|
||||
# Pin a specific stable release. Bump deliberately (re-validate on one node first).
|
||||
|
|
|
|||
|
|
@ -4,28 +4,29 @@
|
|||
# Proxied hosts terminate at the Cloudflare edge, so the in-cluster CrowdSec
|
||||
# bouncer (which keys on the real client IP seen by Traefik) never gets to
|
||||
# decide on them. To enforce CrowdSec bans/captchas on proxied traffic we push
|
||||
# the decision INTO the Cloudflare edge as account-level IP Lists + a single
|
||||
# the decision INTO the Cloudflare edge as a SINGLE account-level IP List + one
|
||||
# zone-scoped WAF custom rule:
|
||||
#
|
||||
# * Two account IP Lists — `crowdsec_ban` and `crowdsec_captcha` — hold the
|
||||
# banned / captcha'd source IPs (empty in TF; populated at runtime).
|
||||
# * ONE account IP List — `crowdsec_ban` — holds BOTH the banned AND captcha'd
|
||||
# source IPs (empty in TF; populated at runtime). The CF account is hard-limited
|
||||
# to ONE Rules List, so captcha decisions are downgraded to block at the
|
||||
# edge and folded into this same list (block-only enforcement).
|
||||
# * A zone-scoped WAF ruleset in the http_request_firewall_custom phase
|
||||
# blocks `(ip.src in $crowdsec_ban)` and managed-challenges
|
||||
# `(ip.src in $crowdsec_captcha)`. Because it's a ZONE rule it enforces
|
||||
# blocks `(ip.src in $crowdsec_ban)`. Because it's a ZONE rule it enforces
|
||||
# across ALL proxied hosts in the zone (~135), not just the handful a
|
||||
# Worker would route. (The previous Worker+KV design only covered the ~27
|
||||
# hosts the rybbit Worker routed; the analytics Worker in worker/ is
|
||||
# unrelated and stays.)
|
||||
#
|
||||
# This file is the CONTROL PLANE that keeps those lists in sync with LAPI:
|
||||
# 1. the two empty IP Lists (list ITEMS are owned by the CronJob at runtime,
|
||||
# This file is the CONTROL PLANE that keeps that list in sync with LAPI:
|
||||
# 1. the single empty IP List (list ITEMS are owned by the CronJob at runtime,
|
||||
# NOT by Terraform — see the lifecycle ignore_changes on `item`),
|
||||
# 2. a LEAST-PRIVILEGE Cloudflare API token (account Filter-Lists edit only,
|
||||
# scoped to this account) the sync job authenticates with,
|
||||
# 3. a CronJob running lapi_kv_sync.py every 2 min to full-reconcile LAPI
|
||||
# decisions into the two lists (mirrors monitoring/alert_digest.tf: stock
|
||||
# python:3.12-alpine + pure-stdlib script from a ConfigMap, no pip/apk at
|
||||
# runtime).
|
||||
# decisions (ban + captcha) into the one list (mirrors
|
||||
# monitoring/alert_digest.tf: stock python:3.12-alpine + pure-stdlib script
|
||||
# from a ConfigMap, no pip/apk at runtime).
|
||||
#
|
||||
# Cloudflare provider is pinned v4.52.7 (~> 4) — v4 schema is used throughout
|
||||
# (v5 differs greatly: policy is a block here not a `policies = [...]` list;
|
||||
|
|
@ -44,7 +45,7 @@ locals {
|
|||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# IP Lists — empty shells. The CronJob owns the items at runtime via the CF
|
||||
# IP List — empty shell. The CronJob owns the items at runtime via the CF
|
||||
# Rules-Lists API; TF must NOT manage items or every 2-min sync would fight the
|
||||
# next `terragrunt apply` (apply would try to delete the runtime items).
|
||||
#
|
||||
|
|
@ -54,7 +55,7 @@ locals {
|
|||
# comment=... }`. We declare NO `item` blocks (empty list) and
|
||||
# ignore_changes=[item] so runtime items don't show as drift.
|
||||
# NOTE: list `name` must match /^[a-zA-Z0-9_]+$/ (underscores ok, no dashes)
|
||||
# — hence crowdsec_ban / crowdsec_captcha (underscore, not dash).
|
||||
# — hence crowdsec_ban (underscore, not dash).
|
||||
# -----------------------------------------------------------------------------
|
||||
resource "cloudflare_list" "crowdsec_ban" {
|
||||
account_id = local.cf_account_id
|
||||
|
|
@ -69,29 +70,17 @@ resource "cloudflare_list" "crowdsec_ban" {
|
|||
}
|
||||
}
|
||||
|
||||
resource "cloudflare_list" "crowdsec_captcha" {
|
||||
account_id = local.cf_account_id
|
||||
name = "crowdsec_captcha"
|
||||
kind = "ip"
|
||||
description = "CrowdSec captcha decisions (synced from LAPI)"
|
||||
|
||||
lifecycle {
|
||||
ignore_changes = [item]
|
||||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, two
|
||||
# rules, applied to EVERY proxied host in the zone.
|
||||
# Zone-scoped WAF custom ruleset — the actual enforcement. One ruleset, one
|
||||
# block rule, applied to EVERY proxied host in the zone.
|
||||
#
|
||||
# ### VERIFY (v4.52.7): cloudflare_ruleset with zone_id + kind="zone" +
|
||||
# phase="http_request_firewall_custom"; `rules` is a repeatable block with
|
||||
# action/expression/description/enabled. actions "block" and
|
||||
# "managed_challenge" are both valid. List references in WAF expressions use
|
||||
# the list NAME with a `$` prefix (NOT the list id): ($crowdsec_ban).
|
||||
# Rule order matters — ban (block) is evaluated before captcha so a
|
||||
# double-listed IP is blocked outright (the sync script also enforces
|
||||
# ban-wins, so an IP is never in both lists, but order is belt-and-braces).
|
||||
# action/expression/description/enabled. action "block" is valid. List
|
||||
# references in WAF expressions use the list NAME with a `$` prefix (NOT the
|
||||
# list id): ($crowdsec_ban). Both ban and captcha decisions land in this one
|
||||
# list (the CF account allows only one Rules List), so a single block rule
|
||||
# covers everything — captcha is enforced as block at the edge.
|
||||
#
|
||||
# zone_id is the viktorbarzin.me zone — the single zone id used repo-wide
|
||||
# (default of var.cloudflare_zone_id in modules/kubernetes/ingress_factory and
|
||||
|
|
@ -116,24 +105,19 @@ resource "cloudflare_ruleset" "crowdsec" {
|
|||
kind = "zone"
|
||||
phase = "http_request_firewall_custom"
|
||||
|
||||
# The WAF rules reference the IP lists by name ($crowdsec_ban / $crowdsec_captcha),
|
||||
# so the lists must exist before this ruleset is created/updated.
|
||||
depends_on = [cloudflare_list.crowdsec_ban, cloudflare_list.crowdsec_captcha]
|
||||
# The WAF rule references the IP list by name ($crowdsec_ban), so the list
|
||||
# must exist before this ruleset is created/updated.
|
||||
depends_on = [cloudflare_list.crowdsec_ban]
|
||||
|
||||
# CrowdSec ban — evaluated FIRST so a banned IP is blocked before anything else.
|
||||
# CrowdSec ban — block every IP in the single edge list. The sync writes BOTH
|
||||
# ban and captcha decisions into crowdsec_ban (captcha downgraded to block at
|
||||
# the edge) because the CF account allows only ONE Rules List.
|
||||
rules {
|
||||
action = "block"
|
||||
expression = "(ip.src in $crowdsec_ban)"
|
||||
description = "CrowdSec: block banned IPs"
|
||||
enabled = true
|
||||
}
|
||||
# CrowdSec captcha — managed challenge for flagged IPs.
|
||||
rules {
|
||||
action = "managed_challenge"
|
||||
expression = "(ip.src in $crowdsec_captcha)"
|
||||
description = "CrowdSec: challenge flagged IPs"
|
||||
enabled = true
|
||||
}
|
||||
# Pre-existing rule, imported and preserved verbatim (currently disabled).
|
||||
rules {
|
||||
action = "skip"
|
||||
|
|
@ -279,10 +263,6 @@ resource "kubernetes_cron_job_v1" "crowdsec_cf_sync" {
|
|||
name = "CF_BAN_LIST_ID"
|
||||
value = cloudflare_list.crowdsec_ban.id
|
||||
}
|
||||
env {
|
||||
name = "CF_CAPTCHA_LIST_ID"
|
||||
value = cloudflare_list.crowdsec_captcha.id
|
||||
}
|
||||
env {
|
||||
name = "PUSHGATEWAY_URL"
|
||||
value = "http://prometheus-prometheus-pushgateway.monitoring:9091"
|
||||
|
|
|
|||
|
|
@ -1,30 +1,33 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Sync CrowdSec LAPI decisions -> two Cloudflare account IP Lists.
|
||||
"""Sync CrowdSec LAPI decisions -> ONE Cloudflare account IP List (block-only).
|
||||
|
||||
Cloudflare-PROXIED hosts terminate at the CF edge, so the in-cluster CrowdSec
|
||||
bouncer (which keys on the client IP Traefik sees) never decides on them. We
|
||||
push the decisions into the edge instead: a zone-scoped WAF custom rule blocks
|
||||
`(ip.src in $crowdsec_ban)` and managed-challenges `(ip.src in $crowdsec_captcha)`
|
||||
across EVERY proxied host in the zone. This job is the control plane that keeps
|
||||
those two IP Lists in sync with LAPI.
|
||||
`(ip.src in $crowdsec_ban)` across EVERY proxied host in the zone. This job is
|
||||
the control plane that keeps that one IP List in sync with LAPI.
|
||||
|
||||
The CF account is hard-limited to ONE Rules List, so enforcement is BLOCK-ONLY:
|
||||
BOTH ban AND captcha (scope=="ip") decisions are folded into the single
|
||||
crowdsec_ban list and captcha is downgraded to block at the proxied edge.
|
||||
|
||||
(Filename kept as lapi_kv_sync.py for path/ConfigMap continuity with the prior
|
||||
Workers-KV design; it no longer touches KV — it reconciles CF Rules Lists.)
|
||||
Workers-KV design; it no longer touches KV — it reconciles a CF Rules List.)
|
||||
|
||||
Design notes:
|
||||
* Pure Python stdlib (no pip/apk at runtime) — runs on stock python:3.12-alpine
|
||||
mounted from a ConfigMap, the alert_digest pattern.
|
||||
* FULL RECONCILE each run: read the complete decision set from LAPI, partition
|
||||
into ban / captcha desired sets, then for each list compute add (desired -
|
||||
existing) and remove (existing - desired) and apply both. An IP listed for
|
||||
BOTH ban and captcha is placed in BAN ONLY (ban wins; the WAF rule order
|
||||
also blocks-before-challenges as belt-and-braces). A `cscli decisions
|
||||
delete` therefore clears from the edge within one interval (<=2 min).
|
||||
* FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (lists untouched,
|
||||
exit 0). A LAPI outage thus freezes the edge state rather than wiping every
|
||||
ban — degrade toward the last-known-good block set, never toward all-block
|
||||
or a thundering un-ban. (Decisions linger only until the next successful
|
||||
sync, not their TTL — we reconcile to LAPI truth, we don't expire entries.)
|
||||
* FULL RECONCILE each run: read the complete decision set from LAPI, take the
|
||||
UNION of ban + captcha (scope=="ip") as the single desired set, then compute
|
||||
add (desired - existing) and remove (existing - desired) against the one
|
||||
crowdsec_ban list and apply both. A `cscli decisions delete` therefore
|
||||
clears from the edge within one interval (<=2 min).
|
||||
* FAIL-SAFE on LAPI: if LAPI can't be read we SKIP the run (list untouched,
|
||||
exit 0). A LAPI outage thus freezes the edge state rather than wiping the
|
||||
block list — degrade toward the last-known-good block set, never toward
|
||||
all-block or a thundering un-ban. (During a LAPI outage, entries may outlive
|
||||
their LAPI TTL until the next successful sync — we reconcile to LAPI truth,
|
||||
we don't expire entries ourselves.)
|
||||
* FAIL-LOUD on Cloudflare: any CF API error is logged and the job exits
|
||||
non-zero so the failure is visible (CronJob backoff + missing success
|
||||
metric + the next run retries).
|
||||
|
|
@ -42,9 +45,9 @@ official API reference (developers.cloudflare.com, 2026):
|
|||
* GET /accounts/{acct}/rules/lists/bulk_operations/{op_id} -> status in
|
||||
{pending,running,completed,failed} (failed carries `error`).
|
||||
ASYNC HANDLING: Cloudflare allows only ONE pending bulk operation per ACCOUNT.
|
||||
So we must NOT fire add+delete (or both lists) concurrently — we serialize and
|
||||
poll each operation_id to a terminal state (short bounded timeout) before the
|
||||
next mutation. If a poll times out we stop mutating for this run and report
|
||||
So we must NOT fire add+delete concurrently — we serialize and poll each
|
||||
operation_id to a terminal state (short bounded timeout) before the next
|
||||
mutation. If a poll times out we stop mutating for this run and report
|
||||
partial success (the next 2-min run reconciles the rest); we never abandon an
|
||||
in-flight op and immediately issue another (that would 409/reject).
|
||||
"""
|
||||
|
|
@ -63,7 +66,6 @@ LAPI_KEY = os.environ["LAPI_KEY"] # kvsync bouncer key, registered in LAPI
|
|||
CF_ACCOUNT_ID = os.environ["CF_ACCOUNT_ID"]
|
||||
CF_API_TOKEN = os.environ["CF_API_TOKEN"] # scoped: Account Filter Lists Edit
|
||||
CF_BAN_LIST_ID = os.environ["CF_BAN_LIST_ID"]
|
||||
CF_CAPTCHA_LIST_ID = os.environ["CF_CAPTCHA_LIST_ID"]
|
||||
PUSHGATEWAY = os.environ.get("PUSHGATEWAY_URL", "").rstrip("/") # optional
|
||||
|
||||
CF_API = "https://api.cloudflare.com/client/v4"
|
||||
|
|
@ -115,17 +117,19 @@ def _cf(url, *, method="GET", payload=None, timeout=20):
|
|||
# LAPI
|
||||
# --------------------------------------------------------------------------- #
|
||||
def fetch_decisions():
|
||||
"""Return (ban_set, captcha_set) of IPs from LAPI.
|
||||
"""Return the single desired set of IPs to BLOCK at the edge.
|
||||
|
||||
Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). An
|
||||
IP appearing in both ban and captcha is placed in BAN only. Raises on
|
||||
transport/HTTP error so the caller can SKIP the run (fail-safe).
|
||||
Only scope=="ip" decisions are projected (the WAF rule keys on ip.src). The
|
||||
CF account allows only ONE Rules List, so BOTH "ban" AND "captcha" decisions
|
||||
are folded into one block set (captcha is downgraded to block at the proxied
|
||||
edge). Raises on transport/HTTP error so the caller can SKIP the run
|
||||
(fail-safe).
|
||||
"""
|
||||
data = _req(
|
||||
f"{LAPI_URL}/v1/decisions",
|
||||
headers={"X-Api-Key": LAPI_KEY, "Accept": "application/json"},
|
||||
)
|
||||
ban, captcha = set(), set()
|
||||
block = set()
|
||||
for d in data or []:
|
||||
if (d.get("scope") or "").lower() != "ip":
|
||||
continue
|
||||
|
|
@ -133,13 +137,10 @@ def fetch_decisions():
|
|||
if not ip:
|
||||
continue
|
||||
dtype = (d.get("type") or "").lower()
|
||||
if dtype == "ban":
|
||||
ban.add(ip)
|
||||
elif dtype == "captcha":
|
||||
captcha.add(ip)
|
||||
if dtype in ("ban", "captcha"):
|
||||
block.add(ip)
|
||||
# other remediation types (e.g. throttle) are ignored
|
||||
captcha -= ban # ban wins: never list the same IP in both
|
||||
return ban, captcha
|
||||
return block
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
|
@ -260,14 +261,12 @@ def reconcile(label, list_id, desired):
|
|||
# --------------------------------------------------------------------------- #
|
||||
# Metrics (best-effort)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def push_metrics(ban_n, captcha_n, ok):
|
||||
def push_metrics(block_n, ok):
|
||||
if not PUSHGATEWAY:
|
||||
return
|
||||
payload = (
|
||||
"# TYPE crowdsec_cf_list_ban_count gauge\n"
|
||||
f"crowdsec_cf_list_ban_count {ban_n}\n"
|
||||
"# TYPE crowdsec_cf_list_captcha_count gauge\n"
|
||||
f"crowdsec_cf_list_captcha_count {captcha_n}\n"
|
||||
f"crowdsec_cf_list_ban_count {block_n}\n"
|
||||
"# TYPE crowdsec_cf_list_sync_success gauge\n"
|
||||
f"crowdsec_cf_list_sync_success {1 if ok else 0}\n"
|
||||
"# TYPE crowdsec_cf_list_sync_last_run_seconds gauge\n"
|
||||
|
|
@ -289,28 +288,27 @@ def push_metrics(ban_n, captcha_n, ok):
|
|||
def main():
|
||||
# 1. Desired state from LAPI. Any failure here = SKIP (fail-safe).
|
||||
try:
|
||||
ban, captcha = fetch_decisions()
|
||||
block = fetch_decisions()
|
||||
except Exception as e:
|
||||
print(
|
||||
f"[skip] LAPI unreadable ({e}); leaving CF lists untouched "
|
||||
f"[skip] LAPI unreadable ({e}); leaving the CF list untouched "
|
||||
f"(fail-safe: freeze last-known edge state).",
|
||||
file=sys.stderr,
|
||||
)
|
||||
push_metrics(0, 0, ok=False)
|
||||
push_metrics(0, ok=False)
|
||||
return 0
|
||||
|
||||
print(f"[info] LAPI desired: {len(ban)} ban / {len(captcha)} captcha (ip-scope)")
|
||||
print(f"[info] LAPI desired: {len(block)} block (ban+captcha, ip-scope)")
|
||||
|
||||
# 2. Reconcile both lists. CF errors fail loud (non-zero exit).
|
||||
# 2. Reconcile the single block list. CF errors fail loud (non-zero exit).
|
||||
try:
|
||||
reconcile("ban", CF_BAN_LIST_ID, ban)
|
||||
reconcile("captcha", CF_CAPTCHA_LIST_ID, captcha)
|
||||
reconcile("block", CF_BAN_LIST_ID, block)
|
||||
except CFError as e:
|
||||
print(f"[error] Cloudflare API failure: {e}", file=sys.stderr)
|
||||
push_metrics(len(ban), len(captcha), ok=False)
|
||||
push_metrics(len(block), ok=False)
|
||||
return 1
|
||||
|
||||
push_metrics(len(ban), len(captcha), ok=True)
|
||||
push_metrics(len(block), ok=True)
|
||||
return 0
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue