[infra] Fix rewrite-body plugin + cleanup TrueNAS + version bumps

## Context

The rewrite-body Traefik plugin (packruler/rewrite-body v1.2.0) silently
broke on Traefik v3.6.12 — every service using rybbit analytics or anti-AI
injection returned HTTP 200 with an "Error 404: Not Found" body. Root cause:
the middleware specs referenced the plugin name `rewrite-body`, but Traefik
registered it as `traefik-plugin-rewritebody`.

Migrated to maintained fork `the-ccsn/traefik-plugin-rewritebody` v0.1.3
which uses the correct plugin name. Also added `lastModified = true` and
`methods = ["GET"]` to anti-AI middleware to avoid rewriting non-HTML
responses.

## This change

- Replace packruler/rewrite-body v1.2.0 with the-ccsn/traefik-plugin-rewritebody v0.1.3
- Fix plugin name in all 3 middleware locations (ingress_factory, reverse-proxy factory, traefik anti-AI)
- Remove deprecated TrueNAS cloud sync monitor (VM decommissioned 2026-04-13)
- Remove CloudSyncStale/CloudSyncFailing/CloudSyncNeverRun alerts
- Fix PrometheusBackupNeverRun alert (for: 48h → 32d to match monthly sidecar schedule)
- Bump versions: rybbit v1.0.21→v1.1.0, wealthfolio v1.1.0→v3.2,
  networking-toolbox 1.1.1→1.6.0, cyberchef v10.24.0→v9.55.0
- MySQL standalone storage_limit 30Gi → 50Gi
- beads-server: fix Dolt workbench type casing, remove Authentik from the GraphQL endpoint

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-17 05:51:52 +00:00
parent 8b206a63ad
commit f8facf44dd
16 changed files with 29 additions and 277 deletions

View file

@ -12,7 +12,7 @@
}
]
},
"description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status",
"description": "Backup health overview — K8s CronJob backups",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
@ -56,11 +56,6 @@
"expr": "time() - backup_last_success_timestamp",
"legendFormat": "{{ job }}",
"refId": "A"
},
{
"expr": "time() - cloudsync_last_success_timestamp",
"legendFormat": "cloudsync-{{ task_id }}",
"refId": "B"
}
]
},
@ -279,106 +274,6 @@
}
]
},
{
"title": "Cloud Sync Status",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "cloudsync_job_state",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Cloud Sync Duration",
"type": "stat",
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 3600 },
{ "color": "red", "value": 86400 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "cloudsync_duration_seconds",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "Cloud Sync Transfer Volume",
"type": "bargauge",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1073741824 },
{ "color": "red", "value": 10737418240 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"displayMode": "gradient",
"orientation": "horizontal",
"showUnfilled": true
},
"targets": [
{
"expr": "cloudsync_transferred_bytes",
"legendFormat": "Task {{ task_id }}",
"refId": "A"
}
]
},
{
"title": "LVM Thin Snapshots",
"type": "row",
@ -565,7 +460,7 @@
"normal": false,
"error": true
},
"alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*CloudSync.*|.*ThinPool.*\"}",
"alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*ThinPool.*\"}",
"folder": { "id": null, "title": "" },
"folderId": null
}
@ -609,7 +504,7 @@
],
"targets": [
{
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000",
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*etcd.*|.*raft.*\"} * 1000",
"legendFormat": "",
"refId": "A",
"instant": true,

View file

@ -29,10 +29,6 @@ variable "grafana_admin_password" {
}
variable "tier" { type = string }
variable "mysql_host" { type = string }
variable "truenas_api_key" {
type = string
sensitive = true
}
resource "kubernetes_namespace" "monitoring" {
metadata {
@ -94,126 +90,6 @@ resource "kubernetes_cron_job_v1" "monitor_prom" {
}
}
# -----------------------------------------------------------------------------
# Cloud Sync Monitor — DEPRECATED: TrueNAS decommissioned 2026-04-13
# TODO: Remove this resource entirely once TrueNAS VM is shut down
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
metadata {
name = "cloudsync-monitor"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 3
successful_jobs_history_limit = 3
schedule = "0 */6 * * *"
job_template {
metadata {}
spec {
backoff_limit = 2
ttl_seconds_after_finished = 300
template {
metadata {}
spec {
container {
name = "cloudsync-monitor"
image = "docker.io/library/alpine"
command = ["/bin/sh", "-c", <<-EOT
set -euo pipefail
apk add --no-cache curl jq
# Query TrueNAS Cloud Sync tasks (TrueNAS deprecated this monitor should be removed)
RESPONSE=$(curl -sf -H "Authorization: Bearer $TRUENAS_API_KEY" \
"http://10.0.10.15/api/v2.0/cloudsync" 2>&1) || {
echo "WARN: TrueNAS API unreachable (VM deprecated)"
exit 0
}
# Parse each task's last successful run
echo "$RESPONSE" | jq -c '.[]' | while read -r task; do
TASK_ID=$(echo "$task" | jq -r '.id')
TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-')
JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"')
JOB_TIME=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_TIME" != "0" ] && [ "$JOB_TIME" != "null" ]; then
# TrueNAS returns milliseconds since epoch
EPOCH_SECS=$((JOB_TIME / 1000))
else
EPOCH_SECS=0
fi
# Extract transfer stats from job progress description
# Format: "1182 / 1182, 3.928 GiB / 3.928 GiB, 8.737 MiB/s, ..."
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""')
TX_TOTAL=$(echo "$JOB_PROGRESS" | awk -F', ' '{split($2, a, " / "); print a[2]}')
TX_NUM=$(echo "$TX_TOTAL" | awk '{print $1}')
TX_NUM=$${TX_NUM:-0}
TX_UNIT=$(echo "$TX_TOTAL" | awk '{print $2}')
TX_UNIT=$${TX_UNIT:-Bytes}
case "$TX_UNIT" in
Bytes|B) TX_MULT=1 ;; KiB|kB) TX_MULT=1024 ;; MiB|MB) TX_MULT=1048576 ;;
GiB|GB) TX_MULT=1073741824 ;; *) TX_MULT=1 ;;
esac
TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}')
JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0')
JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then
SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 ))
else
SYNC_DURATION=0
fi
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s"
# Push metrics to Pushgateway
cat <<METRICS | curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor/task_id/$TASK_ID"
# HELP cloudsync_last_success_timestamp Last successful Cloud Sync completion (unix epoch)
# TYPE cloudsync_last_success_timestamp gauge
cloudsync_last_success_timestamp $EPOCH_SECS
# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)
# TYPE cloudsync_job_state gauge
cloudsync_job_state $([ "$JOB_STATE" = "SUCCESS" ] && echo 1 || echo 0)
# HELP cloudsync_duration_seconds Duration of the last Cloud Sync run
# TYPE cloudsync_duration_seconds gauge
cloudsync_duration_seconds $SYNC_DURATION
# HELP cloudsync_transferred_bytes Bytes transferred during Cloud Sync run
# TYPE cloudsync_transferred_bytes gauge
cloudsync_transferred_bytes $TRANSFERRED_BYTES
METRICS
done
echo "Cloud Sync monitor complete"
EOT
]
env {
name = "TRUENAS_API_KEY"
value = var.truenas_api_key
}
resources {
requests = {
memory = "32Mi"
cpu = "10m"
}
limits = {
memory = "64Mi"
}
}
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
}
}
# -----------------------------------------------------------------------------
# DNS Anomaly Monitor — query Technitium stats API, detect anomalies, push to Pushgateway
# Runs every 15 min. Checks for query spikes, high error rates, and suspicious patterns.

View file

@ -1213,32 +1213,11 @@ serverFiles:
summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 32d)"
- alert: PrometheusBackupNeverRun
expr: absent(prometheus_backup_last_success_timestamp{job="prometheus-backup"})
for: 48h
for: 32d
labels:
severity: warning
annotations:
summary: "Prometheus backup has never reported a successful run"
- alert: CloudSyncStale
expr: (time() - cloudsync_last_success_timestamp{job="cloudsync-monitor"}) > 691200
for: 1h
labels:
severity: critical
annotations:
summary: "Cloud Sync task {{ $labels.task_id }} is {{ $value | humanizeDuration }} old (threshold: 8d) — off-site backups may have stopped"
- alert: CloudSyncNeverRun
expr: absent(cloudsync_last_success_timestamp{job="cloudsync-monitor"})
for: 48h
labels:
severity: warning
annotations:
summary: "Cloud Sync monitor has never reported — check cloudsync-monitor CronJob"
- alert: CloudSyncFailing
expr: cloudsync_job_state{job="cloudsync-monitor", task_id!="2"} == 0
for: 6h
labels:
severity: warning
annotations:
summary: "Cloud Sync task {{ $labels.task_id }} last state was not SUCCESS"
summary: "Prometheus backup has never reported a successful run (sidecar runs monthly, 1st Sunday 04:00 UTC — alert only fires if absent for >32d)"
- alert: CSIDriverCrashLoop
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|proxmox-csi"} > 0
for: 10m