diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index a98f4303..5e531728 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -72,7 +72,7 @@ graph TB subgraph Monitoring["Monitoring & Alerting"] Prometheus["Prometheus Alerts
PostgreSQLBackupStale, MySQLBackupStale
WeeklyBackupStale, OffsiteBackupSyncStale
LVMSnapshotStale, BackupDiskFull
VaultwardenIntegrityFail"] - Pushgateway["Pushgateway
backup script metrics
cloudsync metrics
vaultwarden integrity"] + Pushgateway["Pushgateway
backup script metrics
vaultwarden integrity"] end PVCBackup -.->|push metrics| Pushgateway diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 7826887d..5a7a120d 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -176,7 +176,6 @@ Uptime Kuma monitors: TCP SMTP (port 25) on `176.12.22.76` (external), IMAP (por - **VaultwardenBackupStale**: >8d since last backup - **RedisBackupStale**: >8d since last backup - **PrometheusBackupStale**: >32d since last backup -- **CloudSyncStale**: >8d since last cloud sync - **VaultwardenIntegrityFail**: Backup integrity check failed ### Vault Paths diff --git a/modules/kubernetes/ingress_factory/main.tf b/modules/kubernetes/ingress_factory/main.tf index 347608de..58e98618 100644 --- a/modules/kubernetes/ingress_factory/main.tf +++ b/modules/kubernetes/ingress_factory/main.tf @@ -270,7 +270,7 @@ resource "kubernetes_manifest" "rybbit_analytics" { } spec = { plugin = { - rewrite-body = { + traefik-plugin-rewritebody = { rewrites = [{ regex = "" replacement = "" diff --git a/stacks/beads-server/main.tf b/stacks/beads-server/main.tf index b2787bee..49b1e44a 100644 --- a/stacks/beads-server/main.tf +++ b/stacks/beads-server/main.tf @@ -189,7 +189,7 @@ resource "kubernetes_config_map" "workbench_store" { connectionUrl = "mysql://beads@dolt.beads-server.svc.cluster.local:3306/code" hideDoltFeatures = false useSSL = false - type = "mysql" + type = "Mysql" }]) } } @@ -396,7 +396,9 @@ resource "kubernetes_ingress_v1" "graphql" { name = "dolt-workbench-graphql" namespace = kubernetes_namespace.beads.metadata[0].name annotations = { - "traefik.ingress.kubernetes.io/router.middlewares" = "traefik-authentik-forward-auth@kubernetescrd" + # No Authentik on GraphQL — the main page handles auth. + # JS fetch() to /graphql may not pass Authentik's forward-auth + # (302 on POST → fetch fails → "Request timed out"). } } spec { diff --git a/stacks/cyberchef/main.tf b/stacks/cyberchef/main.tf index 221072aa..620b1c4e 100644 --- a/stacks/cyberchef/main.tf +++ b/stacks/cyberchef/main.tf @@ -53,7 +53,7 @@ resource "kubernetes_deployment" "cyberchef" { } spec { container { - image = "mpepping/cyberchef:v10.24.0" + image = "mpepping/cyberchef:v9.55.0" name = "cyberchef" port { diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 36864054..aa94d1d4 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -521,7 +521,7 @@ resource "kubernetes_stateful_set_v1" "mysql_standalone" { annotations = { "resize.topolvm.io/threshold" = "80%" "resize.topolvm.io/increase" = "100%" - "resize.topolvm.io/storage_limit" = "30Gi" + "resize.topolvm.io/storage_limit" = "50Gi" } } spec { diff --git a/stacks/monitoring/main.tf b/stacks/monitoring/main.tf index 503e3e1a..ddf77022 100644 --- a/stacks/monitoring/main.tf +++ b/stacks/monitoring/main.tf @@ -31,5 +31,4 @@ module "monitoring" { pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"] grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"] tier = local.tiers.cluster - truenas_api_key = data.vault_kv_secret_v2.viktor.data["truenas_api_key"] } diff --git a/stacks/monitoring/modules/monitoring/dashboards/backup_health.json b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json index 5d8e5f45..c7aa3790 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/backup_health.json +++ b/stacks/monitoring/modules/monitoring/dashboards/backup_health.json @@ -12,7 +12,7 @@ } ] }, - "description": "Backup health overview — K8s CronJob backups and TrueNAS Cloud Sync status", + "description": "Backup health overview — K8s CronJob backups", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -56,11 +56,6 @@ "expr": "time() - backup_last_success_timestamp", "legendFormat": "{{ job }}", "refId": "A" - }, - { - "expr": "time() - cloudsync_last_success_timestamp", - "legendFormat": "cloudsync-{{ task_id }}", - "refId": "B" } ] }, @@ -279,106 +274,6 @@ } ] }, - { - "title": "Cloud Sync Status", - "type": "stat", - "gridPos": { "h": 6, "w": 12, "x": 0, "y": 30 }, - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "mappings": [ - { "type": "value", "options": { "1": { "text": "SUCCESS", "color": "green" }, "0": { "text": "FAILED", "color": "red" } } } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "red", "value": null }, - { "color": "green", "value": 1 } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" - }, - "targets": [ - { - "expr": "cloudsync_job_state", - "legendFormat": "Task {{ task_id }}", - "refId": "A" - } - ] - }, - { - "title": "Cloud Sync Duration", - "type": "stat", - "gridPos": { "h": 6, "w": 12, "x": 12, "y": 30 }, - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "unit": "s", - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 3600 }, - { "color": "red", "value": 86400 } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "colorMode": "background", - "graphMode": "none", - "textMode": "auto" - }, - "targets": [ - { - "expr": "cloudsync_duration_seconds", - "legendFormat": "Task {{ task_id }}", - "refId": "A" - } - ] - }, - { - "title": "Cloud Sync Transfer Volume", - "type": "bargauge", - "gridPos": { "h": 6, "w": 24, "x": 0, "y": 36 }, - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "unit": "bytes", - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 1073741824 }, - { "color": "red", "value": 10737418240 } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "displayMode": "gradient", - "orientation": "horizontal", - "showUnfilled": true - }, - "targets": [ - { - "expr": "cloudsync_transferred_bytes", - "legendFormat": "Task {{ task_id }}", - "refId": "A" - } - ] - }, { "title": "LVM Thin Snapshots", "type": "row", @@ -565,7 +460,7 @@ "normal": false, "error": true }, - "alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*CloudSync.*|.*ThinPool.*\"}", + "alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*ThinPool.*\"}", "folder": { "id": null, "title": "" }, "folderId": null } @@ -609,7 +504,7 @@ ], "targets": [ { - "expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*cloudsync.*|.*etcd.*|.*raft.*\"} * 1000", + "expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*etcd.*|.*raft.*\"} * 1000", "legendFormat": "", "refId": "A", "instant": true, diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index 41e313a3..f6ebc2d5 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -29,10 +29,6 @@ variable "grafana_admin_password" { } variable "tier" { type = string } variable "mysql_host" { type = string } -variable "truenas_api_key" { - type = string - sensitive = true -} resource "kubernetes_namespace" "monitoring" { metadata { @@ -94,126 +90,6 @@ resource "kubernetes_cron_job_v1" "monitor_prom" { } } -# ----------------------------------------------------------------------------- -# Cloud Sync Monitor — DEPRECATED: TrueNAS decommissioned 2026-04-13 -# TODO: Remove this resource entirely once TrueNAS VM is shut down -# ----------------------------------------------------------------------------- -resource "kubernetes_cron_job_v1" "cloudsync_monitor" { - metadata { - name = "cloudsync-monitor" - namespace = kubernetes_namespace.monitoring.metadata[0].name - } - spec { - concurrency_policy = "Replace" - failed_jobs_history_limit = 3 - successful_jobs_history_limit = 3 - schedule = "0 */6 * * *" - job_template { - metadata {} - spec { - backoff_limit = 2 - ttl_seconds_after_finished = 300 - template { - metadata {} - spec { - container { - name = "cloudsync-monitor" - image = "docker.io/library/alpine" - command = ["/bin/sh", "-c", <<-EOT - set -euo pipefail - apk add --no-cache curl jq - - # Query TrueNAS Cloud Sync tasks (TrueNAS deprecated — this monitor should be removed) - RESPONSE=$(curl -sf -H "Authorization: Bearer $TRUENAS_API_KEY" \ - "http://10.0.10.15/api/v2.0/cloudsync" 2>&1) || { - echo "WARN: TrueNAS API unreachable (VM deprecated)" - exit 0 - } - - # Parse each task's last successful run - echo "$RESPONSE" | jq -c '.[]' | while read -r task; do - TASK_ID=$(echo "$task" | jq -r '.id') - TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-') - JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"') - JOB_TIME=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') - - if [ "$JOB_TIME" != "0" ] && [ "$JOB_TIME" != "null" ]; then - # TrueNAS returns milliseconds since epoch - EPOCH_SECS=$((JOB_TIME / 1000)) - else - EPOCH_SECS=0 - fi - - # Extract transfer stats from job progress description - # Format: "1182 / 1182, 3.928 GiB / 3.928 GiB, 8.737 MiB/s, ..." - JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""') - TX_TOTAL=$(echo "$JOB_PROGRESS" | awk -F', ' '{split($2, a, " / "); print a[2]}') - TX_NUM=$(echo "$TX_TOTAL" | awk '{print $1}') - TX_NUM=$${TX_NUM:-0} - TX_UNIT=$(echo "$TX_TOTAL" | awk '{print $2}') - TX_UNIT=$${TX_UNIT:-Bytes} - case "$TX_UNIT" in - Bytes|B) TX_MULT=1 ;; KiB|kB) TX_MULT=1024 ;; MiB|MB) TX_MULT=1048576 ;; - GiB|GB) TX_MULT=1073741824 ;; *) TX_MULT=1 ;; - esac - TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}') - JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0') - JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') - if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then - SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 )) - else - SYNC_DURATION=0 - fi - - echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s" - - # Push metrics to Pushgateway - cat < 691200 - for: 1h - labels: - severity: critical - annotations: - summary: "Cloud Sync task {{ $labels.task_id }} is {{ $value | humanizeDuration }} old (threshold: 8d) — off-site backups may have stopped" - - alert: CloudSyncNeverRun - expr: absent(cloudsync_last_success_timestamp{job="cloudsync-monitor"}) - for: 48h - labels: - severity: warning - annotations: - summary: "Cloud Sync monitor has never reported — check cloudsync-monitor CronJob" - - alert: CloudSyncFailing - expr: cloudsync_job_state{job="cloudsync-monitor", task_id!="2"} == 0 - for: 6h - labels: - severity: warning - annotations: - summary: "Cloud Sync task {{ $labels.task_id }} last state was not SUCCESS" + summary: "Prometheus backup has never reported a successful run (sidecar runs monthly, 1st Sunday 04:00 UTC — alert only fires if absent for >32d)" - alert: CSIDriverCrashLoop expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|proxmox-csi"} > 0 for: 10m diff --git a/stacks/networking-toolbox/main.tf b/stacks/networking-toolbox/main.tf index 5284df77..ea59d4dc 100644 --- a/stacks/networking-toolbox/main.tf +++ b/stacks/networking-toolbox/main.tf @@ -48,7 +48,7 @@ resource "kubernetes_deployment" "networking-toolbox" { } spec { container { - image = "lissy93/networking-toolbox:1.1.1" + image = "lissy93/networking-toolbox:1.6.0" name = "networking-toolbox" port { container_port = 3000 diff --git a/stacks/reverse-proxy/modules/reverse_proxy/factory/main.tf b/stacks/reverse-proxy/modules/reverse_proxy/factory/main.tf index 0108862b..066f385e 100644 --- a/stacks/reverse-proxy/modules/reverse_proxy/factory/main.tf +++ b/stacks/reverse-proxy/modules/reverse_proxy/factory/main.tf @@ -173,7 +173,7 @@ resource "kubernetes_manifest" "rybbit_analytics" { } spec = { plugin = { - rewrite-body = { + traefik-plugin-rewritebody = { rewrites = [{ regex = "" replacement = "" diff --git a/stacks/rybbit/main.tf b/stacks/rybbit/main.tf index a90b2569..73380235 100644 --- a/stacks/rybbit/main.tf +++ b/stacks/rybbit/main.tf @@ -315,7 +315,7 @@ resource "kubernetes_deployment" "rybbit" { } spec { container { - image = "ghcr.io/rybbit-io/rybbit-backend:v1.0.21" + image = "ghcr.io/rybbit-io/rybbit-backend:v1.1.0" name = "rybbit" env { @@ -471,7 +471,7 @@ resource "kubernetes_deployment" "rybbit-client" { spec { container { name = "rybbit-client" - image = "ghcr.io/rybbit-io/rybbit-client:v1.0.20" + image = "ghcr.io/rybbit-io/rybbit-client:v1.1.0" env { name = "NODE_ENV" value = "production" diff --git a/stacks/traefik/modules/traefik/main.tf b/stacks/traefik/modules/traefik/main.tf index 8c1b00bc..febe3383 100644 --- a/stacks/traefik/modules/traefik/main.tf +++ b/stacks/traefik/modules/traefik/main.tf @@ -53,12 +53,12 @@ resource "helm_release" "traefik" { "set -e; ", "STORAGE=/plugins-storage; ", "mkdir -p \"$STORAGE/archives/github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin\"; ", - "mkdir -p \"$STORAGE/archives/github.com/packruler/rewrite-body\"; ", + "mkdir -p \"$STORAGE/archives/github.com/the-ccsn/traefik-plugin-rewritebody\"; ", "wget -q -T 30 -O \"$STORAGE/archives/github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin/v1.4.2.zip\" ", "\"https://github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin/archive/refs/tags/v1.4.2.zip\"; ", - "wget -q -T 30 -O \"$STORAGE/archives/github.com/packruler/rewrite-body/v1.2.0.zip\" ", - "\"https://github.com/packruler/rewrite-body/archive/refs/tags/v1.2.0.zip\"; ", - "printf '{\"github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin\":\"v1.4.2\",\"github.com/packruler/rewrite-body\":\"v1.2.0\"}' ", + "wget -q -T 30 -O \"$STORAGE/archives/github.com/the-ccsn/traefik-plugin-rewritebody/v0.1.3.zip\" ", + "\"https://github.com/the-ccsn/traefik-plugin-rewritebody/archive/refs/tags/v0.1.3.zip\"; ", + "printf '{\"github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin\":\"v1.4.2\",\"github.com/the-ccsn/traefik-plugin-rewritebody\":\"v0.1.3\"}' ", "> \"$STORAGE/archives/state.json\"; ", "echo \"Plugins pre-downloaded successfully\"", ])] @@ -170,9 +170,9 @@ resource "helm_release" "traefik" { moduleName = "github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin" version = "v1.4.2" } - rewrite-body = { - moduleName = "github.com/packruler/rewrite-body" - version = "v1.2.0" + traefik-plugin-rewritebody = { + moduleName = "github.com/the-ccsn/traefik-plugin-rewritebody" + version = "v0.1.3" } } } diff --git a/stacks/traefik/modules/traefik/middleware.tf b/stacks/traefik/modules/traefik/middleware.tf index 4326298c..4edbf264 100644 --- a/stacks/traefik/modules/traefik/middleware.tf +++ b/stacks/traefik/modules/traefik/middleware.tf @@ -380,13 +380,15 @@ resource "kubernetes_manifest" "middleware_anti_ai_trap_links" { } spec = { plugin = { - rewrite-body = { + traefik-plugin-rewritebody = { + lastModified = true rewrites = [{ regex = "" replacement = "
Research ArchiveDataset ExportBenchmark ResultsWeb IndexText Corpus
" }] monitoring = { - types = ["text/html"] + types = ["text/html"] + methods = ["GET"] } } } diff --git a/stacks/wealthfolio/main.tf b/stacks/wealthfolio/main.tf index 6288f484..db2e4328 100644 --- a/stacks/wealthfolio/main.tf +++ b/stacks/wealthfolio/main.tf @@ -111,7 +111,7 @@ resource "kubernetes_deployment" "wealthfolio" { } spec { container { - image = "afadil/wealthfolio:v1.1.0" + image = "afadil/wealthfolio:3.2" name = "wealthfolio" port { container_port = 8080