From a21d4a442486df15d5f7001cfb2e25985101c86c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 23:24:25 +0000 Subject: [PATCH 1/3] =?UTF-8?q?[owntracks]=20Fix=20Service=20port=20scheme?= =?UTF-8?q?=20(https=E2=86=92http),=20unbreak=20phone=20POSTs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context iOS Owntracks app has been unable to upload for months — phone buffer now holds ~1200 pending points. Last successful `.rec` write was 2026-01-02T14:32:00Z, matching when the failures started. ### The 500 — verified in Traefik access log ``` 152.37.101.156 - viktor "POST /pub HTTP/1.1" 500 21 "-" "-" 47900 "owntracks-owntracks-owntracks-viktorbarzin-me@kubernetes" "https://10.10.107.194:8083" 84ms ``` Basic-auth + middleware chain (rate-limit, csp, crowdsec) all pass. Traefik then opens backend connection to `https://10.10.107.194:8083`. The Recorder pod listens **plain HTTP** on :8083 (`OTR_PORT=0` disables HTTPS in ot-recorder), so the TLS handshake never completes → 500. ### Root cause — Service port spec `kubernetes_service.owntracks` declared the port as: ``` name: https port: 443 targetPort: 8083 ``` Traefik's IngressClass scheme inference: if the Service port is named `https` OR numbered `443`, Traefik speaks HTTPS to that backend. Both were true here, pointing at a plain-HTTP socket. The name/number were purely cosmetic — a leftover from mirroring the external `:443` edge — and worked only while Traefik's default happened to be HTTP. A Traefik upgrade (or middleware-chain change) tightened inference and surfaced the mismatch. ## This change Rename port to `name=http, port=80` and update the matching Ingress backend `port.number` from 443 to 80. `targetPort` stays at 8083. 
``` Phone -----> CF tunnel -----> Traefik (:443, TLS) -----> Service \ :80 (http) \ | \ v ---------------> Pod :8083 (plain HTTP hop) (HTTP listener) ``` Deployment container port label also renamed `https` → `http` for consistency (no functional effect — just readability). ## What is NOT in this change - **Not** switching the Recorder pod to HTTPS natively. That would require mounting a cert + rotation plumbing. External TLS is already terminated at Cloudflare/Traefik; in-cluster hop to the pod is plain-HTTP by design. - **Not** enabling `OTR_HTTPHOOK` to bridge Recorder → Dawarich (follow-up: code-z9b). - **Not** backfilling historical `.rec` files into Dawarich (follow-up: code-h2r). - Incidental: `providers.tf` + `.terraform.lock.hcl` refreshed by `terraform init -upgrade` to pick up the goauthentik provider that the ingress_factory module recently started requiring. ## Test Plan ### Automated ``` $ ../../scripts/tg plan Plan: 0 to add, 3 to change, 0 to destroy. $ ../../scripts/tg apply --non-interactive Apply complete! Resources: 0 added, 3 changed, 0 destroyed. $ kubectl -n owntracks get svc owntracks -o=jsonpath='{.spec.ports[0]}' {"name":"http","port":80,"protocol":"TCP","targetPort":8083} $ kubectl -n owntracks get ingress owntracks -o=jsonpath='{.spec.rules[0].http.paths[0].backend}' {"service":{"name":"owntracks","port":{"number":80}}} ``` ### Manual Verification In-cluster auth'd POST through the full ingress chain: ``` VIKTOR_PW=$(vault kv get -field=credentials secret/owntracks | jq -r .viktor) kubectl -n owntracks run curltest --rm -i --image=curlimages/curl --restart=Never -- \ curl -s -o /dev/null -w "HTTP %{http_code}\n" -X POST -u "viktor:$VIKTOR_PW" \ -H "Content-Type: application/json" \ -d '{"_type":"location","lat":0,"lon":0,"tst":1000000000,"tid":"vb"}' \ https://owntracks.viktorbarzin.me/pub # HTTP 200 ``` (previously: HTTP 500 on identical request) ### Reproduce locally 1. `vault login -method=oidc` 2. 
`cd infra/stacks/owntracks && ../../scripts/tg plan` 3. Expected: `Plan: 0 to add, 3 to change, 0 to destroy.` (or empty if already applied) 4. Watch next iOS Owntracks POST → Traefik access log should show `200`, not `500`. Closes: code-nqd Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/owntracks/.terraform.lock.hcl | 8 ++++++++ stacks/owntracks/main.tf | 11 +++++++---- stacks/owntracks/providers.tf | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/stacks/owntracks/.terraform.lock.hcl b/stacks/owntracks/.terraform.lock.hcl index a1ca7484..fabbc047 100644 --- a/stacks/owntracks/.terraform.lock.hcl +++ b/stacks/owntracks/.terraform.lock.hcl @@ -24,6 +24,14 @@ provider "registry.terraform.io/cloudflare/cloudflare" { ] } +provider "registry.terraform.io/goauthentik/authentik" { + version = "2024.12.1" + constraints = "~> 2024.10" + hashes = [ + "h1:roBMd+gi+TGgikH/bMzEI8JfvJiMAQWt+8FmokCrQIs=", + ] +} + provider "registry.terraform.io/hashicorp/helm" { version = "3.1.1" hashes = [ diff --git a/stacks/owntracks/main.tf b/stacks/owntracks/main.tf index 5af77559..40106565 100644 --- a/stacks/owntracks/main.tf +++ b/stacks/owntracks/main.tf @@ -146,7 +146,7 @@ resource "kubernetes_deployment" "owntracks" { image = "owntracks/recorder:1.0.1" name = "owntracks" port { - name = "https" + name = "http" container_port = 8083 } env { @@ -202,8 +202,11 @@ resource "kubernetes_service" "owntracks" { app = "owntracks" } port { - name = "https" - port = 443 + # Recorder listens plain HTTP on 8083 (OTR_PORT=0 disables HTTPS). + # Port name/number drive Traefik's backend-scheme inference — must be + # http/80 so it doesn't try TLS against a plain socket (previous 500s). 
+ name = "http" + port = 80 target_port = 8083 protocol = "TCP" } @@ -216,7 +219,7 @@ module "ingress" { namespace = kubernetes_namespace.owntracks.metadata[0].name name = "owntracks" tls_secret_name = var.tls_secret_name - port = 443 + port = 80 extra_annotations = { "traefik.ingress.kubernetes.io/router.middlewares" = "owntracks-basic-auth@kubernetescrd,traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd" "gethomepage.dev/enabled" = "true" diff --git a/stacks/owntracks/providers.tf b/stacks/owntracks/providers.tf index b337a2e9..012af700 100644 --- a/stacks/owntracks/providers.tf +++ b/stacks/owntracks/providers.tf @@ -9,6 +9,10 @@ terraform { source = "cloudflare/cloudflare" version = "~> 4" } + authentik = { + source = "goauthentik/authentik" + version = "~> 2024.10" + } } } From 1698cd1ce1d7eeb4cc0c426c7aefe98e5bc3ffab Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 23:26:08 +0000 Subject: [PATCH 2/3] [mailserver] Add daily backup CronJob for mailserver PVC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The mailserver stack holds everything valuable and hard to recreate: 243M of maildirs, dovecot/rspamd state, and the DKIM private key that signs outbound mail. Today the only defense is the LVM thin-pool snapshots on the PVE host (7-day retention, storage-class scope only) — there is no app-level backup. Infra/.claude/CLAUDE.md mandates that every proxmox-lvm(-encrypted) app ship a NFS-backed backup CronJob, and the mailserver stack was the only one still out of compliance. Loss of mailserver-data-encrypted without backups = total loss of all stored mail plus a DKIM key rotation (which requires a DNS update and breaks signature verification on every message in transit for the TTL window). Unacceptable for a service people actually use. 
Trade-offs considered: - mysqldump-style single-file dump vs rsync snapshot — maildirs are millions of small files, not a DB export. rsync --link-dest gives incremental weekly snapshots for ~10% of the cost of a full copy. - RWO PVC read-only mount — the underlying PVC is ReadWriteOnce, so the backup Job has to co-locate with the mailserver pod. vaultwarden solves this with pod_affinity; mirrored here. - Image choice — alpine + apk add rsync matches vaultwarden's pattern and keeps the container image small. ## This change Adds `kubernetes_cron_job_v1.mailserver-backup` + NFS PV/PVC to the mailserver module. Runs daily at 03:00 (avoids the 00:30 mysql-backup and 00:45 per-db windows, and the */20 email-roundtrip cadence). The job rsyncs /var/mail, /var/mail-state, /var/log/mail into /srv/nfs/mailserver-backup/YYYY-WW/ (year and week number, e.g. 2026-15) with --link-dest against the previous week for space-efficient incrementals. 8-week retention. Data layout (flowed through from the deployment's subPath mounts so the rsync tree matches the mailserver's own on-disk layout): PVC mailserver-data-encrypted (RWO, 2Gi) ├─ data/ (subPath) → pod's /var/mail → backup/YYYY-WW/data/ ├─ state/ (subPath) → pod's /var/mail-state → backup/YYYY-WW/state/ └─ log/ (subPath) → pod's /var/log/mail → backup/YYYY-WW/log/ Safety: - PVC mounted read-only (volume.persistent_volume_claim.read_only AND all three volume_mounts set read_only=true) so a backup-script bug cannot corrupt maildirs. - pod_affinity on app=mailserver + topology_key=hostname forces the Job pod onto the same node holding the RWO PVC attachment. - set -euxo pipefail + per-directory existence guard so a missing subPath is logged as "SKIP missing ..." and skipped instead of silently no-op'ing. Metrics pushed to Pushgateway match the mysql-backup/vaultwarden-backup convention (job="mailserver-backup"): backup_duration_seconds, backup_read_bytes, backup_written_bytes, backup_output_bytes, backup_last_success_timestamp.
Alert rules added in monitoring stack, mirroring Mysql/Vaultwarden: - MailserverBackupStale — 36h threshold, critical, 30m for: - MailserverBackupNeverSucceeded — critical, 1h for: ## Reproduce locally 1. cd infra/stacks/mailserver && ../../scripts/tg plan Expected: 3 to add (cronjob + NFS PV + PVC), unrelated drift on deployment/service is pre-existing. 2. ../../scripts/tg apply --non-interactive \ -target=module.mailserver.module.nfs_mailserver_backup_host \ -target=module.mailserver.kubernetes_cron_job_v1.mailserver-backup 3. cd ../monitoring && ../../scripts/tg apply --non-interactive 4. kubectl create job --from=cronjob/mailserver-backup \ mailserver-backup-test -n mailserver 5. kubectl wait --for=condition=complete --timeout=300s \ job/mailserver-backup-test -n mailserver 6. Expected: test pod co-locates with mailserver on same node (k8s-node2 today), rsync writes ~950M to /srv/nfs/mailserver-backup//, Pushgateway exposes backup_output_bytes{job="mailserver-backup"}. ## Test Plan ### Automated $ kubectl get cronjob -n mailserver mailserver-backup NAME SCHEDULE TIMEZONE SUSPEND ACTIVE LAST SCHEDULE AGE mailserver-backup 0 3 * * * False 0 3s $ kubectl create job --from=cronjob/mailserver-backup \ mailserver-backup-test -n mailserver job.batch/mailserver-backup-test created $ kubectl wait --for=condition=complete --timeout=300s \ job/mailserver-backup-test -n mailserver job.batch/mailserver-backup-test condition met $ kubectl logs -n mailserver job/mailserver-backup-test | tail -5 === Backup IO Stats === duration: 80s read: 1120 MiB written: 1186 MiB output: 947.0M $ kubectl run nfs-verify --rm --image=alpine --restart=Never \ --overrides='{...nfs mount /srv/nfs...}' \ -n mailserver --attach -- ls -la /nfs/mailserver-backup/ 947.0M /nfs/mailserver-backup/2026-15 $ curl http://prometheus-prometheus-pushgateway.monitoring:9091/metrics \ | grep mailserver-backup backup_duration_seconds{instance="",job="mailserver-backup"} 80 
backup_last_success_timestamp{instance="",job="mailserver-backup"} 1.776554641e+09 backup_output_bytes{instance="",job="mailserver-backup"} 9.92315701e+08 backup_read_bytes{instance="",job="mailserver-backup"} 1.175027712e+09 backup_written_bytes{instance="",job="mailserver-backup"} 1.244254208e+09 $ curl -s http://prometheus-server/api/v1/rules \ | jq '.data.groups[].rules[] | select(.name | test("Mailserver"))' MailserverBackupStale: (time() - kube_cronjob_status_last_successful_time{cronjob="mailserver-backup",namespace="mailserver"}) > 129600 MailserverBackupNeverSucceeded: kube_cronjob_status_last_successful_time{cronjob="mailserver-backup",namespace="mailserver"} == 0 ### Manual Verification 1. Wait for the scheduled 03:00 run tonight; verify `kubectl get job -n mailserver` shows a new completed job. 2. Check that `backup_last_success_timestamp` advances past today. 3. Confirm `MailserverBackupNeverSucceeded` did not fire. 4. Next week (week 16), confirm `--link-dest` builds hardlinks vs 2026-15 (size delta should drop from ~950M to ~the actual churn). ## Deviations from mysql-backup pattern - Image: alpine + rsync (mirrors vaultwarden — mysql's `mysql:8.0` base is not applicable for a filesystem rsync). - pod_affinity: required for RWO PVC co-location (mysql uses its own MySQL service for network access; mailserver must mount the PVC). - Metric push via wget (mirrors vaultwarden; alpine has wget, not curl). - Week-folder layout with --link-dest rotation: rsync pattern, closer to the PVE daily-backup script than mysql's single-file gzip dumps. 
[ci skip] Closes: code-z26 Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/mailserver/modules/mailserver/main.tf | 157 +++++++++++++++++- .../monitoring/prometheus_chart_values.tpl | 14 ++ 2 files changed, 170 insertions(+), 1 deletion(-) diff --git a/stacks/mailserver/modules/mailserver/main.tf b/stacks/mailserver/modules/mailserver/main.tf index 68eb8b1d..06d8f815 100644 --- a/stacks/mailserver/modules/mailserver/main.tf +++ b/stacks/mailserver/modules/mailserver/main.tf @@ -21,7 +21,7 @@ variable "email_monitor_imap_password" { # — and Dovecot logs 'exists more than once' on every auth lookup. Aliases # that forward to external addresses (gmail etc.) or to self are safe. locals { - _account_set = keys(var.mailserver_accounts) + _account_set = keys(var.mailserver_accounts) _virtual_lines = split("\n", format("%s%s", var.postfix_account_aliases, file("${path.module}/extra/aliases.txt"))) postfix_virtual = join("\n", [ for line in local._virtual_lines : line @@ -730,3 +730,158 @@ sys.exit(0 if success else 1) } } +# ============================================================================= +# Mailserver Backup — Daily rsync of maildirs, mail-state, and log +# Pattern mirrors vaultwarden-backup (pod_affinity for RWO co-location, /backup +# write to NFS, Pushgateway metrics). Runs at 03:00 to avoid overlap with +# mysql-backup (00:30), vaultwarden-backup (*/6h), email-roundtrip (*/20m). +# Total loss of this PVC = all maildirs + DKIM keys gone; regenerating DKIM +# requires DNS changes, hence backup is critical. 
+# =============================================================================
+module "nfs_mailserver_backup_host" {
+  source     = "../../../../modules/kubernetes/nfs_volume"
+  name       = "mailserver-backup-host"
+  namespace  = kubernetes_namespace.mailserver.metadata[0].name
+  nfs_server = var.nfs_server
+  nfs_path   = "/srv/nfs/mailserver-backup"
+}
+
+resource "kubernetes_cron_job_v1" "mailserver-backup" {
+  metadata {
+    name      = "mailserver-backup"
+    namespace = kubernetes_namespace.mailserver.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace"
+    failed_jobs_history_limit     = 5
+    schedule                      = "0 3 * * *"
+    starting_deadline_seconds     = 10
+    successful_jobs_history_limit = 10
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 3
+        ttl_seconds_after_finished = 10
+        template {
+          metadata {}
+          spec {
+            # RWO co-location: backup pod must land on the same node as the
+            # mailserver pod because mailserver-data-encrypted is ReadWriteOnce.
+            affinity {
+              pod_affinity {
+                required_during_scheduling_ignored_during_execution {
+                  label_selector {
+                    match_labels = {
+                      app = "mailserver"
+                    }
+                  }
+                  topology_key = "kubernetes.io/hostname"
+                }
+              }
+            }
+            container {
+              name  = "mailserver-backup"
+              image = "docker.io/library/alpine"
+              command = ["/bin/sh", "-c", <<-EOT
+                set -euxo pipefail
+                apk add --no-cache rsync
+                _t0=$(date +%s)
+                _rb0=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                _wb0=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+
+                week=$(date +"%Y-%W")
+                prev_week=$(date -d "@$(( _t0 - 604800 ))" +"%Y-%W" 2>/dev/null || echo "") # epoch form: busybox date rejects "-7 days"
+                dst=/backup/$week
+                mkdir -p "$dst"
+
+                # Mailserver data layout (from deployment subPath mounts):
+                #   /var/mail        -> data  (maildirs)
+                #   /var/mail-state  -> state (postfix, dovecot, rspamd, dkim keys)
+                #   /var/log/mail    -> log   (mail logs)
+                # Names are explicit: basename maps /var/mail and /var/log/mail both to "mail".
+                for pair in "/var/mail data" "/var/mail-state state" "/var/log/mail log"; do
+                  set -- $pair; src=$1; name=$2
+                  [ -d "$src" ] || { echo "SKIP missing $src"; continue; }
+                  # Use --link-dest against the previous week's copy of this directory
+                  # so unchanged files are hardlinked, not re-copied.
+                  link_dest_arg=""
+                  if [ -n "$prev_week" ] && [ -d "/backup/$prev_week/$name" ]; then
+                    link_dest_arg="--link-dest=/backup/$prev_week/$name"
+                  fi
+                  rsync -aH --delete $link_dest_arg "$src/" "$dst/$name/"
+                done
+
+                # Rotate — keep 8 weekly snapshots (~2 months)
+                find /backup -maxdepth 1 -mindepth 1 -type d -regex '.*/[0-9]+-[0-9]+$' | sort | head -n -8 | xargs -r rm -rf
+
+                _dur=$(($(date +%s) - _t0))
+                _rb1=$(awk '/^read_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                _wb1=$(awk '/^write_bytes/{print $2}' /proc/$$/io 2>/dev/null || echo 0)
+                echo "=== Backup IO Stats ==="
+                echo "duration: $${_dur}s"
+                echo "read: $(( (_rb1 - _rb0) / 1048576 )) MiB"
+                echo "written: $(( (_wb1 - _wb0) / 1048576 )) MiB"
+                echo "output: $(du -sh "$dst" | awk '{print $1}')"
+
+                _out_bytes=$(du -sb "$dst" | awk '{print $1}')
+                wget -qO- --post-data "backup_duration_seconds $${_dur}
+                backup_read_bytes $(( _rb1 - _rb0 ))
+                backup_written_bytes $(( _wb1 - _wb0 ))
+                backup_output_bytes $${_out_bytes}
+                backup_last_success_timestamp $(date +%s)
+                " "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/mailserver-backup" || true
+              EOT
+              ]
+              volume_mount {
+                name       = "data"
+                mount_path = "/var/mail"
+                sub_path   = "data"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "data"
+                mount_path = "/var/mail-state"
+                sub_path   = "state"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "data"
+                mount_path = "/var/log/mail"
+                sub_path   = "log"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "backup"
+                mount_path = "/backup"
+              }
+            }
+            volume {
+              name = "data"
+              persistent_volume_claim {
+                claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name
+                read_only  = true
+              }
+            }
+            volume {
+              name = "backup"
+              persistent_volume_claim {
+                claim_name = module.nfs_mailserver_backup_host.claim_name
+              }
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index e08e803f..cfb163af 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1169,6 +1169,20 @@ serverFiles:
               severity: critical
             annotations:
               summary: "Vaultwarden backup CronJob has never completed successfully"
+          - alert: MailserverBackupStale
+            expr: (time() - kube_cronjob_status_last_successful_time{cronjob="mailserver-backup", namespace="mailserver"}) > 129600
+            for: 30m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Mailserver backup is {{ $value | humanizeDuration }} old (threshold: 36h, runs daily 03:00)"
+          - alert: MailserverBackupNeverSucceeded
+            expr: kube_cronjob_status_last_successful_time{cronjob="mailserver-backup", namespace="mailserver"} == 0
+            for: 1h
+            labels:
+              severity: critical
+            annotations:
+              summary: "Mailserver backup CronJob has never completed successfully"
           - alert: VaultwardenDown
             expr: (kube_deployment_status_replicas_available{namespace="vaultwarden", deployment="vaultwarden"} or on() vector(0)) < 1
             for: 5m
From 4cd8d96b017479e467e7091695a4865fcf2f1174 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 18 Apr 2026 23:26:49 +0000
Subject: [PATCH 3/3] [monitoring] Widen uk-payslip default time range to 10y

Oldest payslip in Paperless is July 2019.
Previous default (now-2y) hid everything from 2019-2023, making it look like the backfill was broken. --- stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json b/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json index 736ec160..67e22ca3 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json +++ b/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json @@ -381,7 +381,7 @@ "list": [] }, "time": { - "from": "now-2y", + "from": "now-10y", "to": "now" }, "timepicker": {},