From 3217a5f605b9235c4e4cb3b5376a73647934b565 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 5 Apr 2026 19:29:17 +0300 Subject: [PATCH] add bank sync monitoring with Pushgateway metrics and Prometheus alerts [ci skip] CronJob now captures HTTP status, pushes bank_sync_success/duration/last_success to Pushgateway. Alerts: BankSyncFailing (6h), BankSyncStale (48h). --- stacks/actualbudget/factory/main.tf | 43 ++++++++++++++++--- .../monitoring/prometheus_chart_values.tpl | 42 ++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/stacks/actualbudget/factory/main.tf b/stacks/actualbudget/factory/main.tf index 9020922a..6bf72ff1 100644 --- a/stacks/actualbudget/factory/main.tf +++ b/stacks/actualbudget/factory/main.tf @@ -240,13 +240,13 @@ resource "kubernetes_cron_job_v1" "bank-sync" { concurrency_policy = "Replace" failed_jobs_history_limit = 5 schedule = "0 0 * * *" # Daily - starting_deadline_seconds = 10 + starting_deadline_seconds = 60 successful_jobs_history_limit = 10 job_template { metadata {} spec { - backoff_limit = 3 - ttl_seconds_after_finished = 10 + backoff_limit = 1 + ttl_seconds_after_finished = 300 template { metadata {} spec { @@ -254,8 +254,41 @@ resource "kubernetes_cron_job_v1" "bank-sync" { name = "bank-sync" image = "curlimages/curl" command = ["/bin/sh", "-c", <<-EOT - # set -eux # Shows credentials so use only when debugging - curl -X POST --location 'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' --header 'accept: application/json' --header 'budget-encryption-password: ${var.budget_encryption_password}' --header 'x-api-key: ${random_string.api-key.result}' + PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}" + START=$(date +%s) + + HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \ + -X POST --location \ + 'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \ + --header 'accept: application/json' \ + --header 'budget-encryption-password: ${var.budget_encryption_password}' \ + --header 'x-api-key: ${random_string.api-key.result}') + + END=$(date +%s) + DURATION=$((END - START)) + + if [ "$HTTP_CODE" = "200" ]; then + SUCCESS=1 + LAST_SUCCESS=$END + else + SUCCESS=0 + LAST_SUCCESS=0 + echo "Bank sync failed with HTTP $HTTP_CODE:" + cat /tmp/response.txt + echo "" + fi + + cat < 86400 + for: 30m + labels: + severity: critical + annotations: + summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)" + - alert: LVMSnapshotNeverRun + expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) + for: 48h + labels: + severity: warning + annotations: + summary: "LVM PVC snapshot job has never reported metrics to Pushgateway" + - alert: LVMSnapshotFailing + expr: lvm_snapshot_last_status{job="lvm-pvc-snapshot"} != 0 + for: 0m + labels: + severity: critical + annotations: + summary: "LVM PVC snapshot job failed (status={{ $value }})" + - alert: LVMThinPoolLow + expr: lvm_snapshot_thinpool_free_pct{job="lvm-pvc-snapshot"} < 15 + for: 0m + labels: + severity: warning + annotations: + summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion" - alert: NewTailscaleClient expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0 for: 5m @@ -1472,6 +1500,20 @@ serverFiles: severity: warning annotations: summary: "Mail server has no available replicas - mail may not be received" + - alert: BankSyncFailing + expr: bank_sync_success == 0 + for: 6h + labels: + severity: warning + annotations: + summary: "Bank sync failing for {{ $labels.job }}. Accounts may need GoCardless reauthorization." + - alert: BankSyncStale + expr: (time() - bank_sync_last_success_timestamp) > 172800 + for: 1h + labels: + severity: warning + annotations: + summary: "Bank sync for {{ $labels.job }} has not succeeded in >48h. Check CronJob and account auth." - alert: EmailRoundtripFailing expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0 for: 90m