From 665b6b2934d1f75a58fc36da4b4d9e51e849bc6a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 11 May 2026 18:55:15 +0000 Subject: [PATCH] actualbudget+monitoring: per-account bank-sync metrics, drop noisy alert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bank-sync CronJob was posting to /accounts/banksync which fans out to ALL accounts in a single call. With PSD2/GoCardless's 4-successful-pulls per-account per-24h quota, a single rate-limited account would 500 the whole call, and `bank_sync_success` would flip to 0 even though the data itself was still flowing through manual UI syncs. Result: BankSyncFailing fired routinely whenever the user had been active in the UI that day — a structural false positive. Fix: * CronJob: enumerate accounts via GET /accounts, POST per-account /accounts/{id}/banksync, emit bank_sync_account_success and bank_sync_account_last_success_timestamp labelled by account name. Roll up bank_sync_success = 1 iff any account succeeded. * Alerts: drop BankSyncFailing (noise generator). Keep BankSyncStale at 48h (global drought). Add BankSyncAccountStale at 72h (catches single-account auth expiry — the real signal we wanted). Verified: manual run on bank-sync-viktor pushes 6 per-account success + timestamp series; roll-up bank_sync_success=1; no firing alerts. 
---
Review note: the manual verification run pushed per-account series into the
job-level Pushgateway group. Delete that group once before rollout
(DELETE .../metrics/job/bank-sync-<name>); this script no longer pushes those
metric names to it, so the old series would linger there and go stale. The
roll-up series in that group are re-created by the next run.

 stacks/actualbudget/factory/main.tf                | 132 +++++++++++++-----
 .../monitoring/prometheus_chart_values.tpl         |  20 +--
 2 files changed, 109 insertions(+), 43 deletions(-)

diff --git a/stacks/actualbudget/factory/main.tf b/stacks/actualbudget/factory/main.tf
index 147d39ea..869f9036 100644
--- a/stacks/actualbudget/factory/main.tf
+++ b/stacks/actualbudget/factory/main.tf
@@ -282,48 +282,110 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
           spec {
             container {
               name    = "bank-sync"
-              image   = "curlimages/curl"
+              image   = "alpine:3.20"
               command = ["/bin/sh", "-c", <<-EOT
-                PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
+                set -u
+                apk add --no-cache curl jq >/dev/null 2>&1 \
+                  || { echo "ERROR: failed to install curl/jq"; exit 1; }
+
+                USER_NAME='${var.name}'
+                SYNC_ID='${var.sync_id}'
+                API_KEY='${random_string.api-key.result}'
+                PW='${var.budget_encryption_password}'
+                PG="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-$USER_NAME"
+                API="http://budget-http-api-$USER_NAME"
+
                 START=$(date +%s)
 
-                HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
-                  -X POST --location \
-                  'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
-                  --header 'accept: application/json' \
-                  --header 'budget-encryption-password: ${var.budget_encryption_password}' \
-                  --header 'x-api-key: ${random_string.api-key.result}')
+                # Enumerate active accounts: open + on-budget.
+                ACCOUNTS=$(curl -fsS "$API/v1/budgets/$SYNC_ID/accounts" \
+                  -H "x-api-key: $API_KEY" \
+                  -H "budget-encryption-password: $PW" \
+                  | jq -c '.data[] | select(.closed == false and .offbudget == false) | {id, name}')
 
-                END=$(date +%s)
-                DURATION=$((END - START))
-
-                if [ "$HTTP_CODE" = "200" ]; then
-                  SUCCESS=1
-                  LAST_SUCCESS=$END
-                else
-                  SUCCESS=0
-                  echo "Bank sync failed with HTTP $HTTP_CODE:"
-                  cat /tmp/response.txt
-                  echo ""
+                if [ -z "$ACCOUNTS" ]; then
+                  echo "ERROR: GET /accounts returned no eligible accounts; aborting"
+                  exit 1
                 fi
 
-                # Pushgateway POST preserves metrics not in the payload, so on
-                # failure we omit bank_sync_last_success_timestamp to keep the
-                # prior success value — this prevents BankSyncStale from firing
-                # alongside BankSyncFailing after a single failed run.
-                {
-                  printf '# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)\n'
-                  printf '# TYPE bank_sync_success gauge\n'
-                  printf 'bank_sync_success %s\n' "$SUCCESS"
-                  printf '# HELP bank_sync_duration_seconds Duration of the last bank sync run\n'
-                  printf '# TYPE bank_sync_duration_seconds gauge\n'
-                  printf 'bank_sync_duration_seconds %s\n' "$DURATION"
-                  if [ "$SUCCESS" = "1" ]; then
-                    printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync\n'
-                    printf '# TYPE bank_sync_last_success_timestamp gauge\n'
-                    printf 'bank_sync_last_success_timestamp %s\n' "$LAST_SUCCESS"
+                rm -f /tmp/any_success
+
+                # Per-account sync. Each account has its own PSD2/GoCardless
+                # quota (4 successful pulls per 24h), so we treat them
+                # independently — one rate-limited account doesn't mark the
+                # run as a failure.
+                #
+                # Each account is pushed to its own Pushgateway group
+                # (.../account/<label>). POST replaces every series of a pushed
+                # metric name within the group, so sharing one group would drop
+                # the last-success timestamp of an account that failed whenever
+                # another account succeeded. The grouping key supplies the
+                # account label, so the samples themselves carry none.
+                echo "$ACCOUNTS" | while IFS= read -r ACCT; do
+                  [ -z "$ACCT" ] && continue
+                  ID=$(echo "$ACCT" | jq -r '.id')
+                  NAME=$(echo "$ACCT" | jq -r '.name')
+                  LABEL=$(echo "$NAME" | sed -E 's/[^a-zA-Z0-9]+/_/g')
+                  # The label becomes a URL path segment, so it must not be empty.
+                  [ -n "$LABEL" ] || LABEL="$ID"
+
+                  HTTP_CODE=$(curl -s -o /tmp/r.txt -w '%%{http_code}' \
+                    -X POST "$API/v1/budgets/$SYNC_ID/accounts/$ID/banksync" \
+                    -H 'accept: application/json' \
+                    -H "x-api-key: $API_KEY" \
+                    -H "budget-encryption-password: $PW") || HTTP_CODE=0
+
+                  NOW=$(date +%s)
+                  if [ "$HTTP_CODE" = "200" ]; then
+                    echo "OK account=$NAME"
+                    OK=1
+                    : > /tmp/any_success
+                  else
+                    echo "FAIL account=$NAME http=$HTTP_CODE body=$(cat /tmp/r.txt)"
+                    OK=0
                   fi
-                } | curl -s --data-binary @- "$PUSHGATEWAY"
+
+                  # On failure only bank_sync_account_success is pushed, so the
+                  # group keeps its previous last-success timestamp — the value
+                  # BankSyncAccountStale alerts on.
+                  {
+                    printf '# HELP bank_sync_account_success Per-account sync result (1=ok, 0=fail)\n'
+                    printf '# TYPE bank_sync_account_success gauge\n'
+                    printf 'bank_sync_account_success %s\n' "$OK"
+                    if [ "$OK" = "1" ]; then
+                      printf '# HELP bank_sync_account_last_success_timestamp Per-account Unix timestamp of last successful sync\n'
+                      printf '# TYPE bank_sync_account_last_success_timestamp gauge\n'
+                      printf 'bank_sync_account_last_success_timestamp %s\n' "$NOW"
+                    fi
+                  } | curl -fsS --data-binary @- "$PG/account/$LABEL" \
+                    || echo "WARN: metrics push failed for account=$NAME"
+                done
+
+                END=$(date +%s)
+                DUR=$((END - START))
+
+                if [ -f /tmp/any_success ]; then
+                  ANY=1
+                else
+                  ANY=0
+                fi
+
+                # Roll-up for the whole run, pushed to the job-level group. On a
+                # run where nothing synced, bank_sync_last_success_timestamp is
+                # omitted so its previous value survives for BankSyncStale.
+                {
+                  printf '# HELP bank_sync_success 1 if at least one account synced this run\n'
+                  printf '# TYPE bank_sync_success gauge\n'
+                  printf 'bank_sync_success %s\n' "$ANY"
+                  printf '# HELP bank_sync_duration_seconds Total duration of the cron run\n'
+                  printf '# TYPE bank_sync_duration_seconds gauge\n'
+                  printf 'bank_sync_duration_seconds %s\n' "$DUR"
+                  if [ "$ANY" = "1" ]; then
+                    printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the most recent successful sync of any account\n'
+                    printf '# TYPE bank_sync_last_success_timestamp gauge\n'
+                    printf 'bank_sync_last_success_timestamp %s\n' "$END"
+                  fi
+                } | curl -fsS --data-binary @- "$PG"
               EOT
               ]
             }
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 05afc077..305dfd5c 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -2152,20 +2152,24 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Mail server has no available replicas - mail may not be received"
-          - alert: BankSyncFailing
-            expr: bank_sync_success == 0
-            for: 6h
-            labels:
-              severity: warning
-            annotations:
-              summary: "Bank sync failing. Accounts may need GoCardless reauthorization. Check Pushgateway for which instance."
+          # Note: no BankSyncFailing alert — GoCardless enforces per-account
+          # PSD2 quotas (4 successful pulls per account per 24h). Manual UI
+          # syncs consume the same quota, so the nightly cron routinely hits
+          # rate-limits without any real outage. Alert only on staleness.
- alert: BankSyncStale expr: (time() - bank_sync_last_success_timestamp) > 172800 for: 1h labels: severity: warning annotations: - summary: "Bank sync has not succeeded in more than 48h. Check CronJob and account auth." + summary: "Bank sync (instance {{ $labels.instance }}): NO account has synced in over 48h. Likely a real outage — check CronJob, http-api logs, and GoCardless re-auth." + - alert: BankSyncAccountStale + expr: (time() - bank_sync_account_last_success_timestamp) > 259200 + for: 1h + labels: + severity: warning + annotations: + summary: "Bank sync (instance {{ $labels.instance }}): account {{ $labels.account }} has not synced in over 72h. GoCardless requisition may have expired — re-link in Settings → Bank Sync." - alert: EmailRoundtripFailing expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0 for: 60m