actualbudget+monitoring: per-account bank-sync metrics, drop noisy alert

The bank-sync CronJob was posting to /accounts/banksync, which fans out to
ALL accounts in a single call. GoCardless (PSD2) enforces a quota of 4
successful pulls per account per 24h, so a single rate-limited account
would 500 the whole call, and `bank_sync_success` would flip to 0 even
though the data itself was still flowing through manual UI syncs. Result:
BankSyncFailing fired routinely whenever the user had been active in the
UI that day — a structural false positive.

Fix:
  * CronJob: enumerate accounts via GET /accounts, POST per-account
    /accounts/{id}/banksync, emit bank_sync_account_success and
    bank_sync_account_last_success_timestamp labelled by account name.
    Roll up bank_sync_success = 1 iff any account succeeded.
  * Alerts: drop BankSyncFailing (noise generator). Keep BankSyncStale
    at 48h (global drought). Add BankSyncAccountStale at 72h (catches
    single-account auth expiry — the real signal we wanted).

Verified: manual run on bank-sync-viktor pushes 6 per-account success +
timestamp series; roll-up bank_sync_success=1; no firing alerts.
This commit is contained in:
Viktor Barzin 2026-05-11 18:55:15 +00:00
parent 7b6eee49c4
commit 665b6b2934
2 changed files with 92 additions and 43 deletions

View file

@ -282,48 +282,93 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
spec {
container {
name = "bank-sync"
image = "curlimages/curl"
image = "alpine:3.20"
command = ["/bin/sh", "-c", <<-EOT
PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
set -u
apk add --no-cache curl jq >/dev/null 2>&1
USER_NAME='${var.name}'
SYNC_ID='${var.sync_id}'
API_KEY='${random_string.api-key.result}'
PW='${var.budget_encryption_password}'
PG="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-$USER_NAME"
API="http://budget-http-api-$USER_NAME"
START=$(date +%s)
HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
-X POST --location \
'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
--header 'accept: application/json' \
--header 'budget-encryption-password: ${var.budget_encryption_password}' \
--header 'x-api-key: ${random_string.api-key.result}')
# Enumerate active accounts: open + on-budget.
ACCOUNTS=$(curl -fsS "$API/v1/budgets/$SYNC_ID/accounts" \
-H "x-api-key: $API_KEY" \
-H "budget-encryption-password: $PW" \
| jq -c '.data[] | select(.closed == false and .offbudget == false) | {id, name}')
END=$(date +%s)
DURATION=$((END - START))
if [ "$HTTP_CODE" = "200" ]; then
SUCCESS=1
LAST_SUCCESS=$END
else
SUCCESS=0
echo "Bank sync failed with HTTP $HTTP_CODE:"
cat /tmp/response.txt
echo ""
if [ -z "$ACCOUNTS" ]; then
echo "ERROR: GET /accounts returned no eligible accounts; aborting"
exit 1
fi
# Pushgateway POST preserves metrics not in the payload, so on
# failure we omit bank_sync_last_success_timestamp to keep the
# prior success value — this prevents BankSyncStale from firing
# alongside BankSyncFailing after a single failed run.
{
printf '# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)\n'
printf '# TYPE bank_sync_success gauge\n'
printf 'bank_sync_success %s\n' "$SUCCESS"
printf '# HELP bank_sync_duration_seconds Duration of the last bank sync run\n'
printf '# TYPE bank_sync_duration_seconds gauge\n'
printf 'bank_sync_duration_seconds %s\n' "$DURATION"
if [ "$SUCCESS" = "1" ]; then
printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync\n'
printf '# TYPE bank_sync_last_success_timestamp gauge\n'
printf 'bank_sync_last_success_timestamp %s\n' "$LAST_SUCCESS"
: > /tmp/payload
rm -f /tmp/any_success
# Per-account sync. Each account has its own PSD2/GoCardless
# quota (4 successful pulls per 24h), so we treat them
# independently — one rate-limited account doesn't mark the
# run as a failure.
echo "$ACCOUNTS" | while IFS= read -r ACCT; do
[ -z "$ACCT" ] && continue
ID=$(echo "$ACCT" | jq -r '.id')
NAME=$(echo "$ACCT" | jq -r '.name')
LABEL=$(echo "$NAME" | sed -E 's/[^a-zA-Z0-9]+/_/g')
HTTP_CODE=$(curl -s -o /tmp/r.txt -w '%%{http_code}' \
-X POST "$API/v1/budgets/$SYNC_ID/accounts/$ID/banksync" \
-H 'accept: application/json' \
-H "x-api-key: $API_KEY" \
-H "budget-encryption-password: $PW") || HTTP_CODE=0
NOW=$(date +%s)
if [ "$HTTP_CODE" = "200" ]; then
echo "OK account=$NAME"
printf 'bank_sync_account_success{account="%s"} 1\n' "$LABEL" >> /tmp/payload
printf 'bank_sync_account_last_success_timestamp{account="%s"} %s\n' "$LABEL" "$NOW" >> /tmp/payload
: > /tmp/any_success
else
echo "FAIL account=$NAME http=$HTTP_CODE body=$(cat /tmp/r.txt)"
printf 'bank_sync_account_success{account="%s"} 0\n' "$LABEL" >> /tmp/payload
fi
} | curl -s --data-binary @- "$PUSHGATEWAY"
done
END=$(date +%s)
DUR=$((END - START))
if [ -f /tmp/any_success ]; then
ANY=1
else
ANY=0
fi
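# Pushgateway POST preserves prior values for label sets not
# in the payload, so per-account last_success_timestamp values
# for accounts that failed this run keep their prior good
# values — that's what BankSyncAccountStale alerts on.
# NOTE(review): the Pushgateway docs describe POST as replacing
# every metric with the same name as a pushed metric (within the
# grouping key), not individual label sets — confirm that a
# failed account's old timestamp series really survives when
# other accounts push the same metric name in this payload.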
{
printf '# HELP bank_sync_account_success Per-account sync result (1=ok, 0=fail)\n'
printf '# TYPE bank_sync_account_success gauge\n'
printf '# HELP bank_sync_account_last_success_timestamp Per-account Unix timestamp of last successful sync\n'
printf '# TYPE bank_sync_account_last_success_timestamp gauge\n'
cat /tmp/payload
printf '# HELP bank_sync_success 1 if at least one account synced this run\n'
printf '# TYPE bank_sync_success gauge\n'
printf 'bank_sync_success %s\n' "$ANY"
printf '# HELP bank_sync_duration_seconds Total duration of the cron run\n'
printf '# TYPE bank_sync_duration_seconds gauge\n'
printf 'bank_sync_duration_seconds %s\n' "$DUR"
if [ "$ANY" = "1" ]; then
printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the most recent successful sync of any account\n'
printf '# TYPE bank_sync_last_success_timestamp gauge\n'
printf 'bank_sync_last_success_timestamp %s\n' "$END"
fi
} | curl -fsS --data-binary @- "$PG"
EOT
]
}

View file

@ -2152,20 +2152,24 @@ serverFiles:
severity: warning
annotations:
summary: "Mail server has no available replicas - mail may not be received"
- alert: BankSyncFailing
expr: bank_sync_success == 0
for: 6h
labels:
severity: warning
annotations:
summary: "Bank sync failing. Accounts may need GoCardless reauthorization. Check Pushgateway for which instance."
# Note: no BankSyncFailing alert — GoCardless enforces per-account
# PSD2 quotas (4 successful pulls per account per 24h). Manual UI
# syncs consume the same quota, so the nightly cron routinely hits
# rate-limits without any real outage. Alert only on staleness.
- alert: BankSyncStale
expr: (time() - bank_sync_last_success_timestamp) > 172800
for: 1h
labels:
severity: warning
annotations:
summary: "Bank sync has not succeeded in more than 48h. Check CronJob and account auth."
summary: "Bank sync (instance {{ $labels.instance }}): NO account has synced in over 48h. Likely a real outage — check CronJob, http-api logs, and GoCardless re-auth."
- alert: BankSyncAccountStale
expr: (time() - bank_sync_account_last_success_timestamp) > 259200
for: 1h
labels:
severity: warning
annotations:
summary: "Bank sync (instance {{ $labels.instance }}): account {{ $labels.account }} has not synced in over 72h. GoCardless requisition may have expired — re-link in Settings → Bank Sync."
- alert: EmailRoundtripFailing
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
for: 60m