actualbudget+monitoring: per-account bank-sync metrics, drop noisy alert
The bank-sync CronJob was posting to /accounts/banksync, which fans out to
ALL accounts in a single call. Under the PSD2/GoCardless quota of 4
successful pulls per account per 24h, a single rate-limited account made
the whole call return HTTP 500, and `bank_sync_success` flipped to 0 even
though data was still flowing in through manual UI syncs. Result:
BankSyncFailing fired routinely whenever the user had been active in the UI
that day — a structural false positive.
Fix:
* CronJob: enumerate accounts via GET /accounts, POST per-account
/accounts/{id}/banksync, emit bank_sync_account_success and
bank_sync_account_last_success_timestamp labelled by account name.
Roll up bank_sync_success = 1 iff any account succeeded.
* Alerts: drop BankSyncFailing (noise generator). Keep BankSyncStale
at 48h (global drought). Add BankSyncAccountStale at 72h (catches
single-account auth expiry — the real signal we wanted).
Verified: manual run on bank-sync-viktor pushes 6 per-account success +
timestamp series; roll-up bank_sync_success=1; no firing alerts.
This commit is contained in:
parent
5a271e70ab
commit
a980f78b58
2 changed files with 92 additions and 43 deletions
|
|
@ -282,48 +282,93 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
|
||||||
spec {
|
spec {
|
||||||
container {
|
container {
|
||||||
name = "bank-sync"
|
name = "bank-sync"
|
||||||
image = "curlimages/curl"
|
image = "alpine:3.20"
|
||||||
command = ["/bin/sh", "-c", <<-EOT
|
command = ["/bin/sh", "-c", <<-EOT
|
||||||
PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
|
set -u
|
||||||
|
apk add --no-cache curl jq >/dev/null 2>&1
|
||||||
|
|
||||||
|
USER_NAME='${var.name}'
|
||||||
|
SYNC_ID='${var.sync_id}'
|
||||||
|
API_KEY='${random_string.api-key.result}'
|
||||||
|
PW='${var.budget_encryption_password}'
|
||||||
|
PG="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-$USER_NAME"
|
||||||
|
API="http://budget-http-api-$USER_NAME"
|
||||||
|
|
||||||
START=$(date +%s)
|
START=$(date +%s)
|
||||||
|
|
||||||
HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
|
# Enumerate active accounts: open + on-budget.
|
||||||
-X POST --location \
|
ACCOUNTS=$(curl -fsS "$API/v1/budgets/$SYNC_ID/accounts" \
|
||||||
'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
|
-H "x-api-key: $API_KEY" \
|
||||||
--header 'accept: application/json' \
|
-H "budget-encryption-password: $PW" \
|
||||||
--header 'budget-encryption-password: ${var.budget_encryption_password}' \
|
| jq -c '.data[] | select(.closed == false and .offbudget == false) | {id, name}')
|
||||||
--header 'x-api-key: ${random_string.api-key.result}')
|
|
||||||
|
|
||||||
END=$(date +%s)
|
if [ -z "$ACCOUNTS" ]; then
|
||||||
DURATION=$((END - START))
|
echo "ERROR: GET /accounts returned no eligible accounts; aborting"
|
||||||
|
exit 1
|
||||||
if [ "$HTTP_CODE" = "200" ]; then
|
|
||||||
SUCCESS=1
|
|
||||||
LAST_SUCCESS=$END
|
|
||||||
else
|
|
||||||
SUCCESS=0
|
|
||||||
echo "Bank sync failed with HTTP $HTTP_CODE:"
|
|
||||||
cat /tmp/response.txt
|
|
||||||
echo ""
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Pushgateway POST preserves metrics not in the payload, so on
|
: > /tmp/payload
|
||||||
# failure we omit bank_sync_last_success_timestamp to keep the
|
rm -f /tmp/any_success
|
||||||
# prior success value — this prevents BankSyncStale from firing
|
|
||||||
# alongside BankSyncFailing after a single failed run.
|
# Per-account sync. Each account has its own PSD2/GoCardless
|
||||||
{
|
# quota (4 successful pulls per 24h), so we treat them
|
||||||
printf '# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)\n'
|
# independently — one rate-limited account doesn't mark the
|
||||||
printf '# TYPE bank_sync_success gauge\n'
|
# run as a failure.
|
||||||
printf 'bank_sync_success %s\n' "$SUCCESS"
|
echo "$ACCOUNTS" | while IFS= read -r ACCT; do
|
||||||
printf '# HELP bank_sync_duration_seconds Duration of the last bank sync run\n'
|
[ -z "$ACCT" ] && continue
|
||||||
printf '# TYPE bank_sync_duration_seconds gauge\n'
|
ID=$(echo "$ACCT" | jq -r '.id')
|
||||||
printf 'bank_sync_duration_seconds %s\n' "$DURATION"
|
NAME=$(echo "$ACCT" | jq -r '.name')
|
||||||
if [ "$SUCCESS" = "1" ]; then
|
LABEL=$(echo "$NAME" | sed -E 's/[^a-zA-Z0-9]+/_/g')
|
||||||
printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync\n'
|
|
||||||
printf '# TYPE bank_sync_last_success_timestamp gauge\n'
|
HTTP_CODE=$(curl -s -o /tmp/r.txt -w '%%{http_code}' \
|
||||||
printf 'bank_sync_last_success_timestamp %s\n' "$LAST_SUCCESS"
|
-X POST "$API/v1/budgets/$SYNC_ID/accounts/$ID/banksync" \
|
||||||
|
-H 'accept: application/json' \
|
||||||
|
-H "x-api-key: $API_KEY" \
|
||||||
|
-H "budget-encryption-password: $PW") || HTTP_CODE=0
|
||||||
|
|
||||||
|
NOW=$(date +%s)
|
||||||
|
if [ "$HTTP_CODE" = "200" ]; then
|
||||||
|
echo "OK account=$NAME"
|
||||||
|
printf 'bank_sync_account_success{account="%s"} 1\n' "$LABEL" >> /tmp/payload
|
||||||
|
printf 'bank_sync_account_last_success_timestamp{account="%s"} %s\n' "$LABEL" "$NOW" >> /tmp/payload
|
||||||
|
: > /tmp/any_success
|
||||||
|
else
|
||||||
|
echo "FAIL account=$NAME http=$HTTP_CODE body=$(cat /tmp/r.txt)"
|
||||||
|
printf 'bank_sync_account_success{account="%s"} 0\n' "$LABEL" >> /tmp/payload
|
||||||
fi
|
fi
|
||||||
} | curl -s --data-binary @- "$PUSHGATEWAY"
|
done
|
||||||
|
|
||||||
|
END=$(date +%s)
|
||||||
|
DUR=$((END - START))
|
||||||
|
|
||||||
|
if [ -f /tmp/any_success ]; then
|
||||||
|
ANY=1
|
||||||
|
else
|
||||||
|
ANY=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
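# Pushgateway POST replaces every series of each metric name present
|
||||||
|
# in the payload (within this grouping key); only metric names absent
|
||||||
|
# from the payload keep their prior values. NOTE(review): a failed
|
||||||
|
# account's prior last_success_timestamp is therefore dropped whenever another account succeeds, yet that value is what BankSyncAccountStale alerts on — confirm, and consider pushing each account under its own grouping key.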
|
||||||
|
{
|
||||||
|
printf '# HELP bank_sync_account_success Per-account sync result (1=ok, 0=fail)\n'
|
||||||
|
printf '# TYPE bank_sync_account_success gauge\n'
|
||||||
|
printf '# HELP bank_sync_account_last_success_timestamp Per-account Unix timestamp of last successful sync\n'
|
||||||
|
printf '# TYPE bank_sync_account_last_success_timestamp gauge\n'
|
||||||
|
cat /tmp/payload
|
||||||
|
printf '# HELP bank_sync_success 1 if at least one account synced this run\n'
|
||||||
|
printf '# TYPE bank_sync_success gauge\n'
|
||||||
|
printf 'bank_sync_success %s\n' "$ANY"
|
||||||
|
printf '# HELP bank_sync_duration_seconds Total duration of the cron run\n'
|
||||||
|
printf '# TYPE bank_sync_duration_seconds gauge\n'
|
||||||
|
printf 'bank_sync_duration_seconds %s\n' "$DUR"
|
||||||
|
if [ "$ANY" = "1" ]; then
|
||||||
|
printf '# HELP bank_sync_last_success_timestamp Unix timestamp of the most recent successful sync of any account\n'
|
||||||
|
printf '# TYPE bank_sync_last_success_timestamp gauge\n'
|
||||||
|
printf 'bank_sync_last_success_timestamp %s\n' "$END"
|
||||||
|
fi
|
||||||
|
} | curl -fsS --data-binary @- "$PG"
|
||||||
EOT
|
EOT
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2152,20 +2152,24 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Mail server has no available replicas - mail may not be received"
|
summary: "Mail server has no available replicas - mail may not be received"
|
||||||
- alert: BankSyncFailing
|
# Note: no BankSyncFailing alert — GoCardless enforces per-account
|
||||||
expr: bank_sync_success == 0
|
# PSD2 quotas (4 successful pulls per account per 24h). Manual UI
|
||||||
for: 6h
|
# syncs consume the same quota, so the nightly cron routinely hits
|
||||||
labels:
|
# rate-limits without any real outage. Alert only on staleness.
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Bank sync failing. Accounts may need GoCardless reauthorization. Check Pushgateway for which instance."
|
|
||||||
- alert: BankSyncStale
|
- alert: BankSyncStale
|
||||||
expr: (time() - bank_sync_last_success_timestamp) > 172800
|
expr: (time() - bank_sync_last_success_timestamp) > 172800
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Bank sync has not succeeded in more than 48h. Check CronJob and account auth."
|
summary: "Bank sync (instance {{ $labels.instance }}): NO account has synced in over 48h. Likely a real outage — check CronJob, http-api logs, and GoCardless re-auth."
|
||||||
|
- alert: BankSyncAccountStale
|
||||||
|
expr: (time() - bank_sync_account_last_success_timestamp) > 259200
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Bank sync (instance {{ $labels.instance }}): account {{ $labels.account }} has not synced in over 72h. GoCardless requisition may have expired — re-link in Settings → Bank Sync."
|
||||||
- alert: EmailRoundtripFailing
|
- alert: EmailRoundtripFailing
|
||||||
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
||||||
for: 60m
|
for: 60m
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue