add bank sync monitoring with Pushgateway metrics and Prometheus alerts [ci skip]

CronJob now captures HTTP status, pushes bank_sync_success/duration/last_success
to Pushgateway. Alerts: BankSyncFailing (6h), BankSyncStale (48h).
This commit is contained in:
Viktor Barzin 2026-04-05 19:29:17 +03:00
parent 3f09a2d007
commit 3217a5f605
2 changed files with 80 additions and 5 deletions

View file

@ -240,13 +240,13 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
concurrency_policy = "Replace"
failed_jobs_history_limit = 5
schedule = "0 0 * * *" # Daily
starting_deadline_seconds = 10
starting_deadline_seconds = 60
successful_jobs_history_limit = 10
job_template {
metadata {}
spec {
backoff_limit = 3
ttl_seconds_after_finished = 10
backoff_limit = 1
ttl_seconds_after_finished = 300
template {
metadata {}
spec {
@ -254,8 +254,41 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
name = "bank-sync"
image = "curlimages/curl"
command = ["/bin/sh", "-c", <<-EOT
# set -eux # Shows credentials so use only when debugging
curl -X POST --location 'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' --header 'accept: application/json' --header 'budget-encryption-password: ${var.budget_encryption_password}' --header 'x-api-key: ${random_string.api-key.result}'
# Trigger the bank sync over HTTP, then report the outcome to Pushgateway so
# Prometheus can alert on it. Gauges pushed for this job's metric group:
#   bank_sync_success                 1 if the sync returned HTTP 200, else 0
#   bank_sync_duration_seconds        wall-clock duration of the sync request
#   bank_sync_last_success_timestamp  Unix time of the last success (pushed on
#                                     success only, see below)
PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
START=$(date +%s)
# -w prints only the HTTP status code on stdout; the response body is written
# to /tmp/response.txt so it can be logged when the sync fails. The doubled
# percent sign is the Terraform template escape for a literal percent sign
# followed by an opening brace.
HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
  -X POST --location \
  'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
  --header 'accept: application/json' \
  --header 'budget-encryption-password: ${var.budget_encryption_password}' \
  --header 'x-api-key: ${random_string.api-key.result}')
END=$(date +%s)
DURATION=$((END - START))
# Only an exact HTTP 200 counts as success; any other value (including the 000
# that curl reports when it received no response) is treated as a failure.
if [ "$HTTP_CODE" = "200" ]; then
  SUCCESS=1
else
  SUCCESS=0
  echo "Bank sync failed with HTTP $HTTP_CODE:"
  cat /tmp/response.txt
  echo ""
fi
# Build the exposition text with echo and pipe it to curl. --data-binary makes
# curl send a POST, and a Pushgateway POST replaces only the metrics whose names
# appear in the push. Leaving bank_sync_last_success_timestamp out of a failed
# run therefore keeps the timestamp of the previous successful run, instead of
# overwriting it with 0 and making the sync look decades stale.
{
  echo "# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)"
  echo "# TYPE bank_sync_success gauge"
  echo "bank_sync_success $SUCCESS"
  echo "# HELP bank_sync_duration_seconds Duration of the last bank sync run"
  echo "# TYPE bank_sync_duration_seconds gauge"
  echo "bank_sync_duration_seconds $DURATION"
  if [ "$SUCCESS" = "1" ]; then
    echo "# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync"
    echo "# TYPE bank_sync_last_success_timestamp gauge"
    echo "bank_sync_last_success_timestamp $END"
  fi
} | curl -s --data-binary @- "$PUSHGATEWAY"
EOT
]
}

View file

@ -1073,6 +1073,34 @@ serverFiles:
severity: warning
annotations:
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
# Fires when lvm_snapshot_last_run_timestamp for the lvm-pvc-snapshot job is
# more than 86400 s (24 h) in the past and has stayed that way for 30 minutes.
# The summary states a 12 h cadence, so the threshold is twice that interval.
- alert: LVMSnapshotStale
expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 86400
for: 30m
labels:
severity: critical
annotations:
# $value is the computed age in seconds, rendered through humanizeDuration.
summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)"
# Fires when no lvm_snapshot_last_run_timestamp series exists for the
# lvm-pvc-snapshot job (absent() returns a value only when the selector matches
# nothing) and that has been the case for 48 h.
- alert: LVMSnapshotNeverRun
expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
for: 48h
labels:
severity: warning
annotations:
summary: "LVM PVC snapshot job has never reported metrics to Pushgateway"
# Fires as soon as lvm_snapshot_last_status for the lvm-pvc-snapshot job is
# non-zero (for: 0m means no waiting period).
# NOTE(review): presumably 0 is the success status - confirm against the job
# that pushes this metric.
- alert: LVMSnapshotFailing
expr: lvm_snapshot_last_status{job="lvm-pvc-snapshot"} != 0
for: 0m
labels:
severity: critical
annotations:
# $value is the reported non-zero status.
summary: "LVM PVC snapshot job failed (status={{ $value }})"
# Fires as soon as lvm_snapshot_thinpool_free_pct for the lvm-pvc-snapshot job
# drops below 15 (for: 0m means no waiting period).
- alert: LVMThinPoolLow
expr: lvm_snapshot_thinpool_free_pct{job="lvm-pvc-snapshot"} < 15
for: 0m
labels:
severity: warning
annotations:
# $value is the reported free percentage, printed unformatted.
summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
- alert: NewTailscaleClient
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
for: 5m
@ -1472,6 +1500,20 @@ serverFiles:
severity: warning
annotations:
summary: "Mail server has no available replicas - mail may not be received"
# Fires when bank_sync_success has been 0 continuously for 6 h.
# The selector has no label matcher, so every series with this metric name is
# evaluated; the job label in the summary identifies which one is failing.
- alert: BankSyncFailing
expr: bank_sync_success == 0
for: 6h
labels:
severity: warning
annotations:
summary: "Bank sync failing for {{ $labels.job }}. Accounts may need GoCardless reauthorization."
# Fires when the last successful bank sync is more than 172800 s (48 h) old
# and has stayed that way for 1 h.
# The "> 0" guard skips series whose value is 0: that is not a real timestamp,
# and subtracting it from time() gives an age of decades, which would raise
# this "not succeeded in >48h" alert after a single failed run. Ongoing
# failures are reported by BankSyncFailing instead.
- alert: BankSyncStale
  expr: (time() - bank_sync_last_success_timestamp) > 172800 and bank_sync_last_success_timestamp > 0
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: "Bank sync for {{ $labels.job }} has not succeeded in >48h. Check CronJob and account auth."
- alert: EmailRoundtripFailing
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
for: 90m