add bank sync monitoring with Pushgateway metrics and Prometheus alerts [ci skip]
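The CronJob now captures the HTTP status of the sync request and pushes bank_sync_success, bank_sync_duration_seconds, and bank_sync_last_success_timestamp to the Pushgateway. New alerts: BankSyncFailing (sync failing for 6h) and BankSyncStale (no successful sync in more than 48h).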
This commit is contained in:
parent
3f09a2d007
commit
3217a5f605
2 changed files with 80 additions and 5 deletions
|
|
@ -240,13 +240,13 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
|
|||
concurrency_policy = "Replace"
|
||||
failed_jobs_history_limit = 5
|
||||
schedule = "0 0 * * *" # Daily
|
||||
starting_deadline_seconds = 10
|
||||
starting_deadline_seconds = 60
|
||||
successful_jobs_history_limit = 10
|
||||
job_template {
|
||||
metadata {}
|
||||
spec {
|
||||
backoff_limit = 3
|
||||
ttl_seconds_after_finished = 10
|
||||
backoff_limit = 1
|
||||
ttl_seconds_after_finished = 300
|
||||
template {
|
||||
metadata {}
|
||||
spec {
|
||||
|
|
@ -254,8 +254,41 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
|
|||
name = "bank-sync"
|
||||
image = "curlimages/curl"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
# set -eux # Shows credentials so use only when debugging
|
||||
curl -X POST --location 'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' --header 'accept: application/json' --header 'budget-encryption-password: ${var.budget_encryption_password}' --header 'x-api-key: ${random_string.api-key.result}'
|
||||
PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
|
||||
START=$(date +%s)
|
||||
|
||||
HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
|
||||
-X POST --location \
|
||||
'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
|
||||
--header 'accept: application/json' \
|
||||
--header 'budget-encryption-password: ${var.budget_encryption_password}' \
|
||||
--header 'x-api-key: ${random_string.api-key.result}')
|
||||
|
||||
END=$(date +%s)
|
||||
DURATION=$((END - START))
|
||||
|
||||
# Classify the run: only an HTTP 200 from the banksync endpoint counts as success.
# On failure, print the status and the response body so it shows up in the pod logs.
if [ "$HTTP_CODE" = "200" ]; then
  SUCCESS=1
else
  SUCCESS=0
  echo "Bank sync failed with HTTP $HTTP_CODE:"
  cat /tmp/response.txt
  echo ""
fi

# Push the run's metrics to the Pushgateway.
#
# bank_sync_last_success_timestamp is sent ONLY when this run succeeded.
# curl --data-binary issues a POST, and a Pushgateway POST replaces only the
# metric names contained in the push, so leaving the metric out on failure
# keeps the timestamp of the previous successful run. Pushing 0 on failure
# would overwrite it and make (time() - bank_sync_last_success_timestamp)
# exceed any staleness threshold right after a single failed run.
#
# NOTE: this script lives inside a Terraform heredoc, so only plain $VAR
# expansions are used here (no dollar-brace form, which Terraform would
# try to interpolate).
{
  echo "# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)"
  echo "# TYPE bank_sync_success gauge"
  echo "bank_sync_success $SUCCESS"
  echo "# HELP bank_sync_duration_seconds Duration of the last bank sync run"
  echo "# TYPE bank_sync_duration_seconds gauge"
  echo "bank_sync_duration_seconds $DURATION"
  if [ "$SUCCESS" = "1" ]; then
    echo "# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync"
    echo "# TYPE bank_sync_last_success_timestamp gauge"
    echo "bank_sync_last_success_timestamp $END"
  fi
} | curl -s --data-binary @- "$PUSHGATEWAY"
|
||||
EOT
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1073,6 +1073,34 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
|
||||
- alert: LVMSnapshotStale
|
||||
expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 86400
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)"
|
||||
- alert: LVMSnapshotNeverRun
|
||||
expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
|
||||
for: 48h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "LVM PVC snapshot job has never reported metrics to Pushgateway"
|
||||
- alert: LVMSnapshotFailing
|
||||
expr: lvm_snapshot_last_status{job="lvm-pvc-snapshot"} != 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "LVM PVC snapshot job failed (status={{ $value }})"
|
||||
- alert: LVMThinPoolLow
|
||||
expr: lvm_snapshot_thinpool_free_pct{job="lvm-pvc-snapshot"} < 15
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
|
||||
- alert: NewTailscaleClient
|
||||
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
|
||||
for: 5m
|
||||
|
|
@ -1472,6 +1500,20 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Mail server has no available replicas - mail may not be received"
|
||||
- alert: BankSyncFailing
|
||||
expr: bank_sync_success == 0
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Bank sync failing for {{ $labels.job }}. Accounts may need GoCardless reauthorization."
|
||||
- alert: BankSyncStale
|
||||
expr: (time() - bank_sync_last_success_timestamp) > 172800
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Bank sync for {{ $labels.job }} has not succeeded in >48h. Check CronJob and account auth."
|
||||
- alert: EmailRoundtripFailing
|
||||
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
||||
for: 90m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue