Add bank sync monitoring with Pushgateway metrics and Prometheus alerts [ci skip]
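The CronJob now captures the HTTP status of the sync request and pushes bank_sync_success, bank_sync_duration_seconds, and bank_sync_last_success_timestamp to Pushgateway. New alerts: BankSyncFailing (bank_sync_success == 0 for 6h) and BankSyncStale (no successful sync for more than 48h).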
This commit is contained in:
parent
3f09a2d007
commit
3217a5f605
2 changed files with 80 additions and 5 deletions
|
|
@ -240,13 +240,13 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
|
||||||
concurrency_policy = "Replace"
|
concurrency_policy = "Replace"
|
||||||
failed_jobs_history_limit = 5
|
failed_jobs_history_limit = 5
|
||||||
schedule = "0 0 * * *" # Daily
|
schedule = "0 0 * * *" # Daily
|
||||||
starting_deadline_seconds = 10
|
starting_deadline_seconds = 60
|
||||||
successful_jobs_history_limit = 10
|
successful_jobs_history_limit = 10
|
||||||
job_template {
|
job_template {
|
||||||
metadata {}
|
metadata {}
|
||||||
spec {
|
spec {
|
||||||
backoff_limit = 3
|
backoff_limit = 1
|
||||||
ttl_seconds_after_finished = 10
|
ttl_seconds_after_finished = 300
|
||||||
template {
|
template {
|
||||||
metadata {}
|
metadata {}
|
||||||
spec {
|
spec {
|
||||||
|
|
@ -254,8 +254,41 @@ resource "kubernetes_cron_job_v1" "bank-sync" {
|
||||||
name = "bank-sync"
|
name = "bank-sync"
|
||||||
image = "curlimages/curl"
|
image = "curlimages/curl"
|
||||||
command = ["/bin/sh", "-c", <<-EOT
|
command = ["/bin/sh", "-c", <<-EOT
|
||||||
# set -eux # Shows credentials so use only when debugging
|
PUSHGATEWAY="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/bank-sync-${var.name}"
|
||||||
curl -X POST --location 'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' --header 'accept: application/json' --header 'budget-encryption-password: ${var.budget_encryption_password}' --header 'x-api-key: ${random_string.api-key.result}'
|
START=$(date +%s)
|
||||||
|
|
||||||
|
HTTP_CODE=$(curl -s -o /tmp/response.txt -w '%%{http_code}' \
|
||||||
|
-X POST --location \
|
||||||
|
'http://budget-http-api-${var.name}/v1/budgets/${var.sync_id}/accounts/banksync' \
|
||||||
|
--header 'accept: application/json' \
|
||||||
|
--header 'budget-encryption-password: ${var.budget_encryption_password}' \
|
||||||
|
--header 'x-api-key: ${random_string.api-key.result}')
|
||||||
|
|
||||||
|
END=$(date +%s)
|
||||||
|
DURATION=$((END - START))
|
||||||
|
|
||||||
|
if [ "$HTTP_CODE" = "200" ]; then
|
||||||
|
SUCCESS=1
|
||||||
|
LAST_SUCCESS=$END
|
||||||
|
else
|
||||||
|
SUCCESS=0
|
||||||
|
LAST_SUCCESS=0
|
||||||
|
echo "Bank sync failed with HTTP $HTTP_CODE:"
|
||||||
|
cat /tmp/response.txt
|
||||||
|
echo ""
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat <<METRICS | curl -s --data-binary @- "$PUSHGATEWAY"
|
||||||
|
# HELP bank_sync_success Whether the last bank sync succeeded (1=ok, 0=fail)
|
||||||
|
# TYPE bank_sync_success gauge
|
||||||
|
bank_sync_success $SUCCESS
|
||||||
|
# HELP bank_sync_duration_seconds Duration of the last bank sync run
|
||||||
|
# TYPE bank_sync_duration_seconds gauge
|
||||||
|
bank_sync_duration_seconds $DURATION
|
||||||
|
# HELP bank_sync_last_success_timestamp Unix timestamp of the last successful sync
|
||||||
|
# TYPE bank_sync_last_success_timestamp gauge
|
||||||
|
bank_sync_last_success_timestamp $LAST_SUCCESS
|
||||||
|
METRICS
|
||||||
EOT
|
EOT
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1073,6 +1073,34 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
|
summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}"
|
||||||
|
- alert: LVMSnapshotStale
|
||||||
|
expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 86400
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected every 12h)"
|
||||||
|
- alert: LVMSnapshotNeverRun
|
||||||
|
expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"})
|
||||||
|
for: 48h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "LVM PVC snapshot job has never reported metrics to Pushgateway"
|
||||||
|
- alert: LVMSnapshotFailing
|
||||||
|
expr: lvm_snapshot_last_status{job="lvm-pvc-snapshot"} != 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "LVM PVC snapshot job failed (status={{ $value }})"
|
||||||
|
- alert: LVMThinPoolLow
|
||||||
|
expr: lvm_snapshot_thinpool_free_pct{job="lvm-pvc-snapshot"} < 15
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "LVM thin pool has only {{ $value }}% free — snapshot overhead may cause pool exhaustion"
|
||||||
- alert: NewTailscaleClient
|
- alert: NewTailscaleClient
|
||||||
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
|
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
@ -1472,6 +1500,20 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Mail server has no available replicas - mail may not be received"
|
summary: "Mail server has no available replicas - mail may not be received"
|
||||||
|
- alert: BankSyncFailing
|
||||||
|
expr: bank_sync_success == 0
|
||||||
|
for: 6h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Bank sync failing for {{ $labels.job }}. Accounts may need GoCardless reauthorization."
|
||||||
|
- alert: BankSyncStale
|
||||||
|
expr: (time() - bank_sync_last_success_timestamp) > 172800
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Bank sync for {{ $labels.job }} has not succeeded in >48h. Check CronJob and account auth."
|
||||||
- alert: EmailRoundtripFailing
|
- alert: EmailRoundtripFailing
|
||||||
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
expr: email_roundtrip_success{job="email-roundtrip-monitor"} == 0
|
||||||
for: 90m
|
for: 90m
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue