# Patch summary (covers only the portion of the patch visible in this chunk):
#
#   stacks/monitoring/main.tf
#     - adds data source vault_kv_secret_v2.viktor (mount "secret", name "viktor")
#     - passes data.vault_kv_secret_v2.viktor.data["truenas_api_key"] into
#       module "monitoring" as the new argument truenas_api_key
#
#   stacks/monitoring/modules/monitoring/main.tf
#     - declares variable "truenas_api_key" (type = string, sensitive = true)
#     - opens a 99-line hunk (@@ -90,6 +94,99 @@) that starts the
#       "Cloud Sync Monitor" section; that hunk continues past this line
#
# NOTE(review): the diff text below is collapsed onto one physical line, so the
#   per-line markers (' ', '+') no longer sit at line starts. Presumably it will
#   not apply with `git apply` as-is — confirm against the original patch.
# NOTE(review): the cron job script on the following line reads $TRUENAS_API_KEY;
#   presumably it is fed from var.truenas_api_key, but the env wiring is not
#   visible in this chunk — verify.
# NOTE(review): that container uses image "docker.io/library/alpine" with no tag
#   and sends the bearer token to "http://10.0.10.15/api/v2.0/cloudsync" —
#   consider pinning the image and confirm plain HTTP is intended.
diff --git a/stacks/monitoring/main.tf b/stacks/monitoring/main.tf index a9648720..503e3e1a 100644 --- a/stacks/monitoring/main.tf +++ b/stacks/monitoring/main.tf @@ -12,6 +12,11 @@ data "vault_kv_secret_v2" "secrets" { name = "platform" } +data "vault_kv_secret_v2" "viktor" { + mount = "secret" + name = "viktor" +} + module "monitoring" { source = "./modules/monitoring" tls_secret_name = var.tls_secret_name @@ -26,4 +31,5 @@ module "monitoring" { pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"] grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"] tier = local.tiers.cluster + truenas_api_key = data.vault_kv_secret_v2.viktor.data["truenas_api_key"] } diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index b420662b..048f21c9 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -29,6 +29,10 @@ variable "grafana_admin_password" { } variable "tier" { type = string } variable "mysql_host" { type = string } +variable "truenas_api_key" { + type = string + sensitive = true +} resource "kubernetes_namespace" "monitoring" { metadata { @@ -90,6 +94,99 @@ resource "kubernetes_cron_job_v1" "monitor_prom" { } } +# ----------------------------------------------------------------------------- +# Cloud Sync Monitor — check TrueNAS Cloud Sync job status, push to Pushgateway +# Runs every 6h. Alert fires if no successful sync in 8 days. 
+# ----------------------------------------------------------------------------- +resource "kubernetes_cron_job_v1" "cloudsync_monitor" { + metadata { + name = "cloudsync-monitor" + namespace = kubernetes_namespace.monitoring.metadata[0].name + } + spec { + concurrency_policy = "Replace" + failed_jobs_history_limit = 3 + successful_jobs_history_limit = 3 + schedule = "0 */6 * * *" + job_template { + metadata {} + spec { + backoff_limit = 2 + ttl_seconds_after_finished = 300 + template { + metadata {} + spec { + container { + name = "cloudsync-monitor" + image = "docker.io/library/alpine" + command = ["/bin/sh", "-c", <<-EOT + set -euo pipefail + apk add --no-cache curl jq + + # Query TrueNAS Cloud Sync tasks + RESPONSE=$(curl -sf -H "Authorization: Bearer $TRUENAS_API_KEY" \ + "http://10.0.10.15/api/v2.0/cloudsync" 2>&1) || { + echo "ERROR: Failed to query TrueNAS API" + exit 1 + } + + # Parse each task's last successful run + echo "$RESPONSE" | jq -c '.[]' | while read -r task; do + TASK_ID=$(echo "$task" | jq -r '.id') + TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-') + JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"') + JOB_TIME=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') + + if [ "$JOB_TIME" != "0" ] && [ "$JOB_TIME" != "null" ]; then + # TrueNAS returns milliseconds since epoch + EPOCH_SECS=$((JOB_TIME / 1000)) + else + EPOCH_SECS=0 + fi + + echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS" + + # Push metrics to Pushgateway + cat <