add TrueNAS Cloud Sync monitor CronJob and bump Prometheus Helm timeout

- New cloudsync-monitor CronJob: queries TrueNAS API every 6h, pushes metrics to Pushgateway
- Increase Prometheus Helm timeout to 900s for slow iSCSI reattach
This commit is contained in:
Viktor Barzin 2026-03-23 02:24:39 +02:00
parent e463281205
commit e4cf0dee83
3 changed files with 105 additions and 0 deletions

View file

@ -12,6 +12,11 @@ data "vault_kv_secret_v2" "secrets" {
name = "platform"
}
data "vault_kv_secret_v2" "viktor" {
mount = "secret"
name = "viktor"
}
module "monitoring" {
source = "./modules/monitoring"
tls_secret_name = var.tls_secret_name
@ -26,4 +31,5 @@ module "monitoring" {
pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"]
grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
tier = local.tiers.cluster
truenas_api_key = data.vault_kv_secret_v2.viktor.data["truenas_api_key"]
}

View file

@ -29,6 +29,10 @@ variable "grafana_admin_password" {
}
variable "tier" { type = string }
variable "mysql_host" { type = string }
variable "truenas_api_key" {
type = string
sensitive = true
}
resource "kubernetes_namespace" "monitoring" {
metadata {
@ -90,6 +94,99 @@ resource "kubernetes_cron_job_v1" "monitor_prom" {
}
}
# -----------------------------------------------------------------------------
# Cloud Sync Monitor check TrueNAS Cloud Sync job status, push to Pushgateway
# Runs every 6h. Alert fires if no successful sync in 8 days.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
metadata {
name = "cloudsync-monitor"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
spec {
concurrency_policy = "Replace"
failed_jobs_history_limit = 3
successful_jobs_history_limit = 3
schedule = "0 */6 * * *"
job_template {
metadata {}
spec {
backoff_limit = 2
ttl_seconds_after_finished = 300
template {
metadata {}
spec {
container {
name = "cloudsync-monitor"
image = "docker.io/library/alpine"
command = ["/bin/sh", "-c", <<-EOT
set -euo pipefail
apk add --no-cache curl jq
# Query TrueNAS Cloud Sync tasks
RESPONSE=$(curl -sf -H "Authorization: Bearer $TRUENAS_API_KEY" \
"http://10.0.10.15/api/v2.0/cloudsync" 2>&1) || {
echo "ERROR: Failed to query TrueNAS API"
exit 1
}
# Parse each task's last successful run
echo "$RESPONSE" | jq -c '.[]' | while read -r task; do
TASK_ID=$(echo "$task" | jq -r '.id')
TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-')
JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"')
JOB_TIME=$(echo "$task" | jq -r '.job.time_finished."$date" // 0')
if [ "$JOB_TIME" != "0" ] && [ "$JOB_TIME" != "null" ]; then
# TrueNAS returns milliseconds since epoch
EPOCH_SECS=$((JOB_TIME / 1000))
else
EPOCH_SECS=0
fi
echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS"
# Push metrics to Pushgateway
cat <<METRICS | curl -sf --data-binary @- "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor/task_id/$TASK_ID"
# HELP cloudsync_last_success_timestamp Last successful Cloud Sync completion (unix epoch)
# TYPE cloudsync_last_success_timestamp gauge
cloudsync_last_success_timestamp $EPOCH_SECS
# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)
# TYPE cloudsync_job_state gauge
cloudsync_job_state $([ "$JOB_STATE" = "SUCCESS" ] && echo 1 || echo 0)
METRICS
done
echo "Cloud Sync monitor complete"
EOT
]
env {
name = "TRUENAS_API_KEY"
value = var.truenas_api_key
}
resources {
requests = {
memory = "32Mi"
cpu = "10m"
}
limits = {
memory = "64Mi"
}
}
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
}
}
resource "kubernetes_manifest" "status_redirect_middleware" {
manifest = {
apiVersion = "traefik.io/v1alpha1"

View file

@ -35,5 +35,7 @@ resource "helm_release" "prometheus" {
# version = "15.0.2"
version = "25.8.2"
timeout = 900 # 15 min Recreate strategy + iSCSI reattach is slow
values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })]
}