# -----------------------------------------------------------------------------
# Input variables
# -----------------------------------------------------------------------------

# Name of the TLS secret; forwarded to the setup_tls_secret module below.
variable "tls_secret_name" {}

# NOTE(review): unlike the other credentials in this file, the two
# alertmanager_* inputs are not marked `sensitive` — confirm that is deliberate.
variable "alertmanager_account_password" {}
variable "alertmanager_slack_api_url" {}

# iDRAC connection settings. They are not referenced in the visible part of
# this file; presumably consumed by resources further down.
variable "idrac_host" {
  default = "192.168.1.4"
}

variable "idrac_username" {
  default = "root"
}

# NOTE(review): this default is a hard-coded credential — consider requiring
# callers to supply the value instead.
variable "idrac_password" {
  default   = "calvin"
  sensitive = true
}

# Secrets / tokens for integrations configured elsewhere in this module.
variable "tiny_tuya_service_secret" {
  type      = string
  sensitive = true
}

variable "haos_api_token" {
  type      = string
  sensitive = true
}

variable "pve_password" {
  type      = string
  sensitive = true
}

variable "grafana_admin_password" {
  type      = string
  sensitive = true
}

# Applied as the `tier` label on the monitoring namespace.
variable "tier" {
  type = string
}

variable "mysql_host" {
  type = string
}

variable "truenas_api_key" {
  type      = string
  sensitive = true
}

# -----------------------------------------------------------------------------
# Namespace and TLS secret
# -----------------------------------------------------------------------------

# The "monitoring" namespace that the resources in this module reference.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"
    labels = {
      "istio-injection"                  = "disabled"
      tier                               = var.tier
      "resource-governance/custom-quota" = "true"
    }
  }
}

# Sets up the TLS secret inside the monitoring namespace via the shared module.
module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  tls_secret_name = var.tls_secret_name
}

# Terraform gets angry with the 30k values file :/ use Ansible until solved
# resource "helm_release" "ups_prometheus_snmp_exporter" {
#   namespace        = kubernetes_namespace.monitoring.metadata[0].name
#   create_namespace = true
#   name             = "ups_prometheus_exporter"
#   repository = "https://prometheus-community.github.io/helm-charts"
#   chart      = "prometheus-snmp-exporter"
#   values = [file("${path.module}/ups_snmp_values.yaml")]
# }

# -----------------------------------------------------------------------------
# Prometheus watchdog — every 30 minutes, probe the Prometheus server service
# and POST a message to the webhook if the probe (or the curl install) fails.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "monitor_prom" {
  # No namespace is set here, so the job is not created in the "monitoring"
  # namespace declared above; the probe therefore uses the service's
  # fully-qualified cluster DNS name.
  metadata {
    name = "monitor-prometheus"
  }

  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "*/30 * * * *"

    job_template {
      metadata {}

      spec {
        template {
          metadata {}

          spec {
            container {
              name  = "monitor-prometheus"
              image = "alpine"

              # Shell chain is `install && probe || notify`: the webhook call
              # runs when either the curl install or the probe fails.
              #
              # --connect-timeout only bounds the TCP connect. --max-time caps
              # the whole request, so a Prometheus that accepts the connection
              # but never responds is reported as down instead of leaving curl
              # blocked until the next run replaces this job.
              #
              # NOTE(review): without --fail, an HTTP error status from the
              # server still counts as "up" — confirm whether that is intended.
              command = [
                "/bin/sh",
                "-c",
                "apk add --update curl && curl --connect-timeout 2 --max-time 10 prometheus-server.monitoring.svc.cluster.local || curl https://webhook.viktorbarzin.me/fb/message-viktor -d 'Prometheus is down!'",
              ]
            }
          }
        }
      }
    }
  }
}
# ----------------------------------------------------------------------------- # Cloud Sync Monitor — check TrueNAS Cloud Sync job status, push to Pushgateway # Runs every 6h. Alert fires if no successful sync in 8 days. # ----------------------------------------------------------------------------- resource "kubernetes_cron_job_v1" "cloudsync_monitor" { metadata { name = "cloudsync-monitor" namespace = kubernetes_namespace.monitoring.metadata[0].name } spec { concurrency_policy = "Replace" failed_jobs_history_limit = 3 successful_jobs_history_limit = 3 schedule = "0 */6 * * *" job_template { metadata {} spec { backoff_limit = 2 ttl_seconds_after_finished = 300 template { metadata {} spec { container { name = "cloudsync-monitor" image = "docker.io/library/alpine" command = ["/bin/sh", "-c", <<-EOT set -euo pipefail apk add --no-cache curl jq # Query TrueNAS Cloud Sync tasks RESPONSE=$(curl -sf -H "Authorization: Bearer $TRUENAS_API_KEY" \ "http://10.0.10.15/api/v2.0/cloudsync" 2>&1) || { echo "ERROR: Failed to query TrueNAS API" exit 1 } # Parse each task's last successful run echo "$RESPONSE" | jq -c '.[]' | while read -r task; do TASK_ID=$(echo "$task" | jq -r '.id') TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-') JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"') JOB_TIME=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') if [ "$JOB_TIME" != "0" ] && [ "$JOB_TIME" != "null" ]; then # TrueNAS returns milliseconds since epoch EPOCH_SECS=$((JOB_TIME / 1000)) else EPOCH_SECS=0 fi # Extract transfer stats from job progress description # Format: "1182 / 1182, 3.928 GiB / 3.928 GiB, 8.737 MiB/s, ..." 
JOB_PROGRESS=$(echo "$task" | jq -r '.job.progress.description // ""') TX_TOTAL=$(echo "$JOB_PROGRESS" | awk -F', ' '{split($2, a, " / "); print a[2]}') TX_NUM=$(echo "$TX_TOTAL" | awk '{print $1}') TX_NUM=$${TX_NUM:-0} TX_UNIT=$(echo "$TX_TOTAL" | awk '{print $2}') TX_UNIT=$${TX_UNIT:-Bytes} case "$TX_UNIT" in Bytes|B) TX_MULT=1 ;; KiB|kB) TX_MULT=1024 ;; MiB|MB) TX_MULT=1048576 ;; GiB|GB) TX_MULT=1073741824 ;; *) TX_MULT=1 ;; esac TRANSFERRED_BYTES=$(echo "$TX_NUM $TX_MULT" | awk '{printf "%.0f", $1 * $2}') JOB_STARTED=$(echo "$task" | jq -r '.job.time_started."$date" // 0') JOB_FINISHED=$(echo "$task" | jq -r '.job.time_finished."$date" // 0') if [ "$JOB_STARTED" != "0" ] && [ "$JOB_STARTED" != "null" ] && [ "$JOB_FINISHED" != "0" ] && [ "$JOB_FINISHED" != "null" ]; then SYNC_DURATION=$(( (JOB_FINISHED - JOB_STARTED) / 1000 )) else SYNC_DURATION=0 fi echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS, duration=$${SYNC_DURATION}s" # Push metrics to Pushgateway cat <&1) || { echo "ERROR: Failed to query Technitium stats API" exit 1 } # Parse key metrics TOTAL_QUERIES=$(echo "$STATS" | jq -r '.response.stats.totalQueries // 0') SERVER_FAILURE=$(echo "$STATS" | jq -r '.response.stats.serverFailure // 0') NX_DOMAIN=$(echo "$STATS" | jq -r '.response.stats.nxDomain // 0') BLOCKED=$(echo "$STATS" | jq -r '.response.stats.blocked // 0') NO_ERROR=$(echo "$STATS" | jq -r '.response.stats.noError // 0') echo "DNS Stats (last hour): total=$TOTAL_QUERIES noError=$NO_ERROR nxDomain=$NX_DOMAIN serverFailure=$SERVER_FAILURE blocked=$BLOCKED" # Get top clients for anomaly context TOP_CLIENTS=$(curl -sf "$TECHNITIUM_URL/api/stats/getTopClients?token=&type=LastHour&limit=10" 2>&1) || true # Get top domains for DGA/tunneling detection TOP_DOMAINS=$(curl -sf "$TECHNITIUM_URL/api/stats/getTopDomains?token=&type=LastHour&limit=20" 2>&1) || true # Check for high-entropy domains (potential DGA) DGA_SUSPECT=0 if [ -n "$TOP_DOMAINS" ]; then # Simple heuristic: 
domains with many consonant clusters or very long labels DGA_SUSPECT=$(echo "$TOP_DOMAINS" | jq -r '[.response.topDomains[]?.name // empty | select(length > 30 or test("[bcdfghjklmnpqrstvwxyz]{5,}"))] | length') fi # Push metrics to Pushgateway cat <