add TrueNAS Cloud Sync monitor CronJob and bump Prometheus Helm timeout
- New cloudsync-monitor CronJob: queries the TrueNAS API every 6h and pushes metrics to Pushgateway
- Increase Prometheus Helm timeout to 900s for slow iSCSI reattach
This commit is contained in:
parent
e463281205
commit
e4cf0dee83
3 changed files with 105 additions and 0 deletions
|
|
@ -12,6 +12,11 @@ data "vault_kv_secret_v2" "secrets" {
|
||||||
name = "platform"
|
name = "platform"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
data "vault_kv_secret_v2" "viktor" {
  # KV v2 secret "viktor" on the "secret" mount; the monitoring module reads
  # its "truenas_api_key" entry from here.
  mount = "secret"
  name  = "viktor"
}
|
||||||
|
|
||||||
module "monitoring" {
|
module "monitoring" {
|
||||||
source = "./modules/monitoring"
|
source = "./modules/monitoring"
|
||||||
tls_secret_name = var.tls_secret_name
|
tls_secret_name = var.tls_secret_name
|
||||||
|
|
@ -26,4 +31,5 @@ module "monitoring" {
|
||||||
pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"]
|
pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"]
|
||||||
grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
|
grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
|
||||||
tier = local.tiers.cluster
|
tier = local.tiers.cluster
|
||||||
|
truenas_api_key = data.vault_kv_secret_v2.viktor.data["truenas_api_key"]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,10 @@ variable "grafana_admin_password" {
|
||||||
}
|
}
|
||||||
variable "tier" { type = string }
|
variable "tier" { type = string }
|
||||||
variable "mysql_host" { type = string }
|
variable "mysql_host" { type = string }
|
||||||
|
variable "truenas_api_key" {
  # Exposed to the cloudsync-monitor CronJob container as TRUENAS_API_KEY.
  description = "TrueNAS API key used by the cloudsync-monitor CronJob to query the Cloud Sync API."
  type        = string
  sensitive   = true
}
|
||||||
|
|
||||||
resource "kubernetes_namespace" "monitoring" {
|
resource "kubernetes_namespace" "monitoring" {
|
||||||
metadata {
|
metadata {
|
||||||
|
|
@ -90,6 +94,99 @@ resource "kubernetes_cron_job_v1" "monitor_prom" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Cloud Sync Monitor — check TrueNAS Cloud Sync job status, push to Pushgateway
|
||||||
|
# Runs every 6h. Alert fires if no successful sync in 8 days.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
resource "kubernetes_cron_job_v1" "cloudsync_monitor" {
  # Polls the TrueNAS Cloud Sync API every 6 hours and pushes one metric group
  # per Cloud Sync task (grouping key: job=cloudsync-monitor, task_id=<id>) to
  # the Prometheus Pushgateway.
  #
  # NOTE(review): the API key is injected as a literal env value, so it is
  # readable in the rendered CronJob spec, and it is sent to TrueNAS over plain
  # HTTP. Consider a Kubernetes Secret with secret_key_ref and an HTTPS endpoint.
  metadata {
    name      = "cloudsync-monitor"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "0 */6 * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 2
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            container {
              name = "cloudsync-monitor"
              # Pinned tag so scheduled runs do not silently pick up a new base image.
              image = "docker.io/library/alpine:3.20"
              command = ["/bin/sh", "-c", <<-EOT
                set -euo pipefail
                apk add --no-cache curl jq

                PUSHGATEWAY_URL="http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/cloudsync-monitor"

                # Query TrueNAS Cloud Sync tasks. -S keeps curl's own error message in
                # the pod log, and the timeouts stop an unresponsive TrueNAS from
                # blocking the job indefinitely.
                RESPONSE=$(curl -sSf --connect-timeout 10 --max-time 60 \
                  -H "Authorization: Bearer $TRUENAS_API_KEY" \
                  "http://10.0.10.15/api/v2.0/cloudsync") || {
                  echo "ERROR: Failed to query TrueNAS API"
                  exit 1
                }

                # Parse each task's most recent job and push its metrics
                echo "$RESPONSE" | jq -c '.[]' | while read -r task; do
                  TASK_ID=$(echo "$task" | jq -r '.id')
                  TASK_DESC=$(echo "$task" | jq -r '.description // "task-\(.id)"' | tr ' ' '_' | tr -cd '[:alnum:]_-')
                  JOB_STATE=$(echo "$task" | jq -r '.job.state // "UNKNOWN"')
                  # TrueNAS returns milliseconds since epoch; convert to whole seconds.
                  # A missing job or time_finished yields 0.
                  EPOCH_SECS=$(echo "$task" | jq -r '((.job.time_finished."$date" // 0) / 1000) | floor')

                  if [ "$JOB_STATE" = "SUCCESS" ]; then
                    STATE_VALUE=1
                  else
                    STATE_VALUE=0
                  fi

                  echo "Task $TASK_ID ($TASK_DESC): state=$JOB_STATE, last_finished=$EPOCH_SECS"

                  # Push metrics to Pushgateway. The request is a POST, which only
                  # replaces metrics whose names appear in the body. The success
                  # timestamp is therefore sent ONLY for a finished, successful run:
                  # a failed, running or never-run job leaves the previously pushed
                  # timestamp untouched, so the staleness alert keeps measuring time
                  # since the last real success.
                  {
                    echo "# HELP cloudsync_job_state Cloud Sync job state (1=SUCCESS, 0=other)"
                    echo "# TYPE cloudsync_job_state gauge"
                    echo "cloudsync_job_state $STATE_VALUE"
                    if [ "$STATE_VALUE" = "1" ] && [ "$EPOCH_SECS" != "0" ]; then
                      echo "# HELP cloudsync_last_success_timestamp Last successful Cloud Sync completion (unix epoch)"
                      echo "# TYPE cloudsync_last_success_timestamp gauge"
                      echo "cloudsync_last_success_timestamp $EPOCH_SECS"
                    fi
                  } | curl -sSf --connect-timeout 10 --max-time 30 --data-binary @- "$PUSHGATEWAY_URL/task_id/$TASK_ID"
                done

                echo "Cloud Sync monitor complete"
              EOT
              ]
              env {
                name  = "TRUENAS_API_KEY"
                value = var.truenas_api_key
              }
              resources {
                requests = {
                  memory = "32Mi"
                  cpu    = "10m"
                }
                limits = {
                  memory = "64Mi"
                }
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
}
|
||||||
|
|
||||||
resource "kubernetes_manifest" "status_redirect_middleware" {
|
resource "kubernetes_manifest" "status_redirect_middleware" {
|
||||||
manifest = {
|
manifest = {
|
||||||
apiVersion = "traefik.io/v1alpha1"
|
apiVersion = "traefik.io/v1alpha1"
|
||||||
|
|
|
||||||
|
|
@ -35,5 +35,7 @@ resource "helm_release" "prometheus" {
|
||||||
# version = "15.0.2"
|
# version = "15.0.2"
|
||||||
version = "25.8.2"
|
version = "25.8.2"
|
||||||
|
|
||||||
|
timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow
|
||||||
|
|
||||||
values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })]
|
values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue