New alert TuyaCloudDown fires when any *_tuya_cloud_up gauge == 0 (i.e., the Tuya Cloud API rejects scrape calls — the symptom during last night's iot.tuya.com trial expiry, code=28841002). 5m for-duration beats the 15m window of the seven downstream *MetricsMissing alerts, so the new Alertmanager inhibit rule suppresses the per-device noise and only TuyaCloudDown pages. Also flips helm_release.prometheus.force_update from true to false: force_update was tripping on the pushgateway PVC added in rev 188 (commit e51c104) — Helm's --force path tried to reset spec.volumeName on a bound PVC. Disabled here; re-enable temporarily when a StatefulSet volumeClaimTemplate change actually needs --force. Bundled with pre-existing working-tree additions for Fuse/Thermostat threshold alerts and expanded PowerOutage inhibit regex (landed in the same Helm revision 190). Verified: rule loaded, value=7 (all 7 tuya-bridge devices report cloud_up=0 right now), TuyaCloudDown moved pending→firing after 5m, 3 *MetricsMissing alerts currently suppressed in Alertmanager with inhibitedBy=1 (thermostat alerts still pending their 15m window, will be suppressed on transition).
50 lines
1.7 KiB
HCL
50 lines
1.7 KiB
HCL
|
|
|
|
# PersistentVolumeClaim for Prometheus server data in the monitoring namespace.
#
# The resize.topolvm.io/* annotations hand volume growth over to an in-cluster
# auto-resizer (presumably topolvm/pvc-autoresizer — confirm it is deployed and
# watches the "proxmox-lvm" storage class). The resizer edits
# spec.resources.requests.storage on the live object, so Terraform must not
# try to reconcile that field back to the initial size declared below.
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    name      = "prometheus-data-proxmox"
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    annotations = {
      # NOTE(review): upstream pvc-autoresizer documents "threshold" as a
      # *free-space* threshold (expand when free space drops below it). If that
      # is the controller in use, 90% triggers expansion at ~10% usage —
      # confirm this value is intended.
      "resize.topolvm.io/threshold" = "90%"
      # Growth step applied on each expansion.
      "resize.topolvm.io/increase" = "10%"
      # Upper bound the resizer may grow the claim to.
      "resize.topolvm.io/storage_limit" = "500Gi"
    }
  }

  spec {
    access_modes       = ["ReadWriteOnce"]
    storage_class_name = "proxmox-lvm"

    resources {
      requests = {
        # Initial size only: after creation the requested size is owned by the
        # auto-resizer (see lifecycle.ignore_changes below).
        storage = "200Gi"
      }
    }
  }

  lifecycle {
    # The auto-resizer raises requests.storage out-of-band (up to storage_limit).
    # Without this, every plan after the first expansion shows drift and an
    # apply attempts to revert the size to 200Gi, which Kubernetes rejects
    # because a PVC request cannot be reduced. To resize manually through
    # Terraform, temporarily remove this entry.
    ignore_changes = [spec[0].resources[0].requests]
  }
}
|
|
|
|
# Instantiates the shared nfs_volume module for the Prometheus backup export.
# What the module creates is defined in modules/kubernetes/nfs_volume (not
# visible here) — presumably an NFS-backed PV/PVC pair; verify against the module.
module "nfs_prometheus_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  # Identity of the volume inside the cluster.
  name      = "monitoring-prometheus-backup-host"
  namespace = kubernetes_namespace.monitoring.metadata[0].name

  # NFS export that backs the volume.
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs/prometheus-backup"
}
|
|
|
|
# Helm release of the prometheus-community "prometheus" chart, installed into
# the monitoring namespace and configured from prometheus_chart_values.tpl.
resource "helm_release" "prometheus" {
  name             = "prometheus"
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus"
  # Previously pinned: version = "15.0.2"
  version = "25.8.2"

  # 15 min — Recreate strategy + iSCSI reattach is slow
  timeout = 900

  # Kept off since 2026-04-23. With force_update enabled, Helm's --force path
  # tried to replace the already-bound pushgateway PVC (introduced in rev 188,
  # commit e51c104), and that object's bound spec cannot be rewritten.
  # Switch it on only temporarily, when a StatefulSet volumeClaimTemplate
  # change genuinely requires --force.
  force_update = false

  # Chart values are rendered from the template next to this module; the map
  # below supplies the variables the template is called with.
  values = [
    templatefile("${path.module}/prometheus_chart_values.tpl", {
      alertmanager_mail_pass     = var.alertmanager_account_password
      alertmanager_slack_api_url = var.alertmanager_slack_api_url
      tuya_api_key               = var.tiny_tuya_service_secret
      haos_api_token             = var.haos_api_token
    })
  ]
}
|