diff --git a/.claude/cluster-health.sh b/.claude/cluster-health.sh
index 640a161e..d14187c0 100755
--- a/.claude/cluster-health.sh
+++ b/.claude/cluster-health.sh
@@ -654,7 +654,11 @@ try:
     for mid, name in id_to_name.items():
         beats = heartbeats.get(mid, [])
         if beats:
-            status = beats[-1].get("status", 0)
+            last_beat = beats[-1]
+            # Handle nested lists (some monitors return list of lists)
+            if isinstance(last_beat, list):
+                last_beat = last_beat[-1] if last_beat else {}
+            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
             # Handle both enum and int (MonitorStatus.UP == 1)
             if status == 1:
                 up_count += 1
diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index 4f2ccb6a..8e386d30 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -618,7 +618,11 @@ try:
     for mid, name in id_to_name.items():
         beats = heartbeats.get(mid, [])
         if beats:
-            status = beats[-1].get("status", 0)
+            last_beat = beats[-1]
+            # Handle nested lists (some monitors return list of lists)
+            if isinstance(last_beat, list):
+                last_beat = last_beat[-1] if last_beat else {}
+            status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
             if status == 1:
                 up_count += 1
             elif status == 3:
diff --git a/stacks/grampsweb/main.tf b/stacks/grampsweb/main.tf
index 5d8799f1..1147aec8 100644
--- a/stacks/grampsweb/main.tf
+++ b/stacks/grampsweb/main.tf
@@ -186,11 +186,11 @@ resource "kubernetes_deployment" "grampsweb" {
           resources {
             requests = {
               cpu    = "50m"
-              memory = "256Mi"
+              memory = "512Mi"
             }
             limits = {
-              cpu    = "500m"
-              memory = "512Mi"
+              cpu    = "1"
+              memory = "2Gi"
             }
           }
         }
@@ -256,7 +256,7 @@ resource "kubernetes_deployment" "grampsweb" {
             }
             limits = {
               cpu    = "500m"
-              memory = "512Mi"
+              memory = "1Gi"
             }
           }
         }
diff --git a/stacks/platform/modules/infra-maintenance/main.tf b/stacks/platform/modules/infra-maintenance/main.tf
index 0eeb68b2..6df6c57a 100644
--- a/stacks/platform/modules/infra-maintenance/main.tf
+++ b/stacks/platform/modules/infra-maintenance/main.tf
@@ -100,7 +100,7 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
           host_network = true
           container {
             name  = "backup-etcd"
-            image = "k8s.gcr.io/etcd-amd64:3.3.15"
+            image = "registry.k8s.io/etcd:3.6.5-0"
             command = ["/bin/sh"]
             args    = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
             env {
diff --git a/stacks/platform/modules/monitoring/loki.yaml b/stacks/platform/modules/monitoring/loki.yaml
index 63be79f8..53996f5a 100644
--- a/stacks/platform/modules/monitoring/loki.yaml
+++ b/stacks/platform/modules/monitoring/loki.yaml
@@ -70,10 +70,10 @@ singleBinary:
   resources:
     requests:
       cpu: 250m
-      memory: 4Gi
+      memory: 2Gi
     limits:
       cpu: "1"
-      memory: 6Gi
+      memory: 4Gi
 
 # Zero out replica counts of other deployment modes
 backend:
diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
index dfbc67a8..8a7e097c 100755
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@@ -335,6 +335,13 @@ serverFiles:
               severity: warning
             annotations:
               summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
+          - alert: NFSServerUnresponsive
+            expr: sum(rate(node_nfs_requests_total[5m])) == 0
+            for: 10m
+            labels:
+              severity: critical
+            annotations:
+              summary: "All NFS operations across the cluster are zero for 10m — TrueNAS (10.0.10.15) may be down"
       - name: K8s Health
         rules:
           - alert: PodCrashLooping
@@ -408,29 +415,43 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
+          - alert: EtcdBackupStale
+            expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 129600
+            for: 30m
+            labels:
+              severity: critical
+            annotations:
+              summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
+          - alert: EtcdBackupNeverSucceeded
+            expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
+            for: 1h
+            labels:
+              severity: critical
+            annotations:
+              summary: "etcd backup CronJob has never completed successfully"
       - name: Critical Services
         rules:
           - alert: PostgreSQLDown
-            expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
+            expr: kube_pod_status_ready{namespace="dbaas", pod=~"pg-cluster-.*", condition="true"} != 1
             for: 5m
             labels:
              severity: critical
            annotations:
-              summary: "PostgreSQL has no available replicas"
+              summary: "PostgreSQL pod {{ $labels.pod }} is not ready"
          - alert: MySQLDown
-            expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
+            expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1
            for: 5m
            labels:
              severity: critical
            annotations:
-              summary: "MySQL has no available replicas"
+              summary: "MySQL InnoDB Cluster has no ready replicas"
          - alert: RedisDown
-            expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
+            expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1
            for: 5m
            labels:
              severity: critical
            annotations:
-              summary: "Redis has no available replicas"
+              summary: "Redis has no ready replicas"
          - alert: HeadscaleDown
            expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
            for: 5m