[ci skip] fix post-NFS-migration issues: MySQL GR, Loki, grampsweb, alerts

- Loki: reduce memory limit from 6Gi to 4Gi (within LimitRange max)
- Grampsweb: increase memory limit to 2Gi (was OOMKilled at 512Mi)
- Fix PostgreSQLDown alert: check readiness of the pg-cluster-* pods instead of Deployment replicas
- Fix MySQLDown alert: check ready replicas of the mysql-cluster StatefulSet instead of Deployment replicas
- Fix RedisDown alert: check ready replicas of the redis-node StatefulSet instead of Deployment replicas
- Fix NFSServerUnresponsive: aggregate all NFS versions cluster-wide
- Fix Uptime Kuma healthcheck: handle nested list heartbeat format
- Update etcd backup image to registry.k8s.io/etcd:3.6.5-0
- Add EtcdBackupStale and EtcdBackupNeverSucceeded alerts for the backup-etcd CronJob
This commit is contained in:
Viktor Barzin 2026-03-03 21:10:26 +00:00
parent 065090dfe0
commit a8e07ad930
No known key found for this signature in database
GPG key ID: 0EB088298288D958
6 changed files with 44 additions and 15 deletions

View file

@ -654,7 +654,11 @@ try:
for mid, name in id_to_name.items():
beats = heartbeats.get(mid, [])
if beats:
status = beats[-1].get("status", 0)
last_beat = beats[-1]
# Handle nested lists (some monitors return list of lists)
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
# Handle both enum and int (MonitorStatus.UP == 1)
if status == 1:
up_count += 1

View file

@ -618,7 +618,11 @@ try:
for mid, name in id_to_name.items():
beats = heartbeats.get(mid, [])
if beats:
status = beats[-1].get("status", 0)
last_beat = beats[-1]
# Handle nested lists (some monitors return list of lists)
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
if status == 1:
up_count += 1
elif status == 3:

View file

@ -186,11 +186,11 @@ resource "kubernetes_deployment" "grampsweb" {
resources {
requests = {
cpu = "50m"
memory = "256Mi"
memory = "512Mi"
}
limits = {
cpu = "500m"
memory = "512Mi"
cpu = "1"
memory = "2Gi"
}
}
}
@ -256,7 +256,7 @@ resource "kubernetes_deployment" "grampsweb" {
}
limits = {
cpu = "500m"
memory = "512Mi"
memory = "1Gi"
}
}
}

View file

@ -100,7 +100,7 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
host_network = true
container {
name = "backup-etcd"
image = "k8s.gcr.io/etcd-amd64:3.3.15"
image = "registry.k8s.io/etcd:3.6.5-0"
command = ["/bin/sh"]
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
env {

View file

@ -70,10 +70,10 @@ singleBinary:
resources:
requests:
cpu: 250m
memory: 4Gi
memory: 2Gi
limits:
cpu: "1"
memory: 6Gi
memory: 4Gi
# Zero out replica counts of other deployment modes
backend:

View file

@ -335,6 +335,13 @@ serverFiles:
severity: warning
annotations:
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
- alert: NFSServerUnresponsive
expr: sum(rate(node_nfs_requests_total[5m])) == 0
for: 10m
labels:
severity: critical
annotations:
summary: "All NFS operations across the cluster are zero for 10m — TrueNAS (10.0.10.15) may be down"
- name: K8s Health
rules:
- alert: PodCrashLooping
@ -408,29 +415,43 @@ serverFiles:
severity: warning
annotations:
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
- alert: EtcdBackupStale
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 129600
for: 30m
labels:
severity: critical
annotations:
summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: EtcdBackupNeverSucceeded
expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
for: 1h
labels:
severity: critical
annotations:
summary: "etcd backup CronJob has never completed successfully"
- name: Critical Services
rules:
- alert: PostgreSQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
expr: kube_pod_status_ready{namespace="dbaas", pod=~"pg-cluster-.*", condition="true"} != 1
for: 5m
labels:
severity: critical
annotations:
summary: "PostgreSQL has no available replicas"
summary: "PostgreSQL pod {{ $labels.pod }} is not ready"
- alert: MySQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1
for: 5m
labels:
severity: critical
annotations:
summary: "MySQL has no available replicas"
summary: "MySQL InnoDB Cluster has no ready replicas"
- alert: RedisDown
expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis has no available replicas"
summary: "Redis has no ready replicas"
- alert: HeadscaleDown
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
for: 5m