[ci skip] fix post-NFS-migration issues: MySQL GR, Loki, grampsweb, alerts

- Loki: reduce memory request from 4Gi to 2Gi and limit from 6Gi to 4Gi (within LimitRange max)
- Grampsweb: increase memory limits from 512Mi to 2Gi and 1Gi (was OOMKilled at 512Mi); raise memory request to 512Mi and CPU limit to 1
- Fix PostgreSQLDown alert: check pod readiness instead of deployment
- Fix MySQLDown alert: check StatefulSet replicas instead of deployment
- Fix RedisDown alert: check StatefulSet replicas instead of deployment
- Fix NFSServerUnresponsive: aggregate all NFS versions cluster-wide
- Fix Uptime Kuma healthcheck: handle nested list heartbeat format
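- Update etcd backup image from k8s.gcr.io/etcd-amd64:3.3.15 to registry.k8s.io/etcd:3.6.5-0
- Add EtcdBackupStale (>36h) and EtcdBackupNeverSucceeded alerts for the backup-etcd CronJob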
This commit is contained in:
Viktor Barzin 2026-03-03 21:10:26 +00:00
parent 065090dfe0
commit a8e07ad930
No known key found for this signature in database
GPG key ID: 0EB088298288D958
6 changed files with 44 additions and 15 deletions

View file

@ -654,7 +654,11 @@ try:
for mid, name in id_to_name.items(): for mid, name in id_to_name.items():
beats = heartbeats.get(mid, []) beats = heartbeats.get(mid, [])
if beats: if beats:
status = beats[-1].get("status", 0) last_beat = beats[-1]
# Handle nested lists (some monitors return list of lists)
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
# Handle both enum and int (MonitorStatus.UP == 1) # Handle both enum and int (MonitorStatus.UP == 1)
if status == 1: if status == 1:
up_count += 1 up_count += 1

View file

@ -618,7 +618,11 @@ try:
for mid, name in id_to_name.items(): for mid, name in id_to_name.items():
beats = heartbeats.get(mid, []) beats = heartbeats.get(mid, [])
if beats: if beats:
status = beats[-1].get("status", 0) last_beat = beats[-1]
# Handle nested lists (some monitors return list of lists)
if isinstance(last_beat, list):
last_beat = last_beat[-1] if last_beat else {}
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
if status == 1: if status == 1:
up_count += 1 up_count += 1
elif status == 3: elif status == 3:

View file

@ -186,11 +186,11 @@ resource "kubernetes_deployment" "grampsweb" {
resources { resources {
requests = { requests = {
cpu = "50m" cpu = "50m"
memory = "256Mi" memory = "512Mi"
} }
limits = { limits = {
cpu = "500m" cpu = "1"
memory = "512Mi" memory = "2Gi"
} }
} }
} }
@ -256,7 +256,7 @@ resource "kubernetes_deployment" "grampsweb" {
} }
limits = { limits = {
cpu = "500m" cpu = "500m"
memory = "512Mi" memory = "1Gi"
} }
} }
} }

View file

@ -100,7 +100,7 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
host_network = true host_network = true
container { container {
name = "backup-etcd" name = "backup-etcd"
image = "k8s.gcr.io/etcd-amd64:3.3.15" image = "registry.k8s.io/etcd:3.6.5-0"
command = ["/bin/sh"] command = ["/bin/sh"]
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"] args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
env { env {

View file

@ -70,10 +70,10 @@ singleBinary:
resources: resources:
requests: requests:
cpu: 250m cpu: 250m
memory: 4Gi memory: 2Gi
limits: limits:
cpu: "1" cpu: "1"
memory: 6Gi memory: 4Gi
# Zero out replica counts of other deployment modes # Zero out replica counts of other deployment modes
backend: backend:

View file

@ -335,6 +335,13 @@ serverFiles:
severity: warning severity: warning
annotations: annotations:
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h" summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
- alert: NFSServerUnresponsive
expr: sum(rate(node_nfs_requests_total[5m])) == 0
for: 10m
labels:
severity: critical
annotations:
summary: "All NFS operations across the cluster are zero for 10m — TrueNAS (10.0.10.15) may be down"
- name: K8s Health - name: K8s Health
rules: rules:
- alert: PodCrashLooping - alert: PodCrashLooping
@ -408,29 +415,43 @@ serverFiles:
severity: warning severity: warning
annotations: annotations:
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s" summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
- alert: EtcdBackupStale
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 129600
for: 30m
labels:
severity: critical
annotations:
summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: EtcdBackupNeverSucceeded
expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
for: 1h
labels:
severity: critical
annotations:
summary: "etcd backup CronJob has never completed successfully"
- name: Critical Services - name: Critical Services
rules: rules:
- alert: PostgreSQLDown - alert: PostgreSQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1 expr: kube_pod_status_ready{namespace="dbaas", pod=~"pg-cluster-.*", condition="true"} != 1
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "PostgreSQL has no available replicas" summary: "PostgreSQL pod {{ $labels.pod }} is not ready"
- alert: MySQLDown - alert: MySQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1 expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "MySQL has no available replicas" summary: "MySQL InnoDB Cluster has no ready replicas"
- alert: RedisDown - alert: RedisDown
expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1 expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Redis has no available replicas" summary: "Redis has no ready replicas"
- alert: HeadscaleDown - alert: HeadscaleDown
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1 expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
for: 5m for: 5m