[ci skip] fix post-NFS-migration issues: MySQL GR, Loki, grampsweb, alerts
- Loki: reduce memory limit from 6Gi to 4Gi (within LimitRange max)
- Grampsweb: increase memory to 2Gi (was OOMKilled at 512Mi)
- Fix PostgreSQLDown alert: check pod readiness instead of deployment
- Fix MySQLDown alert: check StatefulSet replicas instead of deployment
- Fix RedisDown alert: check StatefulSet replicas instead of deployment
- Fix NFSServerUnresponsive: aggregate all NFS versions cluster-wide
- Fix Uptime Kuma healthcheck: handle nested list heartbeat format
- Update etcd backup image to registry.k8s.io/etcd:3.6.5-0
This commit is contained in:
parent
065090dfe0
commit
a8e07ad930
6 changed files with 44 additions and 15 deletions
|
|
@@ -654,7 +654,11 @@ try:
|
|||
for mid, name in id_to_name.items():
|
||||
beats = heartbeats.get(mid, [])
|
||||
if beats:
|
||||
status = beats[-1].get("status", 0)
|
||||
last_beat = beats[-1]
|
||||
# Handle nested lists (some monitors return list of lists)
|
||||
if isinstance(last_beat, list):
|
||||
last_beat = last_beat[-1] if last_beat else {}
|
||||
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
|
||||
# Handle both enum and int (MonitorStatus.UP == 1)
|
||||
if status == 1:
|
||||
up_count += 1
|
||||
|
|
|
|||
|
|
@@ -618,7 +618,11 @@ try:
|
|||
for mid, name in id_to_name.items():
|
||||
beats = heartbeats.get(mid, [])
|
||||
if beats:
|
||||
status = beats[-1].get("status", 0)
|
||||
last_beat = beats[-1]
|
||||
# Handle nested lists (some monitors return list of lists)
|
||||
if isinstance(last_beat, list):
|
||||
last_beat = last_beat[-1] if last_beat else {}
|
||||
status = last_beat.get("status", 0) if isinstance(last_beat, dict) else 0
|
||||
if status == 1:
|
||||
up_count += 1
|
||||
elif status == 3:
|
||||
|
|
|
|||
|
|
@@ -186,11 +186,11 @@ resource "kubernetes_deployment" "grampsweb" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "50m"
|
||||
memory = "256Mi"
|
||||
memory = "512Mi"
|
||||
}
|
||||
limits = {
|
||||
cpu = "500m"
|
||||
memory = "512Mi"
|
||||
cpu = "1"
|
||||
memory = "2Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -256,7 +256,7 @@ resource "kubernetes_deployment" "grampsweb" {
|
|||
}
|
||||
limits = {
|
||||
cpu = "500m"
|
||||
memory = "512Mi"
|
||||
memory = "1Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -100,7 +100,7 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
|
|||
host_network = true
|
||||
container {
|
||||
name = "backup-etcd"
|
||||
image = "k8s.gcr.io/etcd-amd64:3.3.15"
|
||||
image = "registry.k8s.io/etcd:3.6.5-0"
|
||||
command = ["/bin/sh"]
|
||||
args = ["-c", "etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y_%m_%d_%H:%M:%S_%Z).db"]
|
||||
env {
|
||||
|
|
|
|||
|
|
@@ -70,10 +70,10 @@ singleBinary:
|
|||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 4Gi
|
||||
memory: 2Gi
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 6Gi
|
||||
memory: 4Gi
|
||||
|
||||
# Zero out replica counts of other deployment modes
|
||||
backend:
|
||||
|
|
|
|||
|
|
@@ -335,6 +335,13 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
|
||||
- alert: NFSServerUnresponsive
|
||||
expr: sum(rate(node_nfs_requests_total[5m])) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "All NFS operations across the cluster are zero for 10m — TrueNAS (10.0.10.15) may be down"
|
||||
- name: K8s Health
|
||||
rules:
|
||||
- alert: PodCrashLooping
|
||||
|
|
@@ -408,29 +415,43 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
|
||||
- alert: EtcdBackupStale
|
||||
expr: (time() - kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"}) > 129600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "etcd backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
|
||||
- alert: EtcdBackupNeverSucceeded
|
||||
expr: kube_cronjob_status_last_successful_time{cronjob="backup-etcd", namespace="default"} == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "etcd backup CronJob has never completed successfully"
|
||||
- name: Critical Services
|
||||
rules:
|
||||
- alert: PostgreSQLDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
|
||||
expr: kube_pod_status_ready{namespace="dbaas", pod=~"pg-cluster-.*", condition="true"} != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL has no available replicas"
|
||||
summary: "PostgreSQL pod {{ $labels.pod }} is not ready"
|
||||
- alert: MySQLDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
|
||||
expr: kube_statefulset_status_replicas_ready{namespace="dbaas", statefulset="mysql-cluster"} < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "MySQL has no available replicas"
|
||||
summary: "MySQL InnoDB Cluster has no ready replicas"
|
||||
- alert: RedisDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
|
||||
expr: kube_statefulset_status_replicas_ready{namespace="redis", statefulset="redis-node"} < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis has no available replicas"
|
||||
summary: "Redis has no ready replicas"
|
||||
- alert: HeadscaleDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue