diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index ab9c903c..d2fc0ce8 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -54,46 +54,6 @@ The **bypass list** (leg 2) is just `/srv/nfs/immich/` — too big for sda (1.5 ## Architecture Diagram -### Data Routing — where each path goes (post-2026-05-26) - -```mermaid -flowchart LR - classDef live fill:#e1f5ff,stroke:#01579b - classDef sda fill:#fff9c4,stroke:#f57f17 - classDef syn fill:#c8e6c9,stroke:#1b5e20 - classDef none fill:#ffcdd2,stroke:#b71c1c - - subgraph sdc["sdc /srv/nfs/ — Tier 1 live"] - IMM["immich/ 1.5T"]:::live - FRI["frigate/ 131G"]:::live - TMP["temp/ 12G"]:::live - ANE["anca-elements/ 771G
legacy"]:::live - APP["everything else
(mysql, postgresql, nextcloud,
mailserver, servarr, audiobookshelf,
ollama, audiblez, ebook2audiobook,
*-backup CronJob outputs, …)"]:::live - end - - subgraph sdcssd["sdc /srv/nfs-ssd/"] - IMM_ML["immich/ 62G"]:::live - OLL_S["ollama/ 59G"]:::live - LLA["llamacpp/ 26G"]:::live - end - - SDA[("sda /mnt/backup/
Tier 2 local")]:::sda - SYN_PVE[("Synology
/Viki/pve-backup/")]:::syn - SYN_NFS[("Synology
/Viki/nfs/")]:::syn - SYN_SSD[("Synology
/Viki/nfs-ssd/")]:::syn - NOPE([NOT BACKED UP]):::none - - APP -- "nfs-mirror daily 02:00" --> SDA - SDA -- "offsite-sync Step 1
daily 06:00" --> SYN_PVE - IMM -- "Step 2 inotify direct
daily 06:00" --> SYN_NFS - IMM_ML --> SYN_SSD - OLL_S --> SYN_SSD - LLA --> SYN_SSD - FRI --- NOPE - TMP --- NOPE - ANE --- NOPE -``` - ### Overall Backup Flow ```mermaid @@ -103,24 +63,18 @@ graph TB sda["sda: 1.1TB RAID1 SAS
VG backup, LV data (ext4)
/mnt/backup"] subgraph Layer1["Layer 1: LVM Thin Snapshots"] - Snap["Twice daily 00:00, 12:00
7-day retention
62 PVCs (excludes dbaas+monitoring)"] + Snap["Daily 03:00
7-day retention
62 PVCs (excludes dbaas+monitoring)"] end - subgraph Layer2a["Layer 2a: Daily NFS Mirror (nfs-mirror)"] - NFSMirror["Daily 02:00
/srv/nfs/* → /mnt/backup//
excludes: immich, frigate, temp, anca-elements"] - end - - subgraph Layer2b["Layer 2b: Daily PVC File Backup (daily-backup)"] - PVCBackup["PVC File Copy
Daily 05:00
4 weekly versions via --link-dest
/mnt/backup/pvc-data//"] + subgraph Layer2["Layer 2: Weekly File Backup"] + PVCBackup["PVC File Copy
Daily 05:00
4 weekly versions
/mnt/backup/pvc-data//"] SQLiteBackup["Auto SQLite Backup
magic number check + ?mode=ro
from PVC snapshots"] PfsenseBackup["pfSense Backup
config.xml + full tar
4 weekly versions"] PVEConfig["PVE Config
/etc/pve + scripts"] end sdc --> Snap - sdc --> NFSMirror sdc --> PVCBackup - NFSMirror --> sda PVCBackup --> sda SQLiteBackup --> sda PfsenseBackup --> sda @@ -128,72 +82,63 @@ graph TB end subgraph NFS_Storage["Proxmox NFS (/srv/nfs)"] - NFS_Backup["NFS *-backup dirs
(populated by in-cluster CronJobs)"] + NFS_Backup["NFS dirs
/srv/nfs/*-backup/"] subgraph AppBackups["App-Level Backup CronJobs"] CronDaily["Daily 00:00-00:30
PostgreSQL, MySQL
14d retention"] - CronWeekly["Weekly Sunday
etcd, Vault, Redis
Vaultwarden 6h
30d retention"] + CronWeekly["Weekly Sunday
etcd, Vault, Redis
Vaultwarden
30d retention"] end CronDaily --> NFS_Backup CronWeekly --> NFS_Backup - NFS_Backup --> NFSMirror end - subgraph Layer3["Layer 3: Offsite Sync (offsite-sync-backup, daily 06:00)"] - PVEOffsite["Step 1: sda → Synology
/Viki/pve-backup/
incremental via manifest"] - NFSOffsite["Step 2: sdc/immich + nfs-ssd → Synology
/Viki/nfs/ + /Viki/nfs-ssd/
inotify change-tracked"] + subgraph Layer3["Layer 3: Offsite Sync"] + PVEOffsite["Step 1: sda → Synology
Daily 06:00
pve-backup/ only"] + NFSOffsite["Step 2: NFS → Synology
inotify change-tracked
rsync --files-from
nfs/ + nfs-ssd/"] end sda --> PVEOffsite - NFS_Storage -. "/srv/nfs/immich only" .-> NFSOffsite + NFS_Storage --> NFSOffsite - Synology["Synology NAS
192.168.1.13
520 GB free / 5.3 TB total"] + Synology["Synology NAS
192.168.1.13
Offsite protection"] PVEOffsite --> Synology NFSOffsite --> Synology + NFS_Backup -.->|app-level dumps| NFS_Storage + subgraph Monitoring["Monitoring & Alerting"] - Prometheus["Prometheus Alerts
PostgreSQLBackupStale, MySQLBackupStale
NfsMirrorStale, OffsiteBackupSyncStale
LVMSnapshotStale, BackupDiskFull
VaultwardenIntegrityFail"] + Prometheus["Prometheus Alerts
PostgreSQLBackupStale, MySQLBackupStale
WeeklyBackupStale, OffsiteBackupSyncStale
LVMSnapshotStale, BackupDiskFull
VaultwardenIntegrityFail"] Pushgateway["Pushgateway
backup script metrics
vaultwarden integrity"] end PVCBackup -.->|push metrics| Pushgateway - NFSMirror -.->|push metrics| Pushgateway - PVEOffsite -.->|push metrics| Pushgateway Snap -.->|push metrics| Pushgateway Pushgateway --> Prometheus style Layer1 fill:#c8e6c9 - style Layer2a fill:#ffe0b2 - style Layer2b fill:#ffe0b2 + style Layer2 fill:#ffe0b2 style Layer3 fill:#e1f5ff style Monitoring fill:#f3e5f5 ``` -### Daily Backup Timeline (EEST) +### Weekly Backup Timeline ```mermaid graph LR - subgraph Continuous["Continuous"] - INO["nfs-change-tracker
inotify on /srv/nfs[-ssd]
writes /mnt/backup/.nfs-changes.log"] + subgraph Sunday["Sunday Timeline"] + S01["01:00 etcd backup
(CronJob)"] + S02["02:00 Vault backup
(CronJob)"] + S03a["03:00 Redis backup
(CronJob)"] + S03b["03:00 LVM snapshots
(lvm-pvc-snapshot timer)"] + S05["05:00 Daily backup
(daily-backup timer)
1. PVC file copy (auto-discovered BACKUP_DIRS)
2. Auto SQLite backup (magic number + ?mode=ro)
3. pfSense backup
4. PVE config
5. Prune snapshots"] + S08["08:00 Offsite sync
(offsite-sync-backup timer)
Step 1: sda → Synology pve-backup/
Step 2: NFS → Synology nfs/ + nfs-ssd/
(inotify change-tracked)"] end - subgraph Nightly["Nightly Timeline"] - T0000["00:00 LVM thin snapshots
(lvm-pvc-snapshot)
sdc PVCs CoW"] - T0015["00:15 PostgreSQL per-DB dumps
(CronJob)"] - T0045["00:45 MySQL per-DB dumps
(CronJob)"] - T0200["02:00 nfs-mirror (daily)
sdc /srv/nfs/* → sda /mnt/backup//
~10-20 min steady state"] - T0500["05:00 daily-backup
mount LVM snapshots ro
rsync PVC files → /mnt/backup/pvc-data/
+ sqlite + pfsense + pve-config"] - T0600["06:00 offsite-sync-backup
Step 1: sda → Synology /Viki/pve-backup/
Step 2: sdc/immich + nfs-ssd → /Viki/nfs[-ssd]/"] - T1200["12:00 LVM thin snapshots (midday)
second daily snapshot"] - end + S01 --> S02 --> S03a --> S03b --> S05 --> S08 - T0000 --> T0015 --> T0045 --> T0200 --> T0500 --> T0600 --> T1200 - INO -.->|change events feed Step 2| T0600 - - style Nightly fill:#ffe0b2 - style Continuous fill:#e1f5ff + style Sunday fill:#ffe0b2 ``` ### Physical Disk Layout @@ -201,27 +146,24 @@ graph LR ```mermaid graph TB subgraph PVE["Proxmox Host (192.168.1.127)"] - subgraph sda["sda: 1.1TB RAID1 SAS — 70% used (315 GB free)"] + subgraph sda["sda: 1.1TB RAID1 SAS"] sda_vg["VG: backup
LV: data (ext4)
/mnt/backup"] - sda_content["pvc-data////
sqlite-backup/, pfsense//, pve-config/
+ daily mirror of /srv/nfs// via nfs-mirror"] + sda_content["pvc-data////
sqlite-backup/
pfsense//
pve-config/"] end subgraph sdb["sdb: 931GB SSD"] sdb_vg["VG: pve
LV: root (ext4)
PVE host OS"] end - subgraph sdc["sdc: 10.7TB RAID1 HDD — 2.8 TB used"] - sdc_vg["VG: pve
LV: data (thin pool)
/srv/nfs/* (live NFS)
65 proxmox-lvm PVCs
+ VM disks"] + subgraph sdc["sdc: 10.7TB RAID1 HDD"] + sdc_vg["VG: pve
LV: data (thin pool)
65 proxmox-lvm PVCs
+ VM disks"] end sda_vg --> sda_content end - sdc -. "daily snapshot ro + nfs-mirror" .-> sda - sdc -. "immich only
(inotify, daily 06:00)" .-> Synology - sda -. "daily 06:00
incremental rsync" .-> Synology - - Synology["Synology NAS 192.168.1.13
91% used / 520 GB free
/Backup/Viki/{pve-backup, nfs (immich), nfs-ssd}"] + sdc -.->|weekly backup
mount snapshot ro| sda + sda -.->|offsite sync
rsync| Synology["Synology NAS
192.168.1.13
/Backup/Viki/{pve-backup,nfs,nfs-ssd}/"] style sda fill:#fff9c4 style sdb fill:#c8e6c9 @@ -232,30 +174,29 @@ graph TB ```mermaid graph TB - Start["Data loss detected"]:::start + Start["Data loss detected"] Age{"How old is
the lost data?"} Type{"What type
of data?"} Start --> Age - Age -->|"< 12 h"| LVM["LVM thin snapshot on sdc
lvm-pvc-snapshot restore
RTO: <5 min
(7-day retention, 2x daily)"]:::fast - Age -->|"12 h - 4 weeks"| FileBackup["sda file backup
/mnt/backup/pvc-data// (PVCs)
/mnt/backup// (NFS dirs)
RTO: <15 min"]:::med - Age -->|"> 4 weeks or
site disaster"| Offsite["Synology /Viki/pve-backup/
(or /Viki/nfs/immich for photos)
RTO: <4 hours"]:::slow + Age -->|"< 7 days"| LVM["Use LVM snapshot
lvm-pvc-snapshot restore
RTO: <5 min"] + Age -->|"> 7 days,
< 4 weeks"| FileBackup["Use sda file backup
/mnt/backup/pvc-data//
RTO: <15 min"] + Age -->|"> 4 weeks or
site disaster"| Offsite["Use Synology backup
Synology/pve-backup/
RTO: <4 hours"] LVM --> Type FileBackup --> Type Offsite --> Type - Type -->|"Database (logical)"| AppBackup["App-level dump
/srv/nfs/-backup/
OR Synology /Viki/pve-backup/-backup/
RTO: <10 min (single-DB or full)"]:::db - Type -->|"PVC binary state"| Proceed["Proceed with
selected restore method"] - Type -->|"NFS files (nextcloud,
audiobookshelf, …)"| NFSRestore["sda /mnt/backup//
OR Synology /Viki/pve-backup//
RTO: varies by size"]:::med - Type -->|"Immich photos"| ImmichRestore["Synology /Viki/nfs/immich
(only offsite copy)
RTO: varies by size"]:::slow + Type -->|"Database"| AppBackup["Use app-level dump
/srv/nfs/-backup/
OR Synology/nfs/-backup/
RTO: <10 min"] + Type -->|"PVC files"| Proceed["Proceed with
selected restore method"] + Type -->|"Media (NFS)"| OffsiteMedia["Use Synology backup
Synology/nfs/ or nfs-ssd/
RTO: varies by size"] - classDef start fill:#ffcdd2,stroke:#b71c1c - classDef fast fill:#c8e6c9,stroke:#1b5e20 - classDef med fill:#fff9c4,stroke:#f57f17 - classDef slow fill:#e1f5ff,stroke:#01579b - classDef db fill:#e1bee7,stroke:#4a148c + style Start fill:#ffcdd2 + style LVM fill:#c8e6c9 + style FileBackup fill:#fff9c4 + style Offsite fill:#e1f5ff + style AppBackup fill:#e1bee7 ``` ### Vaultwarden Enhanced Protection diff --git a/scripts/offsite-sync-backup.sh b/scripts/offsite-sync-backup.sh index 790215e1..85e4134c 100644 --- a/scripts/offsite-sync-backup.sh +++ b/scripts/offsite-sync-backup.sh @@ -132,14 +132,9 @@ elif [ -s "${NFS_CHANGE_LOG}" ]; then sort -u "${NFS_CHANGE_LOG}" > /tmp/nfs-changes-deduped # HDD NFS — include only /srv/nfs/immich/ paths. - # `|| true` is REQUIRED: if the last iteration's `[ -f "$f" ]` is false - # (file was deleted between inotify capture and now — e.g., immich - # encoded-video temp file that got cleaned up), the while loop returns - # 1, pipefail propagates, and `set -e` kills the script silently before - # reaching the rsync. Matches the SSD section's pattern below. grep -E "${NFS_SDA_BYPASS_RE}" /tmp/nfs-changes-deduped | \ while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs/}"; done \ - > /tmp/sync-nfs.list 2>/dev/null || true + > /tmp/sync-nfs.list 2>/dev/null NFS_COUNT=$(wc -l < /tmp/sync-nfs.list 2>/dev/null || echo 0) if [ "${NFS_COUNT:-0}" -gt 0 ]; then rsync -rlt --files-from=/tmp/sync-nfs.list /srv/nfs/ "${NFS_DEST}/" 2>&1 \ diff --git a/stacks/broker-sync/main.tf b/stacks/broker-sync/main.tf index a5bf368d..b27c4eb9 100644 --- a/stacks/broker-sync/main.tf +++ b/stacks/broker-sync/main.tf @@ -10,8 +10,8 @@ resource "kubernetes_namespace" "broker_sync" { metadata { name = "broker-sync" labels = { - "istio-injection" = "disabled" - tier = local.tiers.aux + "istio-injection" = "disabled" + tier = local.tiers.aux "keel.sh/enrolled" = "true" } } @@ -290,14 +290,6 @@ resource "kubernetes_cron_job_v1" "imap" { name = "BROKER_SYNC_DATA_DIR" value = "/data" } - # 2026-05-26: skip InvestEngine email parsing. IE has its own - # bearer-token API path (`broker-sync invest-engine`) — running - # both produces duplicate BUYs in Wealthfolio because the two - # generate different external_ids for the same fill. - env { - name = "BROKER_SYNC_IMAP_EXCLUDE_PROVIDERS" - value = "invest-engine" - } env { name = "WF_SESSION_PATH" value = "/data/wealthfolio_session.json" diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index e3776b51..94236930 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -130,14 +130,6 @@ resource "kubernetes_stateful_set_v1" "mysql_standalone" { "app.kubernetes.io/name" = "mysql" "app.kubernetes.io/instance" = "mysql-standalone" "app.kubernetes.io/component" = "primary" - # 2026-05-26: defense-in-depth on top of the annotation below. The - # Kyverno `inject-keel-annotations` ClusterPolicy reads this LABEL - # via its `exclude.any[].resources.selector.matchLabels` rule, so - # even if the dbaas namespace exclude were lost the label still - # bypasses the mutation. Without the label, a Kyverno reconcile - # had silently overwritten our annotation=never → patch this turn - # and Keel patch-bumped mysql:8.4.8 → 8.4.9, stalling the DD upgrade. - "keel.sh/policy" = "never" } # Explicit Keel opt-out. The dbaas namespace is already excluded # from the `inject-keel-annotations` Kyverno ClusterPolicy, but the diff --git a/stacks/nvidia/modules/nvidia/main.tf b/stacks/nvidia/modules/nvidia/main.tf index 2268baf4..2c285517 100644 --- a/stacks/nvidia/modules/nvidia/main.tf +++ b/stacks/nvidia/modules/nvidia/main.tf @@ -137,14 +137,6 @@ resource "kubernetes_deployment" "nvidia-exporter" { labels = { app = "nvidia-exporter" tier = var.tier - # 2026-05-26: Keel tag-rewrote :latest → :4.5.2-4.8.1-ubuntu22.04 - # and the new image OOMs at 192Mi. Adding both LABEL + ANNOTATION - # to opt out of Keel cluster-wide auto-update — bump nvidia images - # in a separate planned change once we've sized the memory limit. - "keel.sh/policy" = "never" - } - annotations = { - "keel.sh/policy" = "never" } } spec { @@ -184,13 +176,10 @@ resource "kubernetes_deployment" "nvidia-exporter" { } resources { requests = { - memory = "256Mi" + memory = "192Mi" } limits = { - # Bumped 192Mi → 512Mi (2026-05-26): dcgm-exporter - # 4.5.2-4.8.1-ubuntu22.04 OOMKills at 192Mi. Older versions - # ran comfortably under 192Mi but post-bump we need headroom. - memory = "512Mi" + memory = "192Mi" "nvidia.com/gpu" = "1" } } diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf index 898fab34..ee358b06 100644 --- a/stacks/redis/modules/redis/main.tf +++ b/stacks/redis/modules/redis/main.tf @@ -428,16 +428,6 @@ resource "kubernetes_stateful_set_v1" "redis_v2" { namespace = kubernetes_namespace.redis.metadata[0].name labels = { app = "redis-v2" - # 2026-05-26: Keel patch-bumped :8-alpine → :8.0.6-alpine, which - # rejected the `aof-load-corrupt-tail-max-size` config and crashed - # redis-v2-2. The bump is also semantically a downgrade (8-alpine is - # 8.6.2, 8.0.6 is older). Both LABEL + ANNOTATION are required for - # full opt-out: label drives Kyverno's selector exclude, annotation - # drives Keel's own gate. - "keel.sh/policy" = "never" - } - annotations = { - "keel.sh/policy" = "never" } } spec { diff --git a/stacks/status-page/main.tf b/stacks/status-page/main.tf index 8fae372c..6c943d6c 100644 --- a/stacks/status-page/main.tf +++ b/stacks/status-page/main.tf @@ -68,554 +68,549 @@ resource "kubernetes_cluster_role_binding_v1" "ingress_reader" { } # ============================================================================= -# Status Page Pusher ── DISABLED 2026-05-26 -# Reads Uptime Kuma monitors, generates status.json, pushes to GitHub Pages. -# -# Disabled because per-invocation `apk add git` + `pip install uptime-kuma-api` -# was hammering the Proxmox sdc thin pool (~3.2 MB/s of the ~8 MB/s sustained -# host-side, ~804 GB written over 18 h). Re-enable with a custom image that -# bakes git + uptime-kuma-api so cold-install is gone. +# Status Page Pusher +# Reads Uptime Kuma monitors, generates status.json, pushes to GitHub Pages # ============================================================================= -# resource "kubernetes_cron_job_v1" "status_page_pusher" { -# metadata { -# name = "status-page-pusher" -# namespace = kubernetes_namespace_v1.status_page.metadata[0].name -# } -# spec { -# concurrency_policy = "Forbid" -# failed_jobs_history_limit = 3 -# successful_jobs_history_limit = 3 -# schedule = "*/5 * * * *" -# job_template { -# metadata {} -# spec { -# backoff_limit = 1 -# ttl_seconds_after_finished = 300 -# template { -# metadata {} -# spec { -# service_account_name = kubernetes_service_account_v1.status_page.metadata[0].name -# container { -# name = "status-pusher" -# image = "docker.io/library/python:3.12-alpine" -# command = ["/bin/sh", "-c", <<-EOT -# apk add --no-cache git >/dev/null 2>&1 -# pip install --quiet --disable-pip-version-check uptime-kuma-api -# python3 << 'PYEOF' -# import os, sys, json, time, subprocess -# from datetime import datetime, timezone, timedelta -# from uptime_kuma_api import UptimeKumaApi -# -# UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local" -# UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"] -# GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] -# REPO = "ViktorBarzin/status-page" -# REPO_URL = "https://" + GITHUB_TOKEN + "@github.com/" + REPO + ".git" -# -# TYPE_NAMES = { -# "http": "HTTP", -# "port": "TCP Port", -# "ping": "Ping", -# "keyword": "HTTP Keyword", -# "grpc-keyword": "gRPC", -# "dns": "DNS", -# "docker": "Docker", -# "push": "Push", -# "steam": "Steam", -# "gamedig": "GameDig", -# "mqtt": "MQTT", -# "sqlserver": "SQL Server", -# "postgres": "PostgreSQL", -# "mysql": "MySQL", -# "mongodb": "MongoDB", -# "radius": "RADIUS", -# "redis": "Redis", -# "tailscale-ping": "Tailscale Ping", -# "real-browser": "Real Browser", -# "group": "Group", -# "snmp": "SNMP", -# "json-query": "JSON Query", -# } -# -# def beat_status_is_up(status_val): -# """Handle both enum and int status values.""" -# if hasattr(status_val, "value"): -# return status_val.value == 1 -# return status_val == 1 -# -# # Build namespace -> external URL map from K8s ingresses -# ingress_map = {} -# try: -# import ssl, urllib.request -# token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" -# ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" -# if os.path.exists(token_path): -# with open(token_path) as f: -# token = f.read().strip() -# ctx = ssl.create_default_context(cafile=ca_path) -# k8s_host = os.environ.get("KUBERNETES_SERVICE_HOST", "kubernetes.default.svc") -# k8s_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443") -# req = urllib.request.Request( -# "https://" + k8s_host + ":" + k8s_port + "/apis/networking.k8s.io/v1/ingresses", -# headers={"Authorization": "Bearer " + token} -# ) -# resp = urllib.request.urlopen(req, context=ctx, timeout=10) -# ing_data = json.loads(resp.read()) -# for item in ing_data.get("items", []): -# ns = item["metadata"]["namespace"] -# rules = item.get("spec", {}).get("rules", []) -# if rules and rules[0].get("host"): -# host = rules[0]["host"] -# if ns not in ingress_map: -# ingress_map[ns] = "https://" + host -# print(f"Built ingress map: {len(ingress_map)} namespaces") -# except Exception as e: -# print(f"Warning: could not build ingress map: {e}") -# -# print("Connecting to Uptime Kuma...") -# api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=30) -# api.login("admin", UPTIME_KUMA_PASS) -# -# monitors = api.get_monitors() -# print(f"Fetched {len(monitors)} monitors") -# -# # Get current heartbeats for live status -# heartbeats = api.get_heartbeats() -# -# now = datetime.now(timezone.utc) -# -# def calc_uptime(beat_list, hours): -# cutoff = now - timedelta(hours=hours) -# relevant = [] -# for b in beat_list: -# t = str(b["time"]) -# try: -# bt = datetime.fromisoformat(t.replace("Z", "+00:00")) -# except (ValueError, TypeError): -# continue -# if bt.tzinfo is None: -# bt = bt.replace(tzinfo=timezone.utc) -# if bt > cutoff: -# relevant.append(b) -# if not relevant: -# return None -# up_count = sum(1 for b in relevant if beat_status_is_up(b.get("status", 0))) -# return round(up_count / len(relevant) * 100, 1) -# -# groups = {} -# for m in monitors: -# raw_type = m.get("type", "unknown") -# monitor_type = raw_type.value if hasattr(raw_type, "value") else str(raw_type) -# monitor_type = monitor_type.lower().replace("monitortype.", "") -# if m["name"].startswith("[External] "): -# group_name = "External Reachability" -# else: -# group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper()) -# -# if not m.get("active", True): -# continue -# else: -# # Get latest heartbeat for current status -# mid = m["id"] -# mon_beats = heartbeats.get(mid, heartbeats.get(str(mid), [])) -# if mon_beats: -# # Flatten nested lists (API format varies by version) -# flat = [] -# for item in mon_beats: -# if isinstance(item, list): -# flat.extend(item) -# elif isinstance(item, dict): -# flat.append(item) -# mon_beats = flat if flat else mon_beats -# latest = mon_beats[-1] if mon_beats else None -# if latest and isinstance(latest, dict) and beat_status_is_up(latest.get("status", 0)): -# status = "up" -# else: -# status = "down" -# else: -# status = "pending" -# -# uptime_24h = None -# uptime_7d = None -# uptime_30d = None -# try: -# beats = api.get_monitor_beats(m["id"], 720) -# if beats: -# uptime_24h = calc_uptime(beats, 24) -# uptime_7d = calc_uptime(beats, 168) -# uptime_30d = calc_uptime(beats, 720) -# except Exception as e: -# print(f" Warning: could not get beats for {m['name']}: {e}") -# -# if group_name not in groups: -# groups[group_name] = [] -# -# # Extract external URL for HTTP monitors -# monitor_url = None -# raw_url = m.get("url", "") or "" -# if monitor_type == "http" and raw_url: -# if ".svc.cluster.local" not in raw_url and raw_url.startswith("http"): -# monitor_url = raw_url.rstrip("/") -# else: -# # Internal URL — derive external from namespace -# import re as _re -# ns_match = _re.search(r"//[^.]+\.([^.]+)\.svc\.cluster\.local", raw_url) -# if ns_match: -# ns = ns_match.group(1) -# if ns in ingress_map: -# monitor_url = ingress_map[ns] -# -# entry = { -# "name": m["name"], -# "status": status, -# "uptime_24h": uptime_24h, -# "uptime_7d": uptime_7d, -# "uptime_30d": uptime_30d, -# } -# if monitor_url: -# entry["url"] = monitor_url -# -# groups[group_name].append(entry) -# -# api.disconnect() -# print(f"Generated {len(groups)} groups") -# -# # ============ Detect external-down / internal-up divergence ============ -# external_status = {} -# internal_status = {} -# for gname, gmonitors in groups.items(): -# for mon in gmonitors: -# if mon["name"].startswith("[External] "): -# svc = mon["name"].replace("[External] ", "").lower() -# external_status[svc] = mon["status"] -# elif gname != "External Reachability": -# internal_status[mon["name"].lower()] = mon["status"] -# -# divergent = [] -# for svc, ext_st in external_status.items(): -# if ext_st != "down": -# continue -# for iname, int_st in internal_status.items(): -# if svc in iname or iname in svc: -# if int_st == "up": -# divergent.append(svc) -# break -# -# divergence_count = len(divergent) -# metric_body = ( -# "# HELP external_internal_divergence_count Services externally down but internally up\n" -# "# TYPE external_internal_divergence_count gauge\n" -# f"external_internal_divergence_count {divergence_count}\n" -# ) -# for svc in divergent: -# metric_body += f'external_internal_divergence_services{{service="{svc}"}} 1\n' -# -# try: -# import urllib.request as _ur -# req = _ur.Request( -# "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence", -# data=metric_body.encode(), -# method="POST" -# ) -# _ur.urlopen(req, timeout=10) -# if divergent: -# print(f"WARNING: {len(divergent)} services externally down but internally up: {divergent}") -# else: -# print("No external/internal divergence detected") -# except Exception as e: -# print(f"Warning: could not push divergence metric: {e}") -# -# # ============ Fetch incidents from GitHub Issues ============ -# import urllib.request, urllib.error, re as _re2 -# -# def fetch_github_json(url): -# req = urllib.request.Request(url, headers={ -# "Authorization": "token " + GITHUB_TOKEN, -# "Accept": "application/vnd.github.v3+json", -# "User-Agent": "status-page-pusher", -# }) -# resp = urllib.request.urlopen(req, timeout=15) -# return json.loads(resp.read()) -# -# def parse_severity(labels): -# for lbl in labels: -# name = lbl["name"].lower() -# if name in ("sev1", "sev2", "sev3"): -# return name -# return "sev3" -# -# def parse_affected_services(body): -# services = [] -# if not body: -# return services -# in_section = False -# for line in body.split("\n"): -# stripped = line.strip() -# if stripped.lower().startswith("## affected"): -# in_section = True -# continue -# if in_section: -# if stripped.startswith("##"): -# break -# if stripped.startswith("- ") and not stripped.startswith("-