From ff83ec3325658bde8f0b78622f2d4e09a1329d95 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 15 Mar 2026 02:01:07 +0000
Subject: [PATCH] add infrastructure agent team: 8 specialized agents + 14
 diagnostic scripts

Agents: devops-engineer, dba, security-engineer, sre, network-engineer,
platform-engineer, observability-engineer, home-automation-engineer.

Scripts: deploy-status, db-health, backup-verify, tls-check, crowdsec-status,
authentik-audit, oom-investigator, resource-report, dns-check, network-health,
nfs-health, truenas-status, platform-status, monitoring-health.

Also: known-issues.md suppression list, cluster-health-checker port-forward fix.
---
 .claude/agents/cluster-health-checker.md   |   2 +-
 .claude/agents/dba.md                      |  49 ++++
 .claude/agents/devops-engineer.md          |  46 ++++
 .claude/agents/home-automation-engineer.md |  61 +++++
 .claude/agents/network-engineer.md         |  54 ++++
 .claude/agents/observability-engineer.md   |  49 ++++
 .claude/agents/platform-engineer.md        |  65 +++++
 .claude/agents/security-engineer.md        |  61 +++++
 .claude/agents/sre.md                      |  68 +++++
 .claude/reference/known-issues.md          |  12 +
 .claude/scripts/authentik-audit.sh         | 134 ++++++++++
 .claude/scripts/backup-verify.sh           | 247 ++++++++++++++++++
 .claude/scripts/crowdsec-status.sh         | 166 ++++++++++++
 .claude/scripts/db-health.sh               | 194 ++++++++++++++
 .claude/scripts/deploy-status.sh           | 217 ++++++++++++++++
 .claude/scripts/dns-check.sh               | 144 +++++++++++
 .claude/scripts/monitoring-health.sh       | 281 +++++++++++++++++++++
 .claude/scripts/network-health.sh          | 166 ++++++++++++
 .claude/scripts/nfs-health.sh              | 145 +++++++++++
 .claude/scripts/oom-investigator.sh        | 214 ++++++++++++++++
 .claude/scripts/platform-status.sh         | 260 +++++++++++++++++++
 .claude/scripts/resource-report.sh         | 190 ++++++++++++++
 .claude/scripts/tls-check.sh               | 143 +++++++++++
 .claude/scripts/truenas-status.sh          | 186 ++++++++++++++
 24 files changed, 3153 insertions(+), 1 deletion(-)
 create mode 100644 .claude/agents/dba.md
 create mode 100644 .claude/agents/devops-engineer.md
 create mode 100644 .claude/agents/home-automation-engineer.md
 create mode 100644 .claude/agents/network-engineer.md
 create mode 100644 .claude/agents/observability-engineer.md
 create mode 100644 .claude/agents/platform-engineer.md
 create mode 100644 .claude/agents/security-engineer.md
 create mode 100644 .claude/agents/sre.md
 create mode 100644 .claude/reference/known-issues.md
 create mode 100755 .claude/scripts/authentik-audit.sh
 create mode 100755 .claude/scripts/backup-verify.sh
 create mode 100755 .claude/scripts/crowdsec-status.sh
 create mode 100755 .claude/scripts/db-health.sh
 create mode 100755 .claude/scripts/deploy-status.sh
 create mode 100755 .claude/scripts/dns-check.sh
 create mode 100755 .claude/scripts/monitoring-health.sh
 create mode 100755 .claude/scripts/network-health.sh
 create mode 100755 .claude/scripts/nfs-health.sh
 create mode 100755 .claude/scripts/oom-investigator.sh
 create mode 100755 .claude/scripts/platform-status.sh
 create mode 100755 .claude/scripts/resource-report.sh
 create mode 100755 .claude/scripts/tls-check.sh
 create mode 100755 .claude/scripts/truenas-status.sh

diff --git a/.claude/agents/cluster-health-checker.md b/.claude/agents/cluster-health-checker.md
index 144ed6da..999dba77 100644
--- a/.claude/agents/cluster-health-checker.md
+++ b/.claude/agents/cluster-health-checker.md
@@ -25,7 +25,7 @@ Run the cluster healthcheck script and interpret the results.
 If issues are found:
   - **Problematic pods**: `kubectl describe pod`, `kubectl logs --previous`
   - **Failed deployments**: check rollout status, events
   - **StatefulSet issues**: check pod readiness, GR status for MySQL
-  - **Prometheus alerts**: query via port-forward to prometheus-server
+  - **Prometheus alerts**: query via kubectl exec into prometheus-server
 4. Apply safe auto-fixes:
   - Delete evicted/failed pods: `kubectl delete pods -A --field-selector=status.phase=Failed`
   - Delete stale failed jobs: `kubectl delete jobs -n --field-selector=status.successful=0`
diff --git a/.claude/agents/dba.md b/.claude/agents/dba.md
new file mode 100644
index 00000000..288b6df5
--- /dev/null
+++ b/.claude/agents/dba.md
@@ -0,0 +1,49 @@
+---
+name: dba
+description: Check database health — MySQL InnoDB Cluster, PostgreSQL (CNPG), SQLite. Monitor replication, backups, connections, and slow queries.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are a DBA for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+All databases — MySQL InnoDB Cluster (3 instances), PostgreSQL via CNPG, SQLite-on-NFS.
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run diagnostic scripts:
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/db-health.sh` — MySQL GR + CNPG + connections
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/backup-verify.sh` — backup freshness
+3. Investigate specific issues:
+   - **MySQL InnoDB Cluster**: Group Replication status via `kubectl exec sts/mysql-cluster -n dbaas -- mysql -e 'SELECT * FROM performance_schema.replication_group_members'`
+   - **CNPG PostgreSQL**: Cluster health via `kubectl get cluster,backup -A`
+   - **Backups**: CNPG backup CRD timestamps, MySQL dump timestamps on NFS
+   - **Connections**: Connection counts and slow queries
+   - **iSCSI volumes**: Health for database PVCs
+   - **SQLite**: WAL checkpoint status, integrity checks
+4. Report findings with clear root cause analysis
+
+## Safe Auto-Fix
+
+None — database operations are too risky for auto-fix. Advisory only.
+
+## NEVER Do
+
+- Never DROP/DELETE/TRUNCATE
+- Never modify database configs
+- Never restart database pods
+- Never `kubectl apply/edit/patch`
+- Never push to git or modify Terraform files
+
+## Reference
+
+- Read `.claude/reference/service-catalog.md` for which services use which database
diff --git a/.claude/agents/devops-engineer.md b/.claude/agents/devops-engineer.md
new file mode 100644
index 00000000..487a87dc
--- /dev/null
+++ b/.claude/agents/devops-engineer.md
@@ -0,0 +1,46 @@
+---
+name: devops-engineer
+description: Check deployment rollouts, CI/CD builds, image pull errors, and post-deploy health. Use for stalled deployments, Woodpecker CI issues, or deploy verification.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are a DevOps Engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+Deployments, CI/CD (Woodpecker), rollouts, Docker images, post-deploy verification.
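+
+A minimal stalled-rollout triage sketch (the namespace and deployment names are placeholders; the kubeconfig path matches the Environment section below):
+
+```bash
+KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
+NS=example-namespace DEPLOY=example-app   # placeholders: substitute the workload under triage
+# Progressing=False with reason=ProgressDeadlineExceeded marks a stalled rollout
+$KUBECTL -n "$NS" get deploy "$DEPLOY" -o jsonpath='{range .status.conditions[*]}{.type}={.status} ({.reason}){"\n"}{end}'
+$KUBECTL -n "$NS" rollout status "deploy/$DEPLOY" --timeout=10s || true
+# Recent events usually name the culprit (failed pulls, failing probes, quota)
+$KUBECTL -n "$NS" get events --sort-by=.lastTimestamp | tail -n 15
+```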
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run `bash /Users/viktorbarzin/code/infra/.claude/scripts/deploy-status.sh` to check deployment health
+3. Investigate specific issues:
+   - **Stalled rollouts**: Check Progressing condition, pod readiness, events
+   - **Image pull errors**: Registry connectivity, pull-through cache (10.0.20.10), tag existence
+   - **Woodpecker CI**: Build status via `kubectl exec` into woodpecker-server pod
+   - **Post-deploy health**: Verify via Uptime Kuma (use `uptime-kuma` skill) and service endpoints
+   - **DIUN**: Check for available image updates, report digest
+4. Report findings with clear remediation steps
+
+## Safe Auto-Fix
+
+None — deployments are Terraform-owned.
+
+## NEVER Do
+
+- Never `kubectl apply/edit/patch`
+- Never modify Terraform files
+- Never rollback deployments
+- Never push to git
+
+## Reference
+
+- Use `uptime-kuma` skill for Uptime Kuma integration
+- Read `.claude/reference/service-catalog.md` for service inventory
diff --git a/.claude/agents/home-automation-engineer.md b/.claude/agents/home-automation-engineer.md
new file mode 100644
index 00000000..85e50e67
--- /dev/null
+++ b/.claude/agents/home-automation-engineer.md
@@ -0,0 +1,61 @@
+---
+name: home-automation-engineer
+description: Check Home Assistant device health, Frigate NVR cameras, automations, and battery levels. Use for smart home diagnostics across ha-london and ha-sofia instances.
+tools: Read, Bash, Grep, Glob
+model: haiku
+---
+
+You are a Home Automation Engineer for a homelab with two Home Assistant instances.
+
+## Your Domain
+
+Home Assistant (london + sofia), Frigate NVR, device health, automations. These are external services on separate hardware, not K8s-managed.
+
+## Environment
+
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **HA London script**: `python3 /Users/viktorbarzin/code/infra/.claude/home-assistant.py`
+- **HA Sofia script**: `python3 /Users/viktorbarzin/code/infra/.claude/home-assistant-sofia.py`
+
+### Instances
+
+| Instance | URL | Default? |
+|----------|-----|----------|
+| **ha-london** | `https://ha-london.viktorbarzin.me` | Yes |
+| **ha-sofia** | `https://ha-sofia.viktorbarzin.me` | No |
+
+- **Default**: ha-london (use unless user specifies "sofia" or "ha-sofia")
+- **Aliases**: "ha" or "HA" = ha-london
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches (ha-london Uptime Kuma monitor is a known suppressed item)
+2. Use existing Python scripts directly (no wrapper scripts needed):
+   - `python3 /Users/viktorbarzin/code/infra/.claude/home-assistant.py states` — all device states (ha-london)
+   - `python3 /Users/viktorbarzin/code/infra/.claude/home-assistant-sofia.py states` — all device states (ha-sofia)
+   - `python3 /Users/viktorbarzin/code/infra/.claude/home-assistant.py services` — available services
+3. Check for issues:
+   - **Device availability**: Look for `unavailable` or `unknown` state entities
+   - **Frigate cameras**: 9 cameras on ha-sofia — check camera entity states
+   - **Automations**: Review automation run history for failures
+   - **Climate zones**: Temperature/HVAC status
+   - **Alarm**: Security system status
+   - **Battery levels**: All battery-powered devices — warn if <20%
+   - **Energy**: Consumption monitoring
+4. Report findings organized by instance
+
+## Safe Auto-Fix
+
+None — home automation actions require user intent.
+
+## NEVER Do
+
+- Never turn off alarm system
+- Never unlock doors
+- Never change climate settings
+- Never disable automations without explicit request
+- Never expose API tokens
+
+## Reference
+
+- Use `home-assistant` skill for HA interaction patterns
diff --git a/.claude/agents/network-engineer.md b/.claude/agents/network-engineer.md
new file mode 100644
index 00000000..b423c1b5
--- /dev/null
+++ b/.claude/agents/network-engineer.md
@@ -0,0 +1,54 @@
+---
+name: network-engineer
+description: Check pfSense firewall, DNS (Technitium + Cloudflare), VPN (WireGuard/Headscale), routing, and MetalLB. Use for connectivity issues, DNS problems, or network diagnostics.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are a Network Engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+pfSense firewall, DNS (Technitium + Cloudflare), VPN (WireGuard/Headscale), routing, MetalLB.
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+- **pfSense**: Access via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py`
+- **VLANs**: 10.0.10.0/24 (storage), 10.0.20.0/24 (k8s), 192.168.1.0/24 (management)
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run diagnostic scripts:
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/dns-check.sh` — DNS resolution verification
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/network-health.sh` — pfSense + VPN + MetalLB
+3. Investigate specific issues:
+   - **pfSense**: System health via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py status`
+   - **Firewall states**: Connection table via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py pfctl`
+   - **DNS**: Resolution for all services (internal `.lan` + external `.me`)
+   - **Technitium**: DNS server health and zone status
+   - **WireGuard/Headscale**: Tunnel status via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py wireguard`
+   - **Routing**: Between VLANs
+   - **MetalLB**: L2 advertisement health
+4. Report findings with clear root cause analysis
+
+## Safe Auto-Fix
+
+None — network changes are high-blast-radius.
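+
+When `dns-check.sh` flags a resolution failure, a manual cross-check is safe and read-only (sketch; the resolver IPs and hostname below are the ones the script itself uses):
+
+```bash
+SVC=grafana.viktorbarzin.me
+# Internal Technitium resolver vs. public resolver; answers legitimately differ
+# behind the Cloudflare proxy, but an empty internal answer is an actionable failure
+dig +short "$SVC" @10.0.20.100
+dig +short "$SVC" @1.1.1.1
+```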
+
+## NEVER Do
+
+- Never modify firewall rules
+- Never change DNS records (Terraform-owned)
+- Never modify VPN configs
+- Never restart pfSense services
+- Never `kubectl apply/edit/patch`
+- Never push to git or modify Terraform files
+
+## Reference
+
+- Use `pfsense` skill for pfSense access patterns
+- Read `k8s-ndots` skill for DNS search domain issues
diff --git a/.claude/agents/observability-engineer.md b/.claude/agents/observability-engineer.md
new file mode 100644
index 00000000..7d57dda2
--- /dev/null
+++ b/.claude/agents/observability-engineer.md
@@ -0,0 +1,49 @@
+---
+name: observability-engineer
+description: Check monitoring stack health (Prometheus, Grafana, Alertmanager, Uptime Kuma, SNMP exporters). Use for alert issues, monitoring problems, or dashboard diagnostics.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are an Observability Engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+Prometheus, Grafana, Alertmanager, Uptime Kuma, SNMP exporters. Note: Loki and Alloy are NOT deployed — log queries use `kubectl logs`.
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run diagnostic script:
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/monitoring-health.sh` — monitoring pod health, alerts, Grafana datasources, SNMP exporters
+3. Investigate specific issues:
+   - **Monitoring stack health**: Verify Prometheus (`deploy/prometheus-server`), Alertmanager (`sts/prometheus-alertmanager`), Grafana (`deploy/grafana`) pods are running and responsive
+   - **Alert analysis**: Why alerts are firing or not firing — check Alertmanager routing, silences, inhibitions
+   - **Grafana**: Datasource connectivity via `kubectl exec deploy/grafana -n monitoring -- curl -s 'http://localhost:3000/api/datasources'`
+   - **SNMP exporters**: snmp-exporter (UPS), idrac-redfish-exporter (iDRAC), proxmox-exporter scraping status
+   - **Prometheus storage**: Usage and retention
+   - **Alert routing**: Receivers, matchers, inhibitions
+   - **Uptime Kuma**: Use the `uptime-kuma` skill for monitor management
+4. Report findings with clear root cause analysis
+
+## Safe Auto-Fix
+
+None — monitoring config is Terraform-owned.
+
+## NEVER Do
+
+- Never modify Prometheus rules, Grafana dashboards, or alert configs directly
+- Never `kubectl apply/edit/patch`
+- Never commit secrets
+- Never push to git or modify Terraform files
+
+## Reference
+
+- Use `uptime-kuma` skill for Uptime Kuma management
+- Use `cluster-health` skill for quick cluster triage
diff --git a/.claude/agents/platform-engineer.md b/.claude/agents/platform-engineer.md
new file mode 100644
index 00000000..57f77984
--- /dev/null
+++ b/.claude/agents/platform-engineer.md
@@ -0,0 +1,65 @@
+---
+name: platform-engineer
+description: Check K8s platform health, NFS/iSCSI storage, Proxmox VMs, Traefik, Kyverno, VPA. Use for node issues, storage problems, or platform-level diagnostics.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are a Platform Engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+K8s platform (Traefik, MetalLB, Kyverno, VPA), Proxmox VMs, NFS/iSCSI storage, node management.
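+
+A read-only stale-NFS probe across the nodes (sketch; node IPs and the `wizard` SSH user come from the Environment section below, and wrapping `df` in `timeout` is the detection trick, since `df` hangs on a stale NFS mount):
+
+```bash
+for ip in 10.0.20.100 10.0.20.101 10.0.20.102 10.0.20.103 10.0.20.104; do
+  # df blocks on a dead or stale NFS mount; timeout turns the hang into a nonzero exit
+  # (note: "suspect" also fires if a node has no NFS mounts at all)
+  ssh -o ConnectTimeout=5 "wizard@$ip" \
+    'timeout 5 df -t nfs -t nfs4 >/dev/null 2>&1 && echo "$(hostname): NFS ok" || echo "$(hostname): NFS suspect"'
+done
+```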
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+- **K8s nodes**: k8s-master (10.0.20.100), k8s-node1 (10.0.20.101), k8s-node2 (10.0.20.102), k8s-node3 (10.0.20.103), k8s-node4 (10.0.20.104) — SSH user: `wizard`
+- **TrueNAS**: `ssh root@10.0.10.15`
+- **Proxmox**: `ssh root@192.168.1.127`
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run diagnostic scripts to gather data:
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/nfs-health.sh` — NFS mount health across all nodes
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/truenas-status.sh` — ZFS pools, SMART, replication, iSCSI
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/platform-status.sh` — Traefik, Kyverno, VPA, pull-through cache, Proxmox
+3. Investigate specific issues:
+   - NFS: SSH to affected nodes, check mount status, detect stale file handles
+   - TrueNAS: ZFS pool status, SMART health, replication tasks via SSH
+   - PVCs: Check pending PVCs, unbound PVs, capacity usage
+   - iSCSI: democratic-csi volume health
+   - Traefik: IngressRoute health, middleware status
+   - Kyverno: Resource governance (LimitRange + ResourceQuota per namespace)
+   - VPA/Goldilocks: Status and unexpected updateMode settings
+   - Proxmox: Host resources via SSH
+   - Node conditions: kubelet status
+   - Pull-through cache: Registry health (10.0.20.10)
+4. Report findings with clear root cause analysis
+
+## Proactive Mode
+
+Daily NFS + TrueNAS health check — storage failures cascade across all 70+ services.
+
+## Safe Auto-Fix
+
+None. NFS remount via SSH can hang on dead TrueNAS; PV cleanup destroys data.
+
+## NEVER Do
+
+- Never restart NFS on TrueNAS
+- Never delete datasets/pools/snapshots
+- Never modify PVCs via kubectl
+- Never delete PVs
+- Never `kubectl apply/edit/patch`
+- Never change Kyverno policies directly
+- Never push to git or modify Terraform files
+
+## Reference
+
+- Read `.claude/reference/patterns.md` for governance tables
+- Read `.claude/reference/proxmox-inventory.md` for VM details
+- Use `extend-vm-storage` skill for storage extension workflow
diff --git a/.claude/agents/security-engineer.md b/.claude/agents/security-engineer.md
new file mode 100644
index 00000000..cacdae76
--- /dev/null
+++ b/.claude/agents/security-engineer.md
@@ -0,0 +1,61 @@
+---
+name: security-engineer
+description: Check TLS certs, CrowdSec WAF, Authentik SSO, Kyverno policies, Snort IDS, and Cloudflare tunnel. Use for security audits, cert expiry, or access control issues.
+tools: Read, Bash, Grep, Glob
+model: sonnet
+---
+
+You are a Security Engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+TLS certs, CrowdSec WAF, Authentik SSO, Kyverno policies, Snort IDS, Cloudflare tunnel.
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+- **pfSense**: Access via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py`
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Run diagnostic scripts:
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/tls-check.sh` — cert expiry scan
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/crowdsec-status.sh` — CrowdSec LAPI/agent health
+   - `bash /Users/viktorbarzin/code/infra/.claude/scripts/authentik-audit.sh` — user/group audit
+3. Investigate specific issues:
+   - **TLS certs**: Check in-cluster `kubernetes.io/tls` secrets + `secrets/fullchain.pem`, alert <14 days to expiry
+   - **cert-manager**: Certificate/CertificateRequest/Order CRDs for renewal failures
+   - **CrowdSec**: LAPI health via `kubectl exec` + `cscli`, agent DaemonSet, recent decisions
+   - **Authentik**: Users/groups via `kubectl exec deploy/goauthentik-server -n authentik`, outpost health
+   - **Snort IDS**: Review alerts via `python3 /Users/viktorbarzin/code/infra/.claude/pfsense.py snort`
+   - **Kyverno**: Policies in expected state (Audit mode, not Enforce)
+   - **Cloudflare tunnel**: Pod health
+   - **Sealed-secrets**: Controller operational
+4. Report findings with clear remediation steps
+
+## Proactive Mode
+
+Daily TLS cert expiry check only. All other checks on-demand.
+
+## Safe Auto-Fix
+
+Delete stale CrowdSec machine registrations via `cscli machines delete` — only machines not seen in >7 days. Always run `cscli machines list` first and show what would be deleted before acting. Reversible — agents re-register on next heartbeat.
+
+## NEVER Do
+
+- Never read/expose raw secret values
+- Never modify CrowdSec config (Terraform-owned)
+- Never create/delete Authentik users without explicit request
+- Never modify firewall rules
+- Never disable security policies
+- Never commit secrets
+- Never `kubectl apply/edit/patch`
+- Never push to git or modify Terraform files
+
+## Reference
+
+- Use `pfsense` skill for pfSense access patterns
+- Read `.claude/reference/authentik-state.md` for Authentik configuration
diff --git a/.claude/agents/sre.md b/.claude/agents/sre.md
new file mode 100644
index 00000000..827eda34
--- /dev/null
+++ b/.claude/agents/sre.md
@@ -0,0 +1,68 @@
+---
+name: sre
+description: Investigate OOMKilled pods, capacity issues, and complex multi-system incidents. The escalation point when specialist agents aren't enough.
+tools: Read, Bash, Grep, Glob
+model: opus
+---
+
+You are an SRE / On-Call engineer for a homelab Kubernetes cluster managed via Terraform/Terragrunt.
+
+## Your Domain
+
+Incident response, OOM investigation, capacity planning, root cause analysis. You are the escalation point when specialist agents aren't enough.
+
+## Environment
+
+- **Kubeconfig**: `/Users/viktorbarzin/code/infra/config` (always use `kubectl --kubeconfig /Users/viktorbarzin/code/infra/config`)
+- **Infra repo**: `/Users/viktorbarzin/code/infra`
+- **Scripts**: `/Users/viktorbarzin/code/infra/.claude/scripts/`
+- **K8s nodes**: k8s-master (10.0.20.100), k8s-node1-4 (10.0.20.101-104) — SSH user: `wizard`
+
+## Two Modes
+
+### Mode 1 — OOM/Capacity (most common)
+
+1. Run `bash /Users/viktorbarzin/code/infra/.claude/scripts/oom-investigator.sh` to find OOMKilled pods
+2. For each OOMKilled pod:
+   - Identify the container that was killed
+   - Check LimitRange defaults in the namespace
+   - Check actual usage vs limit
+   - Read Goldilocks VPA recommendations
+   - Compare to Terraform-defined resources in the stack
+3. Run `bash /Users/viktorbarzin/code/infra/.claude/scripts/resource-report.sh` for cluster-wide capacity
+4. Produce actionable Terraform snippets for resource fixes
+
+### Mode 2 — Incident Response (rare, complex)
+
+1. **Pre-check**: Verify monitoring pods are running (`kubectl get pods -n monitoring`). If monitoring is down, fall back to kubectl events/logs and SSH-based investigation.
+2. Query Prometheus via `kubectl exec deploy/prometheus-server -n monitoring -- wget -qO- 'http://localhost:9090/api/v1/query?query=...'`
+3. Query Alertmanager via `kubectl exec sts/prometheus-alertmanager -n monitoring -- wget -qO- 'http://localhost:9093/api/v2/...'`
+4. Aggregate logs via `kubectl logs` across pods/namespaces (Loki is NOT deployed)
+5. Correlate across: pod events, node conditions, pfSense logs, CrowdSec decisions
+6. SSH to nodes for kubelet logs (`journalctl -u kubelet`), dmesg, systemd status
+7. Produce incident reports with root cause + remediation
+
+## Workflow
+
+1. Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches
+2. Determine which mode applies based on the user's request
+3. Run appropriate scripts and investigations
+4. Report findings with clear root cause analysis and actionable remediation
+
+## Safe Auto-Fix
+
+None — purely investigative.
+
+## NEVER Do
+
+- Never `kubectl apply/edit/patch`
+- Never modify any files
+- Never restart services
+- Never push to git
+- Never commit secrets
+
+## Reference
+
+- All other agents' scripts are available in `.claude/scripts/`
+- Read `.claude/reference/patterns.md` for governance tables
+- Read `.claude/reference/proxmox-inventory.md` for VM details
diff --git a/.claude/reference/known-issues.md b/.claude/reference/known-issues.md
new file mode 100644
index 00000000..25dd2105
--- /dev/null
+++ b/.claude/reference/known-issues.md
@@ -0,0 +1,12 @@
+# Known Issues (suppress in all agents)
+
+## Permanent
+- ha-london Uptime Kuma monitor down — external HA on Raspberry Pi, not in this cluster
+- PVFillingUp for navidrome-music — Synology NAS volume, threshold is 95%, expected
+
+## Intermittent
+- CrowdSec Helm release stuck in pending-upgrade — known issue, workaround: helm rollback
+- Resource usage >80% on nodes — WARN only, overcommit is by design (2x LimitRange ratio)
+
+## How agents consume this file
+Each agent definition includes: "Before reporting issues, read `.claude/reference/known-issues.md` and suppress any matches."
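+
+A sketch of what mechanical matching could look like (the helper below is illustrative, not an existing script; agents currently just read the file):
+
+```bash
+KNOWN="/Users/viktorbarzin/code/infra/.claude/reference/known-issues.md"
+# Treat each non-heading line as a suppression pattern; match findings as fixed strings
+is_known_issue() { grep -v '^#' "$KNOWN" | grep -qiF -- "$1"; }
+
+finding="PVFillingUp for navidrome-music"
+is_known_issue "$finding" && echo "suppressed: $finding" || echo "report: $finding"
+```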
diff --git a/.claude/scripts/authentik-audit.sh b/.claude/scripts/authentik-audit.sh new file mode 100755 index 00000000..0b7df6fb --- /dev/null +++ b/.claude/scripts/authentik-audit.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +AGENT="authentik-audit" +DRY_RUN=false +NAMESPACE="authentik" + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +find_authentik_pod() { + local pod + pod=$($KUBECTL get pods -n "$NAMESPACE" -l app.kubernetes.io/name=authentik,app.kubernetes.io/component=server -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || \ + pod=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "goauthentik-server\|authentik-server" | grep "Running" | head -1 | awk '{print $1}') || true + echo "$pod" +} + +check_server_health() { + if $DRY_RUN; then + add_check "authentik-server" "ok" "dry-run: would check goauthentik-server pod health" + return + fi + + local pods + pods=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "authentik") || { + add_check "authentik-server" "fail" "No Authentik pods found in namespace ${NAMESPACE}" + return + } + + local not_running + not_running=$(echo "$pods" | grep -v "Running" | grep -v "Completed" | grep -c "." 2>/dev/null || echo "0") + + local total + total=$(echo "$pods" | grep -c "." 2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "authentik-server" "warn" "${not_running}/${total} Authentik pod(s) not running" + else + add_check "authentik-server" "ok" "All ${total} Authentik pod(s) running" + fi +} + +check_outposts() { + if $DRY_RUN; then + add_check "authentik-outposts" "ok" "dry-run: would check Authentik outpost pods" + return + fi + + local outpost_pods + outpost_pods=$($KUBECTL get pods -n "$NAMESPACE" -l app.kubernetes.io/managed-by=goauthentik.io --no-headers 2>/dev/null) || \ + outpost_pods=$($KUBECTL get pods -n "$NAMESPACE" --no-headers 2>/dev/null | grep -i "outpost" || true) + + if [ -z "$outpost_pods" ]; then + add_check "authentik-outposts" "warn" "No outpost pods found" + return + fi + + local total not_running + total=$(echo "$outpost_pods" | grep -c "." 2>/dev/null || echo "0") + not_running=$(echo "$outpost_pods" | grep -v "Running" | grep -c "." 
2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "authentik-outposts" "warn" "${not_running}/${total} outpost pod(s) not running" + else + add_check "authentik-outposts" "ok" "All ${total} outpost pod(s) running" + fi +} + +check_user_count() { + if $DRY_RUN; then + add_check "authentik-users" "ok" "dry-run: would check user count via ak CLI" + return + fi + + local pod + pod=$(find_authentik_pod) + + if [ -z "$pod" ]; then + add_check "authentik-users" "warn" "No Authentik server pod found to query users" + return + fi + + # Use the ak CLI to get user count + local user_output + user_output=$($KUBECTL exec -n "$NAMESPACE" "$pod" -- ak user list 2>/dev/null) || { + # Fallback: try management command + user_output=$($KUBECTL exec -n "$NAMESPACE" "$pod" -- python -c " +import django; django.setup() +from authentik.core.models import User +print(f'total={User.objects.count()} active={User.objects.filter(is_active=True).count()}') +" 2>/dev/null) || { + add_check "authentik-users" "warn" "Could not query user count from Authentik" + return + } + } + + local user_count + if echo "$user_output" | grep -q "total="; then + user_count=$(echo "$user_output" | grep "total=" | sed 's/.*total=\([0-9]*\).*/\1/') + local active_count + active_count=$(echo "$user_output" | grep "active=" | sed 's/.*active=\([0-9]*\).*/\1/') + add_check "authentik-users" "ok" "${user_count} total users, ${active_count} active" + else + # Count lines of output as fallback + user_count=$(echo "$user_output" | wc -l | tr -d ' ') + add_check "authentik-users" "ok" "User query returned ${user_count} lines of output" + fi +} + +check_server_health +check_outposts +check_user_count + +# Output JSON +overall="ok" +for c in "${checks[@]}"; do + s=$(echo "$c" | jq -r '.status') + if [ "$s" = "fail" ]; then overall="fail"; break; fi + if [ "$s" = "warn" ]; then overall="warn"; fi +done + +printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \ + "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")" diff --git a/.claude/scripts/backup-verify.sh b/.claude/scripts/backup-verify.sh new file mode 100755 index 00000000..bef75f8e --- /dev/null +++ b/.claude/scripts/backup-verify.sh @@ -0,0 +1,247 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +DRY_RUN=false +AGENT="backup-verify" + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +CHECKS="[]" + +add_check() { + local name="$1" status="$2" message="$3" + CHECKS=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''}) +json.dump(checks, sys.stdout) +") +} + +# CNPG backup freshness via backup CRDs +check_cnpg_backups() { + if $DRY_RUN; then + add_check "cnpg-backups" "ok" "DRY RUN: would check CNPG backup CRD timestamps" + return + fi + + local backups + backups=$($KUBECTL get backup.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || { + # Try scheduledbackup as well + local scheduled + scheduled=$($KUBECTL get scheduledbackup.postgresql.cnpg.io --all-namespaces --no-headers 2>/dev/null) || true + if [ -n "$scheduled" ]; then + add_check "cnpg-backups" "warn" "ScheduledBackups exist but no Backup CRDs found — backups may not have run yet" + else + add_check "cnpg-backups" "warn" "No CNPG Backup CRDs found" + fi + return + } + + local report + report=$(echo "$backups" | python3 -c " +import sys, json +from datetime import datetime, timezone + 
+data = json.load(sys.stdin) +items = data.get('items', []) +if not items: + print('WARN|No CNPG backups found') + sys.exit(0) + +# Group by cluster, find latest backup per cluster +clusters = {} +for b in items: + ns = b['metadata']['namespace'] + cluster = b.get('spec', {}).get('cluster', {}).get('name', 'unknown') + key = f'{ns}/{cluster}' + phase = b.get('status', {}).get('phase', 'unknown') + started = b.get('status', {}).get('startedAt', '') + stopped = b.get('status', {}).get('stoppedAt', '') + if key not in clusters or stopped > clusters[key].get('stopped', ''): + clusters[key] = {'phase': phase, 'started': started, 'stopped': stopped} + +results = [] +all_ok = True +now = datetime.now(timezone.utc) +for key, info in sorted(clusters.items()): + age_str = 'unknown' + if info['stopped']: + try: + stopped_dt = datetime.fromisoformat(info['stopped'].replace('Z', '+00:00')) + age = now - stopped_dt + age_hours = age.total_seconds() / 3600 + age_str = f'{age_hours:.1f}h ago' + if age_hours > 48: + all_ok = False + except Exception: + age_str = info['stopped'] + else: + all_ok = False + age_str = 'no completion time' + + phase = info['phase'] + if phase not in ('completed', 'Completed'): + all_ok = False + results.append(f'{key}: {phase} ({age_str})') + +status = 'OK' if all_ok else 'WARN' +print(f'{status}|' + '; '.join(results)) +" 2>/dev/null) || report="WARN|Failed to parse CNPG backups" + + local status_prefix="${report%%|*}" + local detail="${report#*|}" + + if [ "$status_prefix" = "OK" ]; then + add_check "cnpg-backups" "ok" "$detail" + else + add_check "cnpg-backups" "warn" "$detail" + fi +} + +# CNPG ScheduledBackup health +check_cnpg_scheduled() { + if $DRY_RUN; then + add_check "cnpg-scheduled-backups" "ok" "DRY RUN: would check CNPG ScheduledBackup status" + return + fi + + local scheduled + scheduled=$($KUBECTL get scheduledbackup.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || { + add_check "cnpg-scheduled-backups" "ok" "No CNPG ScheduledBackups configured" + return + } + + local report + report=$(echo "$scheduled" | python3 -c " +import sys, json +data = json.load(sys.stdin) +items = data.get('items', []) +if not items: + print('OK|No ScheduledBackups defined') + sys.exit(0) +results = [] +all_ok = True +for sb in items: + ns = sb['metadata']['namespace'] + name = sb['metadata']['name'] + schedule = sb.get('spec', {}).get('schedule', 'unknown') + suspend = sb.get('spec', {}).get('suspend', False) + last = sb.get('status', {}).get('lastScheduleTime', 'never') + if suspend: + all_ok = False + results.append(f'{ns}/{name}: SUSPENDED schedule={schedule}') + else: + results.append(f'{ns}/{name}: active schedule={schedule} last={last}') +status = 'OK' if all_ok else 'WARN' +print(f'{status}|' + '; '.join(results)) +" 2>/dev/null) || report="WARN|Failed to parse ScheduledBackups" + + local status_prefix="${report%%|*}" + local detail="${report#*|}" + + if [ "$status_prefix" = "OK" ]; then + add_check "cnpg-scheduled-backups" "ok" "$detail" + else + add_check "cnpg-scheduled-backups" "warn" "$detail" + fi +} + +# MySQL backup file freshness on NFS +check_mysql_backups() { + if $DRY_RUN; then + add_check "mysql-backups" "ok" "DRY RUN: would check MySQL backup file timestamps" + return + fi + + # Check for MySQL backup files via a pod that has NFS mounted, or via known backup job + local backup_pods + backup_pods=$($KUBECTL get pods --all-namespaces -l app=mysql-backup -o name 2>/dev/null | head -1) || true + if [ -z "$backup_pods" ]; then + backup_pods=$($KUBECTL get 
cronjobs --all-namespaces --no-headers 2>/dev/null | grep -i "mysql.*backup\|backup.*mysql" | awk '{print $1"/"$2}') || true + fi + + if [ -z "$backup_pods" ]; then + # Try checking via TrueNAS SSH for NFS backup files + local nfs_check + nfs_check=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@10.0.10.15 \ + "find /mnt/main -name '*.sql.gz' -o -name '*.sql' -o -name '*mysql*backup*' 2>/dev/null | head -5" 2>/dev/null) || true + + if [ -n "$nfs_check" ]; then + local ages + ages=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@10.0.10.15 \ + "for f in $(echo "$nfs_check" | tr '\n' ' '); do stat -f '%m %N' \"\$f\" 2>/dev/null || stat -c '%Y %n' \"\$f\" 2>/dev/null; done" 2>/dev/null) || true + if [ -n "$ages" ]; then + add_check "mysql-backups" "ok" "Found MySQL backup files on NFS: $(echo "$nfs_check" | tr '\n' '; ')" + else + add_check "mysql-backups" "warn" "Found backup files but cannot determine age: $(echo "$nfs_check" | tr '\n' '; ')" + fi + else + add_check "mysql-backups" "warn" "No MySQL backup CronJobs or backup files found" + fi + return + fi + + # Check CronJob last successful run + local cronjob_status + cronjob_status=$($KUBECTL get cronjobs --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +from datetime import datetime, timezone + +data = json.load(sys.stdin) +results = [] +for cj in data.get('items', []): + ns = cj['metadata']['namespace'] + name = cj['metadata']['name'] + if 'mysql' not in name.lower() and 'backup' not in name.lower(): + continue + schedule = cj.get('spec', {}).get('schedule', 'unknown') + last_time = cj.get('status', {}).get('lastScheduleTime', '') + last_success = cj.get('status', {}).get('lastSuccessfulTime', '') + suspend = cj.get('spec', {}).get('suspend', False) + + age_str = 'never' + if last_success: + try: + dt = datetime.fromisoformat(last_success.replace('Z', '+00:00')) + age = datetime.now(timezone.utc) - dt + age_str = f'{age.total_seconds()/3600:.1f}h ago' + except Exception: + age_str = last_success + + status = 'suspended' if suspend else 'active' + results.append(f'{ns}/{name}: {status} schedule={schedule} last_success={age_str}') + +if results: + print('; '.join(results)) +else: + print('No MySQL/backup CronJobs found') +" 2>/dev/null) || cronjob_status="Failed to check CronJobs" + + add_check "mysql-backups" "ok" "$cronjob_status" +} + +# Run all checks +check_cnpg_backups +check_cnpg_scheduled +check_mysql_backups + +# Determine overall status +OVERALL=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +statuses = [c['status'] for c in checks] +if 'fail' in statuses: + print('fail') +elif 'warn' in statuses: + print('warn') +else: + print('ok') +") + +echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool diff --git a/.claude/scripts/crowdsec-status.sh b/.claude/scripts/crowdsec-status.sh new file mode 100755 index 00000000..3c7ec0f6 --- /dev/null +++ b/.claude/scripts/crowdsec-status.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +AGENT="crowdsec-status" +DRY_RUN=false + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +find_crowdsec_namespace() { + $KUBECTL get pods -A -l app.kubernetes.io/name=crowdsec --no-headers 2>/dev/null | head -1 | 
awk '{print $1}' || \ + $KUBECTL get pods -A --no-headers 2>/dev/null | grep -i crowdsec | head -1 | awk '{print $1}' || \ + echo "crowdsec" +} + +check_lapi_health() { + if $DRY_RUN; then + add_check "crowdsec-lapi" "ok" "dry-run: would check CrowdSec LAPI pod health" + return + fi + + local ns + ns=$(find_crowdsec_namespace) + + local lapi_pod + lapi_pod=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi --no-headers 2>/dev/null | head -1) || true + + if [ -z "$lapi_pod" ]; then + lapi_pod=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1) || true + fi + + if [ -z "$lapi_pod" ]; then + add_check "crowdsec-lapi" "fail" "No CrowdSec LAPI pod found in namespace ${ns}" + return + fi + + local pod_name status + pod_name=$(echo "$lapi_pod" | awk '{print $1}') + status=$(echo "$lapi_pod" | awk '{print $3}') + + if [ "$status" != "Running" ]; then + add_check "crowdsec-lapi" "fail" "LAPI pod ${pod_name} is ${status}" + return + fi + + add_check "crowdsec-lapi" "ok" "LAPI pod ${pod_name} is Running" +} + +check_cscli_metrics() { + if $DRY_RUN; then + add_check "crowdsec-metrics" "ok" "dry-run: would run cscli metrics via kubectl exec" + return + fi + + local ns + ns=$(find_crowdsec_namespace) + + local lapi_pod + lapi_pod=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || \ + lapi_pod=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1 | awk '{print $1}') || true + + if [ -z "$lapi_pod" ]; then + add_check "crowdsec-metrics" "warn" "No LAPI pod found to run cscli metrics" + return + fi + + local metrics_output + metrics_output=$($KUBECTL exec -n "$ns" "$lapi_pod" -- cscli metrics 2>/dev/null) || { + add_check "crowdsec-metrics" "warn" "Failed to run cscli metrics on ${lapi_pod}" + return + } + + add_check "crowdsec-metrics" "ok" "cscli metrics returned successfully" +} + +check_decisions() { + if $DRY_RUN; then + add_check "crowdsec-decisions" "ok" "dry-run: would check cscli decisions list" + return + fi + + local ns + ns=$(find_crowdsec_namespace) + + local lapi_pod + lapi_pod=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/name=crowdsec,app.kubernetes.io/component=lapi -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || \ + lapi_pod=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i "crowdsec.*lapi" | head -1 | awk '{print $1}') || true + + if [ -z "$lapi_pod" ]; then + add_check "crowdsec-decisions" "warn" "No LAPI pod found to check decisions" + return + fi + + local decisions + decisions=$($KUBECTL exec -n "$ns" "$lapi_pod" -- cscli decisions list -o json 2>/dev/null) || { + add_check "crowdsec-decisions" "ok" "No active decisions (or failed to query)" + return + } + + local count + count=$(echo "$decisions" | jq 'if type == "array" then length else 0 end' 2>/dev/null || echo "0") + + if [ "$count" -gt 0 ]; then + add_check "crowdsec-decisions" "ok" "${count} active decision(s)" + else + add_check "crowdsec-decisions" "ok" "No active decisions" + fi +} + +check_agent_daemonset() { + if $DRY_RUN; then + add_check "crowdsec-agents" "ok" "dry-run: would check CrowdSec agent DaemonSet" + return + fi + + local ns + ns=$(find_crowdsec_namespace) + + local ds_json + ds_json=$($KUBECTL get daemonset -n "$ns" -l app.kubernetes.io/name=crowdsec -o json 2>/dev/null) || { + # Fallback: search by name + ds_json=$($KUBECTL get daemonset -n "$ns" -o json 
2>/dev/null | jq '{items: [.items[] | select(.metadata.name | test("crowdsec"))]}') || { + add_check "crowdsec-agents" "warn" "No CrowdSec DaemonSet found" + return + } + } + + local desired ready + desired=$(echo "$ds_json" | jq '[.items[].status.desiredNumberScheduled] | add // 0' 2>/dev/null || echo "0") + ready=$(echo "$ds_json" | jq '[.items[].status.numberReady] | add // 0' 2>/dev/null || echo "0") + + if [ "$ready" -lt "$desired" ]; then + add_check "crowdsec-agents" "warn" "CrowdSec agents: ${ready}/${desired} ready" + elif [ "$desired" -eq 0 ]; then + add_check "crowdsec-agents" "warn" "No CrowdSec agent DaemonSet pods scheduled" + else + add_check "crowdsec-agents" "ok" "CrowdSec agents: ${ready}/${desired} ready" + fi +} + +check_lapi_health +check_cscli_metrics +check_decisions +check_agent_daemonset + +# Output JSON +overall="ok" +for c in "${checks[@]}"; do + s=$(echo "$c" | jq -r '.status') + if [ "$s" = "fail" ]; then overall="fail"; break; fi + if [ "$s" = "warn" ]; then overall="warn"; fi +done + +printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \ + "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")" diff --git a/.claude/scripts/db-health.sh b/.claude/scripts/db-health.sh new file mode 100755 index 00000000..4edcc9c5 --- /dev/null +++ b/.claude/scripts/db-health.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +DRY_RUN=false +AGENT="db-health" + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +CHECKS="[]" + +add_check() { + local name="$1" status="$2" message="$3" + CHECKS=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''}) +json.dump(checks, sys.stdout) +") +} + +# MySQL InnoDB Cluster - Group Replication status +check_mysql_gr() { + if $DRY_RUN; then + add_check "mysql-group-replication" "ok" "DRY RUN: would check MySQL Group Replication status" + return + fi + + # Discover MySQL pod via labels first, fall back to known name + local mysql_pod + mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true + if [ -z "$mysql_pod" ]; then + mysql_pod=$($KUBECTL get pods -n dbaas -l app.kubernetes.io/name=mysql -o name 2>/dev/null | head -1) || true + fi + if [ -z "$mysql_pod" ]; then + mysql_pod="sts/mysql-cluster" + fi + + local gr_status + gr_status=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \ + "SELECT MEMBER_HOST, MEMBER_STATE, MEMBER_ROLE FROM performance_schema.replication_group_members" 2>/dev/null) || { + add_check "mysql-group-replication" "fail" "Cannot connect to MySQL cluster to check GR status" + return + } + + local member_count online_count + member_count=$(echo "$gr_status" | grep -c . 
|| true) + online_count=$(echo "$gr_status" | grep -c "ONLINE" || true) + + if [ "$online_count" -eq "$member_count" ] && [ "$member_count" -ge 3 ]; then + add_check "mysql-group-replication" "ok" "All $member_count members ONLINE: $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')" + elif [ "$online_count" -lt "$member_count" ]; then + add_check "mysql-group-replication" "fail" "Only $online_count/$member_count members ONLINE: $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')" + else + add_check "mysql-group-replication" "warn" "Cluster has $member_count members (expected 3): $(echo "$gr_status" | tr '\t' ' ' | tr '\n' '; ')" + fi +} + +# MySQL pod health +check_mysql_pods() { + if $DRY_RUN; then + add_check "mysql-pods" "ok" "DRY RUN: would check MySQL pod status" + return + fi + + local pod_status + pod_status=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o wide --no-headers 2>/dev/null) || \ + pod_status=$($KUBECTL get pods -n dbaas --no-headers 2>/dev/null | grep -i mysql) || { + add_check "mysql-pods" "warn" "Cannot find MySQL pods in dbaas namespace" + return + } + + local not_running + not_running=$(echo "$pod_status" | grep -v "Running" | grep -v "Completed" || true) + + if [ -z "$not_running" ]; then + local count + count=$(echo "$pod_status" | grep -c "Running" || true) + add_check "mysql-pods" "ok" "$count MySQL pod(s) running in dbaas namespace" + else + add_check "mysql-pods" "fail" "Unhealthy MySQL pods: $(echo "$not_running" | awk '{print $1": "$3}' | tr '\n' '; ')" + fi +} + +# CNPG PostgreSQL cluster health +check_cnpg() { + if $DRY_RUN; then + add_check "cnpg-clusters" "ok" "DRY RUN: would check CNPG PostgreSQL cluster health" + return + fi + + # Check if CNPG CRDs exist + local cnpg_clusters + cnpg_clusters=$($KUBECTL get cluster.postgresql.cnpg.io --all-namespaces -o json 2>/dev/null) || { + add_check "cnpg-clusters" "warn" "CNPG CRD not found or no clusters deployed" + return + } + + local report + report=$(echo "$cnpg_clusters" | python3 -c " +import sys, json +data = json.load(sys.stdin) +results = [] +all_healthy = True +for cluster in data.get('items', []): + ns = cluster['metadata']['namespace'] + name = cluster['metadata']['name'] + phase = cluster.get('status', {}).get('phase', 'unknown') + ready = cluster.get('status', {}).get('readyInstances', 0) + instances = cluster.get('spec', {}).get('instances', 0) + primary = cluster.get('status', {}).get('currentPrimary', 'unknown') + if phase != 'Cluster in healthy state' and phase != 'Healthy': + all_healthy = False + if ready < instances: + all_healthy = False + results.append(f'{ns}/{name}: phase={phase} ready={ready}/{instances} primary={primary}') +print('HEALTHY' if all_healthy else 'UNHEALTHY') +print('; '.join(results)) +" 2>/dev/null) || report="Failed to parse CNPG status" + + local health_line + health_line=$(echo "$report" | head -1) + local detail_line + detail_line=$(echo "$report" | tail -1) + + if [ "$health_line" = "HEALTHY" ]; then + add_check "cnpg-clusters" "ok" "$detail_line" + else + add_check "cnpg-clusters" "fail" "$detail_line" + fi +} + +# Database connection counts (MySQL) +check_mysql_connections() { + if $DRY_RUN; then + add_check "mysql-connections" "ok" "DRY RUN: would check MySQL connection counts" + return + fi + + local mysql_pod + mysql_pod=$($KUBECTL get pods -n dbaas -l app=mysql-cluster -o name 2>/dev/null | head -1) || true + if [ -z "$mysql_pod" ]; then + mysql_pod="sts/mysql-cluster" + fi + + local conn_info + conn_info=$($KUBECTL exec "$mysql_pod" -n dbaas -- mysql -N -e \ 
+ "SELECT 'threads_connected', VARIABLE_VALUE FROM performance_schema.global_status WHERE VARIABLE_NAME='Threads_connected' UNION ALL SELECT 'max_connections', VARIABLE_VALUE FROM performance_schema.global_variables WHERE VARIABLE_NAME='max_connections'" 2>/dev/null) || { + add_check "mysql-connections" "warn" "Cannot query MySQL connection info" + return + } + + local threads_connected max_connections + threads_connected=$(echo "$conn_info" | grep threads_connected | awk '{print $2}') || threads_connected="unknown" + max_connections=$(echo "$conn_info" | grep max_connections | awk '{print $2}') || max_connections="unknown" + + if [ "$threads_connected" != "unknown" ] && [ "$max_connections" != "unknown" ]; then + local pct=$((threads_connected * 100 / max_connections)) + if [ "$pct" -gt 80 ]; then + add_check "mysql-connections" "fail" "MySQL connections at ${pct}%: $threads_connected/$max_connections" + elif [ "$pct" -gt 60 ]; then + add_check "mysql-connections" "warn" "MySQL connections at ${pct}%: $threads_connected/$max_connections" + else + add_check "mysql-connections" "ok" "MySQL connections: $threads_connected/$max_connections (${pct}%)" + fi + else + add_check "mysql-connections" "warn" "MySQL connections: threads=$threads_connected max=$max_connections" + fi +} + +# Run all checks +check_mysql_gr +check_mysql_pods +check_cnpg +check_mysql_connections + +# Determine overall status +OVERALL=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +statuses = [c['status'] for c in checks] +if 'fail' in statuses: + print('fail') +elif 'warn' in statuses: + print('warn') +else: + print('ok') +") + +echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool diff --git a/.claude/scripts/deploy-status.sh b/.claude/scripts/deploy-status.sh new file mode 100755 index 00000000..a958ad41 --- /dev/null +++ b/.claude/scripts/deploy-status.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +DRY_RUN=false +AGENT="deploy-status" + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +CHECKS="[]" + +add_check() { + local name="$1" status="$2" message="$3" + CHECKS=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +checks.append({'name': '''$name''', 'status': '''$status''', 'message': '''$message'''}) +json.dump(checks, sys.stdout) +") +} + +# Check for stalled rollouts (Progressing=False or deadline exceeded) +check_stalled_rollouts() { + if $DRY_RUN; then + add_check "stalled-rollouts" "ok" "DRY RUN: would check for stalled deployment rollouts" + return + fi + + local stalled + stalled=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +stalled = [] +for dep in data.get('items', []): + ns = dep['metadata']['namespace'] + name = dep['metadata']['name'] + conditions = dep.get('status', {}).get('conditions', []) + for cond in conditions: + if cond.get('type') == 'Progressing' and cond.get('status') == 'False': + reason = cond.get('reason', 'unknown') + stalled.append(f'{ns}/{name}: {reason}') + elif cond.get('type') == 'Available' and cond.get('status') == 'False': + reason = cond.get('reason', 'unknown') + stalled.append(f'{ns}/{name}: unavailable ({reason})') +if stalled: + print('; '.join(stalled)) +else: + print('') +" 2>/dev/null) || stalled="Failed to check deployments" + + if [ -z "$stalled" ]; then + add_check 
"stalled-rollouts" "ok" "No stalled rollouts detected" + else + add_check "stalled-rollouts" "fail" "Stalled rollouts: $stalled" + fi +} + +# Check for unavailable replicas +check_unavailable_replicas() { + if $DRY_RUN; then + add_check "unavailable-replicas" "ok" "DRY RUN: would check for deployments with unavailable replicas" + return + fi + + local unavail + unavail=$($KUBECTL get deployments --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +issues = [] +for dep in data.get('items', []): + ns = dep['metadata']['namespace'] + name = dep['metadata']['name'] + spec_replicas = dep.get('spec', {}).get('replicas', 1) + ready = dep.get('status', {}).get('readyReplicas', 0) or 0 + unavailable = dep.get('status', {}).get('unavailableReplicas', 0) or 0 + if unavailable > 0 or ready < spec_replicas: + issues.append(f'{ns}/{name}: {ready}/{spec_replicas} ready, {unavailable} unavailable') +if issues: + print('; '.join(issues)) +else: + print('') +" 2>/dev/null) || unavail="Failed to check replicas" + + if [ -z "$unavail" ]; then + add_check "unavailable-replicas" "ok" "All deployments have desired replicas ready" + else + add_check "unavailable-replicas" "warn" "Unavailable replicas: $unavail" + fi +} + +# Check for image pull errors +check_image_pull_errors() { + if $DRY_RUN; then + add_check "image-pull-errors" "ok" "DRY RUN: would check for ImagePullBackOff/ErrImagePull pods" + return + fi + + local pull_errors + pull_errors=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +errors = [] +for pod in data.get('items', []): + ns = pod['metadata']['namespace'] + name = pod['metadata']['name'] + for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []): + waiting = cs.get('state', {}).get('waiting', {}) + reason = waiting.get('reason', '') + if reason in ('ImagePullBackOff', 'ErrImagePull', 'InvalidImageName'): + image = cs.get('image', 'unknown') + msg = waiting.get('message', '')[:100] + errors.append(f'{ns}/{name}: {reason} image={image} ({msg})') +if errors: + print('; '.join(errors)) +else: + print('') +" 2>/dev/null) || pull_errors="Failed to check image pulls" + + if [ -z "$pull_errors" ]; then + add_check "image-pull-errors" "ok" "No image pull errors found" + else + add_check "image-pull-errors" "fail" "Image pull errors: $pull_errors" + fi +} + +# Check for recent restarts (>5 in last hour) +check_recent_restarts() { + if $DRY_RUN; then + add_check "recent-restarts" "ok" "DRY RUN: would check for pods with high restart counts" + return + fi + + local restarts + restarts=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +high_restart = [] +for pod in data.get('items', []): + ns = pod['metadata']['namespace'] + name = pod['metadata']['name'] + for cs in pod.get('status', {}).get('containerStatuses', []): + count = cs.get('restartCount', 0) + if count >= 5: + container = cs['name'] + high_restart.append(f'{ns}/{name}:{container} restarts={count}') +if high_restart: + print('; '.join(sorted(high_restart, key=lambda x: int(x.split('=')[1]), reverse=True)[:20])) +else: + print('') +" 2>/dev/null) || restarts="Failed to check restarts" + + if [ -z "$restarts" ]; then + add_check "recent-restarts" "ok" "No pods with 5+ restarts" + else + add_check "recent-restarts" "warn" "High restart counts: $restarts" + fi +} + +# Check CrashLoopBackOff pods 
+check_crashloop() { + if $DRY_RUN; then + add_check "crashloop" "ok" "DRY RUN: would check for CrashLoopBackOff pods" + return + fi + + local crashloop + crashloop=$($KUBECTL get pods --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +crashes = [] +for pod in data.get('items', []): + ns = pod['metadata']['namespace'] + name = pod['metadata']['name'] + for cs in pod.get('status', {}).get('containerStatuses', []): + waiting = cs.get('state', {}).get('waiting', {}) + if waiting.get('reason') == 'CrashLoopBackOff': + container = cs['name'] + restarts = cs.get('restartCount', 0) + crashes.append(f'{ns}/{name}:{container} restarts={restarts}') +if crashes: + print('; '.join(crashes)) +else: + print('') +" 2>/dev/null) || crashloop="Failed to check crashloop" + + if [ -z "$crashloop" ]; then + add_check "crashloop" "ok" "No CrashLoopBackOff pods" + else + add_check "crashloop" "fail" "CrashLoopBackOff: $crashloop" + fi +} + +# Run all checks +check_stalled_rollouts +check_unavailable_replicas +check_image_pull_errors +check_recent_restarts +check_crashloop + +# Determine overall status +OVERALL=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +statuses = [c['status'] for c in checks] +if 'fail' in statuses: + print('fail') +elif 'warn' in statuses: + print('warn') +else: + print('ok') +") + +echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool diff --git a/.claude/scripts/dns-check.sh b/.claude/scripts/dns-check.sh new file mode 100755 index 00000000..71704133 --- /dev/null +++ b/.claude/scripts/dns-check.sh @@ -0,0 +1,144 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +AGENT="dns-check" +DRY_RUN=false + +# Internal DNS server (Technitium) +INTERNAL_DNS="10.0.20.100" +# Public DNS +PUBLIC_DNS="1.1.1.1" + +# Services to check +SERVICES=( + "grafana.viktorbarzin.me" + "prometheus.viktorbarzin.me" + "nextcloud.viktorbarzin.me" + "authentik.viktorbarzin.me" + "viktorbarzin.me" +) + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +check_dns_resolution() { + if $DRY_RUN; then + add_check "dns-resolution" "ok" "dry-run: would resolve ${#SERVICES[@]} services via internal and public DNS" + return + fi + + local failures=0 mismatches=0 successes=0 + local failure_details="" mismatch_details="" + + for svc in "${SERVICES[@]}"; do + local internal_result public_result + + internal_result=$(dig +short "$svc" @"$INTERNAL_DNS" A 2>/dev/null | head -1) || internal_result="" + public_result=$(dig +short "$svc" @"$PUBLIC_DNS" A 2>/dev/null | head -1) || public_result="" + + if [ -z "$internal_result" ] && [ -z "$public_result" ]; then + failures=$((failures + 1)) + failure_details="${failure_details}${svc} (both resolvers failed); " + elif [ -z "$internal_result" ]; then + failures=$((failures + 1)) + failure_details="${failure_details}${svc} (internal DNS failed); " + elif [ -z "$public_result" ]; then + # Public might use CNAME/proxy, not necessarily a failure + successes=$((successes + 1)) + elif [ "$internal_result" != "$public_result" ]; then + # Mismatch is informational — Cloudflare proxy IPs differ from internal IPs + mismatches=$((mismatches + 1)) + mismatch_details="${mismatch_details}${svc} 
(internal=${internal_result} public=${public_result}); " + successes=$((successes + 1)) + else + successes=$((successes + 1)) + fi + done + + if [ "$failures" -gt 0 ]; then + add_check "dns-resolution" "fail" "${failures} DNS failures: ${failure_details}" + elif [ "$mismatches" -gt 0 ]; then + add_check "dns-resolution" "ok" "${successes}/${#SERVICES[@]} resolved. ${mismatches} internal/public mismatches (expected with Cloudflare proxy): ${mismatch_details}" + else + add_check "dns-resolution" "ok" "All ${successes}/${#SERVICES[@]} services resolved successfully" + fi +} + +check_technitium_health() { + if $DRY_RUN; then + add_check "technitium" "ok" "dry-run: would check Technitium DNS server pod health" + return + fi + + local tech_pods + tech_pods=$($KUBECTL get pods -A -l app.kubernetes.io/name=technitium --no-headers 2>/dev/null) || \ + tech_pods=$($KUBECTL get pods -A --no-headers 2>/dev/null | grep -i technitium || true) + + if [ -z "$tech_pods" ]; then + add_check "technitium" "warn" "No Technitium pods found" + return + fi + + local not_running + not_running=$(echo "$tech_pods" | grep -v "Running" | grep -c "." 2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "technitium" "fail" "Technitium pod(s) not running" + else + add_check "technitium" "ok" "Technitium DNS server pod(s) running" + fi +} + +check_coredns_health() { + if $DRY_RUN; then + add_check "coredns" "ok" "dry-run: would check CoreDNS pod health" + return + fi + + local coredns_pods + coredns_pods=$($KUBECTL get pods -n kube-system -l k8s-app=kube-dns --no-headers 2>/dev/null) || { + add_check "coredns" "warn" "Failed to query CoreDNS pods" + return + } + + if [ -z "$coredns_pods" ]; then + add_check "coredns" "warn" "No CoreDNS pods found" + return + fi + + local total not_running + total=$(echo "$coredns_pods" | grep -c "." 2>/dev/null || echo "0") + not_running=$(echo "$coredns_pods" | grep -v "Running" | grep -c "." 
2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "coredns" "fail" "${not_running}/${total} CoreDNS pod(s) not running" + else + add_check "coredns" "ok" "All ${total} CoreDNS pod(s) running" + fi +} + +check_dns_resolution +check_technitium_health +check_coredns_health + +# Output JSON +overall="ok" +for c in "${checks[@]}"; do + s=$(echo "$c" | jq -r '.status') + if [ "$s" = "fail" ]; then overall="fail"; break; fi + if [ "$s" = "warn" ]; then overall="warn"; fi +done + +printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \ + "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")" diff --git a/.claude/scripts/monitoring-health.sh b/.claude/scripts/monitoring-health.sh new file mode 100755 index 00000000..a269e19f --- /dev/null +++ b/.claude/scripts/monitoring-health.sh @@ -0,0 +1,281 @@ +#!/usr/bin/env bash +set -euo pipefail + +AGENT="monitoring-health" +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +MONITORING_NS="monitoring" +DRY_RUN=false + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +check_prometheus() { + if $DRY_RUN; then + add_check "prometheus" "ok" "dry-run: would check Prometheus server health" + return + fi + + # Discover Prometheus server pod via labels + local prom_pod + prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1) + if [ -z "$prom_pod" ]; then + prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app=prometheus,component=server -o name 2>/dev/null | head -1) + fi + if [ -z "$prom_pod" ]; then + prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1) + fi + + if [ -z "$prom_pod" ]; then + add_check "prometheus" "fail" "No Prometheus server pod found in $MONITORING_NS" + return + fi + + local phase + phase=$($KUBECTL get "$prom_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) + if [ "$phase" != "Running" ]; then + add_check "prometheus" "fail" "Prometheus server pod phase: $phase" + return + fi + + # Check Prometheus is responding + local prom_healthy + prom_healthy=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \ + wget -q -O- "http://localhost:9090/-/healthy" 2>/dev/null || echo "unhealthy") + + if echo "$prom_healthy" | grep -qi "ok\|healthy"; then + # Check target scraping + local targets_up + targets_up=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \ + wget -q -O- "http://localhost:9090/api/v1/targets" 2>/dev/null | \ + python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + active = data.get('data',{}).get('activeTargets',[]) + up = sum(1 for t in active if t.get('health') == 'up') + total = len(active) + print(f'{up}/{total}') +except: print('unknown') +" 2>/dev/null || echo "unknown") + add_check "prometheus" "ok" "Prometheus server healthy, targets: $targets_up up" + else + add_check "prometheus" "warn" "Prometheus server running but health check unclear" + fi +} + +check_alertmanager() { + if $DRY_RUN; then + add_check "alertmanager" "ok" "dry-run: would check Alertmanager health" + return + fi + + # Discover Alertmanager pod + local am_pod + am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=alertmanager -o name 2>/dev/null | head -1) + if [ -z "$am_pod" ]; 
then + am_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep alertmanager | head -1) + fi + + if [ -z "$am_pod" ]; then + add_check "alertmanager" "fail" "No Alertmanager pod found in $MONITORING_NS" + return + fi + + local phase + phase=$($KUBECTL get "$am_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) + if [ "$phase" != "Running" ]; then + add_check "alertmanager" "fail" "Alertmanager pod phase: $phase" + return + fi + + # Check firing alerts + local alert_info + alert_info=$($KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \ + wget -q -O- "http://localhost:9093/api/v2/alerts?active=true" 2>/dev/null | \ + python3 -c " +import sys, json +try: + alerts = json.load(sys.stdin) + firing = [a for a in alerts if a.get('status',{}).get('state') == 'active'] + print(len(firing)) +except: print('unknown') +" 2>/dev/null || echo "unknown") + + # Check silences + local silence_count + silence_count=$($KUBECTL exec "$am_pod" -n "$MONITORING_NS" -- \ + wget -q -O- "http://localhost:9093/api/v2/silences" 2>/dev/null | \ + python3 -c " +import sys, json +try: + silences = json.load(sys.stdin) + active = [s for s in silences if s.get('status',{}).get('state') == 'active'] + print(len(active)) +except: print('0') +" 2>/dev/null || echo "0") + + if [ "$alert_info" = "unknown" ]; then + add_check "alertmanager" "warn" "Alertmanager running but could not query alerts" + else + local status="ok" + [ "$alert_info" -gt 0 ] 2>/dev/null && status="warn" + add_check "alertmanager" "$status" "Alertmanager healthy: $alert_info firing alerts, $silence_count active silences" + fi +} + +check_grafana() { + if $DRY_RUN; then + add_check "grafana" "ok" "dry-run: would check Grafana health" + return + fi + + # Discover Grafana pod + local grafana_pod + grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=grafana -o name 2>/dev/null | head -1) + if [ -z "$grafana_pod" ]; then + grafana_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep grafana | grep -v test | head -1) + fi + + if [ -z "$grafana_pod" ]; then + add_check "grafana" "fail" "No Grafana pod found in $MONITORING_NS" + return + fi + + local phase + phase=$($KUBECTL get "$grafana_pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null) + if [ "$phase" != "Running" ]; then + add_check "grafana" "fail" "Grafana pod phase: $phase" + return + fi + + # Check datasource connectivity + local ds_info + ds_info=$($KUBECTL exec "$grafana_pod" -n "$MONITORING_NS" -- \ + curl -sf "http://localhost:3000/api/datasources" 2>/dev/null | \ + python3 -c " +import sys, json +try: + ds = json.load(sys.stdin) + names = [d.get('name','?') for d in ds] + print(f'{len(ds)} datasources: {\", \".join(names)}') +except: print('unknown') +" 2>/dev/null || echo "unknown") + + if [ "$ds_info" = "unknown" ]; then + add_check "grafana" "warn" "Grafana running but could not query datasources (may need auth)" + else + add_check "grafana" "ok" "Grafana healthy, $ds_info" + fi +} + +check_snmp_exporters() { + if $DRY_RUN; then + add_check "snmp-exporters" "ok" "dry-run: would check SNMP exporter pods" + return + fi + + local exporters=("snmp-exporter" "idrac-redfish-exporter" "proxmox-exporter") + local running=0 total=0 + + for exporter in "${exporters[@]}"; do + total=$((total + 1)) + local pod + pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep "$exporter" | head -1) + + if [ -z "$pod" ]; then + # Try all namespaces + pod=$($KUBECTL get pods --all-namespaces -o 
custom-columns=NS:.metadata.namespace,NAME:.metadata.name --no-headers 2>/dev/null | \
+                grep "$exporter" | head -1)
+            if [ -z "$pod" ]; then
+                add_check "exporter-$exporter" "warn" "$exporter pod not found"
+                continue
+            fi
+            local ns
+            ns=$(echo "$pod" | awk '{print $1}')
+            local name
+            name=$(echo "$pod" | awk '{print $2}')
+            local phase
+            phase=$($KUBECTL get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null)
+            if [ "$phase" = "Running" ]; then
+                running=$((running + 1))
+                add_check "exporter-$exporter" "ok" "$exporter running in $ns"
+            else
+                add_check "exporter-$exporter" "warn" "$exporter phase: $phase in $ns"
+            fi
+        else
+            local phase
+            phase=$($KUBECTL get "$pod" -n "$MONITORING_NS" -o jsonpath='{.status.phase}' 2>/dev/null)
+            if [ "$phase" = "Running" ]; then
+                running=$((running + 1))
+                add_check "exporter-$exporter" "ok" "$exporter running"
+            else
+                add_check "exporter-$exporter" "warn" "$exporter phase: $phase"
+            fi
+        fi
+    done
+}
+
+check_prometheus_storage() {
+    if $DRY_RUN; then
+        add_check "prometheus-storage" "ok" "dry-run: would check Prometheus storage usage"
+        return
+    fi
+
+    local prom_pvc
+    prom_pvc=$($KUBECTL get pvc -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1)
+
+    if [ -z "$prom_pvc" ]; then
+        add_check "prometheus-storage" "warn" "No Prometheus server PVC found"
+        return
+    fi
+
+    # Check storage via Prometheus TSDB stats
+    local prom_pod
+    prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -l app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server -o name 2>/dev/null | head -1)
+    if [ -z "$prom_pod" ]; then
+        prom_pod=$($KUBECTL get pods -n "$MONITORING_NS" -o name 2>/dev/null | grep prometheus-server | head -1)
+    fi
+
+    if [ -n "$prom_pod" ]; then
+        local storage_info
+        storage_info=$($KUBECTL exec "$prom_pod" -n "$MONITORING_NS" -c prometheus-server -- \
+            df -h /data 2>/dev/null | tail -1 | awk '{printf "%s used of %s (%s)", $3, $2, $5}' || echo "unknown")
+        add_check "prometheus-storage" "ok" "Prometheus storage: $storage_info"
+    else
+        add_check "prometheus-storage" "warn" "Could not check Prometheus storage"
+    fi
+}
+
+# Run checks
+check_prometheus
+check_alertmanager
+check_grafana
+check_snmp_exporters
+check_prometheus_storage
+
+# Determine overall status
+overall="ok"
+for c in "${checks[@]}"; do
+    if echo "$c" | grep -q '"status": "fail"'; then
+        overall="fail"
+        break
+    elif echo "$c" | grep -q '"status": "warn"'; then
+        overall="warn"
+    fi
+done
+
+# Output JSON
+checks_json=$(IFS=,; echo "${checks[*]}")
+cat <<EOF
+{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
+EOF
diff --git a/.claude/scripts/network-health.sh b/.claude/scripts/network-health.sh
new file mode 100755
--- /dev/null
+++ b/.claude/scripts/network-health.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+AGENT="network-health"
+KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
+# CLI wrapper around the pfSense API; assumed to be available on PATH
+PFSENSE="pfsense.py"
+DRY_RUN=false
+
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run) DRY_RUN=true ;;
+    esac
+done
+
+checks=()
+
+add_check() {
+    local name="$1" status="$2" message="$3"
+    checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
+}
+
+check_pfsense_status() {
+    if $DRY_RUN; then
+        add_check "pfsense" "ok" "dry-run: would check pfSense system status via pfsense.py"
+        return
+    fi
+
+    local pf_output
+    # "status" subcommand assumed; the wireguard subcommand below is invoked the same way
+    pf_output=$($PFSENSE status 2>/dev/null) || {
+        add_check "pfsense" "fail" "Failed to connect to pfSense via pfsense.py"
+        return
+    }
+
+    if echo "$pf_output" | grep -qi "error\|fail\|down"; then
+        add_check "pfsense" "warn" "pfSense reported issues: $(echo "$pf_output" | head -3 | tr '\n' ' ')"
+    else
+        add_check "pfsense" "ok" "pfSense system healthy"
+    fi
+}
+
+check_vpn_status() {
+    if $DRY_RUN; then
+        add_check "vpn" "ok" "dry-run: would check VPN tunnel status via pfsense.py"
+        return
+    fi
+
+    local vpn_output
+    vpn_output=$($PFSENSE wireguard 2>/dev/null) || {
+        add_check "vpn" "warn" "Failed to query VPN status via pfsense.py"
+        return
+    }
+
+    if echo "$vpn_output" | grep -qi "error\|fail\|down"; then
+        add_check "vpn" "warn" "VPN issues detected: $(echo "$vpn_output" | head -3 | tr '\n' ' ')"
+    else
+        add_check "vpn" "ok" "VPN tunnels healthy"
+    fi
+}
+
+check_metallb_speakers() {
+    if $DRY_RUN; then
+        add_check "metallb-speakers" "ok" "dry-run: would check MetalLB speaker pod health"
+        return
+    fi
+
+    local 
ns="metallb-system" + + # Find MetalLB speaker pods via labels first + local speaker_pods + speaker_pods=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=speaker --no-headers 2>/dev/null) || \ + speaker_pods=$($KUBECTL get pods -n "$ns" -l component=speaker --no-headers 2>/dev/null) || \ + speaker_pods=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i speaker || true) + + if [ -z "$speaker_pods" ]; then + add_check "metallb-speakers" "warn" "No MetalLB speaker pods found in ${ns}" + return + fi + + local total not_running + total=$(echo "$speaker_pods" | grep -c "." 2>/dev/null || echo "0") + not_running=$(echo "$speaker_pods" | grep -v "Running" | grep -c "." 2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "metallb-speakers" "fail" "${not_running}/${total} MetalLB speaker pod(s) not running" + else + add_check "metallb-speakers" "ok" "All ${total} MetalLB speaker pod(s) running" + fi +} + +check_metallb_l2() { + if $DRY_RUN; then + add_check "metallb-l2" "ok" "dry-run: would check MetalLB L2 advertisements" + return + fi + + local ns="metallb-system" + + # Check L2Advertisement CRDs + local l2_ads + l2_ads=$($KUBECTL get l2advertisements -n "$ns" -o json 2>/dev/null) || { + add_check "metallb-l2" "warn" "Could not query L2Advertisement CRDs" + return + } + + local count + count=$(echo "$l2_ads" | jq '.items | length' 2>/dev/null || echo "0") + + if [ "$count" -eq 0 ]; then + add_check "metallb-l2" "warn" "No L2Advertisement resources found" + else + # Check MetalLB controller + local controller + controller=$($KUBECTL get pods -n "$ns" -l app.kubernetes.io/component=controller --no-headers 2>/dev/null) || \ + controller=$($KUBECTL get pods -n "$ns" --no-headers 2>/dev/null | grep -i controller || true) + + if [ -z "$controller" ]; then + add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but no controller pod" + elif echo "$controller" | grep -q "Running"; then + add_check "metallb-l2" "ok" "${count} L2Advertisement(s) configured, controller running" + else + add_check "metallb-l2" "warn" "${count} L2Advertisement(s) found but controller not running" + fi + fi +} + +check_node_connectivity() { + if $DRY_RUN; then + add_check "node-connectivity" "ok" "dry-run: would ping k8s nodes" + return + fi + + local nodes=("10.0.20.100" "10.0.20.101" "10.0.20.102" "10.0.20.103" "10.0.20.104") + local names=("k8s-master" "k8s-node1" "k8s-node2" "k8s-node3" "k8s-node4") + local failures=0 + local failure_details="" + + for i in "${!nodes[@]}"; do + if ! 
ping -c 1 -W 2 "${nodes[$i]}" >/dev/null 2>&1; then + failures=$((failures + 1)) + failure_details="${failure_details}${names[$i]}(${nodes[$i]}) " + fi + done + + if [ "$failures" -gt 0 ]; then + add_check "node-connectivity" "fail" "${failures} node(s) unreachable: ${failure_details}" + else + add_check "node-connectivity" "ok" "All ${#nodes[@]} nodes reachable" + fi +} + +check_pfsense_status +check_vpn_status +check_metallb_speakers +check_metallb_l2 +check_node_connectivity + +# Output JSON +overall="ok" +for c in "${checks[@]}"; do + s=$(echo "$c" | jq -r '.status') + if [ "$s" = "fail" ]; then overall="fail"; break; fi + if [ "$s" = "warn" ]; then overall="warn"; fi +done + +printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \ + "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")" diff --git a/.claude/scripts/nfs-health.sh b/.claude/scripts/nfs-health.sh new file mode 100755 index 00000000..dc933868 --- /dev/null +++ b/.claude/scripts/nfs-health.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +set -euo pipefail + +AGENT="nfs-health" +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +TRUENAS_HOST="10.0.10.15" +NODES=("k8s-master:10.0.20.100" "k8s-node1:10.0.20.101" "k8s-node2:10.0.20.102" "k8s-node3:10.0.20.103" "k8s-node4:10.0.20.104") +SSH_USER="wizard" +DRY_RUN=false + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +check_truenas_reachable() { + if $DRY_RUN; then + add_check "truenas-reachable" "ok" "dry-run: would ping $TRUENAS_HOST" + return + fi + if timeout 5 ping -c 1 "$TRUENAS_HOST" &>/dev/null; then + add_check "truenas-reachable" "ok" "TrueNAS at $TRUENAS_HOST is reachable" + else + add_check "truenas-reachable" "fail" "TrueNAS at $TRUENAS_HOST is unreachable" + fi +} + +check_truenas_nfs_service() { + if $DRY_RUN; then + add_check "truenas-nfs-service" "ok" "dry-run: would check NFS service on TrueNAS" + return + fi + local result + if result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$TRUENAS_HOST" \ + "service nfs-server status 2>/dev/null || systemctl is-active nfs-server 2>/dev/null || echo 'unknown'" 2>/dev/null); then + if echo "$result" | grep -qiE "running|active|is running"; then + add_check "truenas-nfs-service" "ok" "NFS service is running on TrueNAS" + else + add_check "truenas-nfs-service" "warn" "NFS service status unclear: $(echo "$result" | head -1 | tr '"' "'")" + fi + else + add_check "truenas-nfs-service" "fail" "Could not check NFS service on TrueNAS via SSH" + fi +} + +check_node_nfs_mounts() { + local node_name="$1" node_ip="$2" + + if $DRY_RUN; then + add_check "nfs-mounts-$node_name" "ok" "dry-run: would check NFS mounts on $node_name ($node_ip)" + return + fi + + local mount_output + if ! 
mount_output=$(timeout 15 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" \
+        "mount | grep nfs" 2>/dev/null); then
+        add_check "nfs-mounts-$node_name" "warn" "No NFS mounts found or SSH failed on $node_name ($node_ip)"
+        return
+    fi
+
+    if [ -z "$mount_output" ]; then
+        add_check "nfs-mounts-$node_name" "warn" "No NFS mounts found on $node_name"
+        return
+    fi
+
+    local mount_count
+    mount_count=$(echo "$mount_output" | wc -l | tr -d ' ')
+
+    # Check for stale mounts by trying to stat each mount point
+    local stale_count=0
+    local stale_mounts=""
+    while IFS= read -r line; do
+        local mount_point
+        mount_point=$(echo "$line" | awk '{print $3}')
+        if [ -n "$mount_point" ]; then
+            if ! timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_USER@$node_ip" \
+                "timeout 5 stat '$mount_point' >/dev/null 2>&1" 2>/dev/null; then
+                stale_count=$((stale_count + 1))
+                stale_mounts="$stale_mounts $mount_point"
+            fi
+        fi
+    done <<< "$mount_output"
+
+    if [ "$stale_count" -gt 0 ]; then
+        add_check "nfs-mounts-$node_name" "fail" "$stale_count/$mount_count NFS mounts stale on $node_name:$stale_mounts"
+    else
+        add_check "nfs-mounts-$node_name" "ok" "$mount_count NFS mounts healthy on $node_name"
+    fi
+}
+
+check_nfs_pvcs() {
+    if $DRY_RUN; then
+        add_check "nfs-pvcs" "ok" "dry-run: would check NFS-backed PVCs"
+        return
+    fi
+
+    local pending
+    pending=$($KUBECTL get pvc --all-namespaces --field-selector='status.phase!=Bound' -o json 2>/dev/null | \
+        python3 -c "import sys,json; items=json.load(sys.stdin).get('items',[]); nfs=[i for i in items if 'nfs' in json.dumps(i).lower()]; print(len(nfs))" 2>/dev/null || echo "error")
+
+    if [ "$pending" = "error" ]; then
+        add_check "nfs-pvcs" "warn" "Could not check NFS PVC status"
+    elif [ "$pending" = "0" ]; then
+        add_check "nfs-pvcs" "ok" "All NFS-backed PVCs are bound"
+    else
+        add_check "nfs-pvcs" "fail" "$pending NFS-backed PVCs are not bound"
+    fi
+}
+
+# Run checks
+check_truenas_reachable
+check_truenas_nfs_service
+
+for node_entry in "${NODES[@]}"; do
+    node_name="${node_entry%%:*}"
+    node_ip="${node_entry##*:}"
+    check_node_nfs_mounts "$node_name" "$node_ip"
+done
+
+check_nfs_pvcs
+
+# Determine overall status
+overall="ok"
+for c in "${checks[@]}"; do
+    if echo "$c" | grep -q '"status": "fail"'; then
+        overall="fail"
+        break
+    elif echo "$c" | grep -q '"status": "warn"'; then
+        overall="warn"
+    fi
+done
+
+# Output JSON
+checks_json=$(IFS=,; echo "${checks[*]}")
+cat <<EOF
+{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
+EOF
diff --git a/.claude/scripts/oom-investigator.sh b/.claude/scripts/oom-investigator.sh
new file mode 100755
--- /dev/null
+++ b/.claude/scripts/oom-investigator.sh
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+AGENT="oom-investigator"
+KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
+DRY_RUN=false
+
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run) DRY_RUN=true ;;
+    esac
+done
+
+# Checks accumulate as a JSON array in $CHECKS
+CHECKS="[]"
+
+add_check() {
+    local name="$1" status="$2" message="$3"
+    CHECKS=$(echo "$CHECKS" | python3 -c "
+import sys, json
+checks = json.load(sys.stdin)
+checks.append({'name': sys.argv[1], 'status': sys.argv[2], 'message': sys.argv[3]})
+print(json.dumps(checks))
+" "$name" "$status" "$message")
+}
+
+# Find containers that were OOMKilled
+find_oomkilled() {
+    if $DRY_RUN; then
+        add_check "oom-killed-pods" "ok" "DRY RUN: would scan for OOMKilled containers"
+        return
+    fi
+
+    local oom_pods
+    oom_pods=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+out = []
+for pod in data.get('items', []):
+    for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
+        for state in [cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]:
+            if state.get('reason') == 'OOMKilled':
+                out.append({'namespace': pod['metadata']['namespace'], 'pod': pod['metadata']['name'], 'container': cs['name'], 'restarts': cs.get('restartCount', 0), 'finishedAt': state.get('finishedAt', 'unknown')})
+                break
+print(json.dumps(out))
+" 2>/dev/null) || oom_pods="[]"
+
+    local count
+    count=$(echo "$oom_pods" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))")
+
+    if [ "$count" -eq 0 ]; then
+        add_check "oom-killed-pods" "ok" "No OOMKilled pods found"
+    else
+        add_check "oom-killed-pods" "fail" "Found $count OOMKilled container(s): $(echo "$oom_pods" | python3 -c "
+import sys,json
+pods = json.load(sys.stdin)
+print('; '.join(f\"{p['namespace']}/{p['pod']}:{p['container']} (restarts={p['restarts']}, at={p['finishedAt']})\" for p in pods))
+")"
+    fi
+}
+
+# Check LimitRange defaults in namespaces with OOM events
+check_limitranges() {
+    if $DRY_RUN; then
+        add_check "limitranges" "ok" "DRY RUN: would check LimitRange defaults"
+        return
+    fi
+
+    local namespaces
+    namespaces=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+ns_set = set()
+for pod in data.get('items', []):
+    for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []):
+        for state in 
[cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]: + if state.get('reason') == 'OOMKilled': + ns_set.add(pod['metadata']['namespace']) +for ns in sorted(ns_set): + print(ns) +" 2>/dev/null) || namespaces="" + + if [ -z "$namespaces" ]; then + add_check "limitranges" "ok" "No namespaces with OOMKilled pods to check" + return + fi + + local lr_info="" + while IFS= read -r ns; do + local lr + lr=$($KUBECTL get limitrange -n "$ns" -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +for item in data.get('items', []): + for limit in item.get('spec', {}).get('limits', []): + if limit.get('type') == 'Container': + default_mem = limit.get('default', {}).get('memory', 'none') + default_cpu = limit.get('default', {}).get('cpu', 'none') + print(f'$ns: default memory={default_mem}, cpu={default_cpu}') +" 2>/dev/null) || lr="" + if [ -n "$lr" ]; then + lr_info="${lr_info}${lr}; " + else + lr_info="${lr_info}${ns}: no LimitRange; " + fi + done <<< "$namespaces" + + add_check "limitranges" "warn" "LimitRange defaults for OOM namespaces: ${lr_info}" +} + +# Check VPA recommendations from Goldilocks +check_vpa_recommendations() { + if $DRY_RUN; then + add_check "vpa-recommendations" "ok" "DRY RUN: would check VPA recommendations" + return + fi + + local vpa_count + vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || vpa_count=0 + + if [ "$vpa_count" -eq 0 ]; then + add_check "vpa-recommendations" "warn" "No VPA objects found — Goldilocks may not be deployed" + return + fi + + local vpa_recs + vpa_recs=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +recs = [] +for vpa in data.get('items', []): + ns = vpa['metadata']['namespace'] + name = vpa['metadata']['name'] + for cr in vpa.get('status', {}).get('recommendation', {}).get('containerRecommendations', []): + container = cr.get('containerName', 'unknown') + target_mem = cr.get('target', {}).get('memory', 'n/a') + target_cpu = cr.get('target', {}).get('cpu', 'n/a') + upper_mem = cr.get('upperBound', {}).get('memory', 'n/a') + recs.append(f'{ns}/{name}:{container} target_mem={target_mem} target_cpu={target_cpu} upper_mem={upper_mem}') +if recs: + print('; '.join(recs[:20])) +else: + print('No recommendations available yet') +" 2>/dev/null) || vpa_recs="Failed to read VPA recommendations" + + add_check "vpa-recommendations" "ok" "$vpa_recs" +} + +# Check resource requests/limits on OOMKilled pods +check_pod_resources() { + if $DRY_RUN; then + add_check "pod-resources" "ok" "DRY RUN: would check pod resource specs" + return + fi + + local resources + resources=$($KUBECTL get pods --all-namespaces -o json | python3 -c " +import sys, json +data = json.load(sys.stdin) +results = [] +for pod in data.get('items', []): + ns = pod['metadata']['namespace'] + name = pod['metadata']['name'] + has_oom = False + for cs in pod.get('status', {}).get('containerStatuses', []) + pod.get('status', {}).get('initContainerStatuses', []): + for state in [cs.get('lastState', {}).get('terminated', {}), cs.get('state', {}).get('terminated', {})]: + if state.get('reason') == 'OOMKilled': + has_oom = True + break + if has_oom: + for c in pod.get('spec', {}).get('containers', []) + pod.get('spec', {}).get('initContainers', []): + req_mem = c.get('resources', {}).get('requests', {}).get('memory', 'none') + lim_mem = c.get('resources', {}).get('limits', {}).get('memory', 'none') + req_cpu = c.get('resources', 
{}).get('requests', {}).get('cpu', 'none') + lim_cpu = c.get('resources', {}).get('limits', {}).get('cpu', 'none') + results.append(f\"{ns}/{name}:{c['name']} req_mem={req_mem} lim_mem={lim_mem} req_cpu={req_cpu} lim_cpu={lim_cpu}\") +if results: + print('; '.join(results)) +else: + print('No OOMKilled pods to inspect') +" 2>/dev/null) || resources="Failed to check pod resources" + + if echo "$resources" | grep -q "No OOMKilled"; then + add_check "pod-resources" "ok" "$resources" + else + add_check "pod-resources" "warn" "$resources" + fi +} + +# Run all checks +find_oomkilled +check_limitranges +check_vpa_recommendations +check_pod_resources + +# Determine overall status +OVERALL=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +statuses = [c['status'] for c in checks] +if 'fail' in statuses: + print('fail') +elif 'warn' in statuses: + print('warn') +else: + print('ok') +") + +echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool diff --git a/.claude/scripts/platform-status.sh b/.claude/scripts/platform-status.sh new file mode 100755 index 00000000..dd0f2dee --- /dev/null +++ b/.claude/scripts/platform-status.sh @@ -0,0 +1,260 @@ +#!/usr/bin/env bash +set -euo pipefail + +AGENT="platform-status" +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +PROXMOX_HOST="root@192.168.1.127" +REGISTRY_HOST="10.0.20.10" +DRY_RUN=false + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +check_traefik() { + if $DRY_RUN; then + add_check "traefik" "ok" "dry-run: would check Traefik status" + return + fi + + # Discover Traefik pods via labels + local traefik_pod + traefik_pod=$($KUBECTL get pods -n traefik -l app.kubernetes.io/name=traefik -o name 2>/dev/null | head -1) + if [ -z "$traefik_pod" ]; then + traefik_pod=$($KUBECTL get pods -n traefik -l app=traefik -o name 2>/dev/null | head -1) + fi + + if [ -z "$traefik_pod" ]; then + add_check "traefik" "fail" "No Traefik pods found in traefik namespace" + return + fi + + local phase + phase=$($KUBECTL get "$traefik_pod" -n traefik -o jsonpath='{.status.phase}' 2>/dev/null) + if [ "$phase" = "Running" ]; then + # Check IngressRoute count + local ir_count + ir_count=$($KUBECTL get ingressroute --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') + add_check "traefik" "ok" "Traefik running, $ir_count IngressRoutes configured" + else + add_check "traefik" "fail" "Traefik pod phase: $phase" + fi + + # Check for IngressRoutes with errors (TLS or service issues) + local ir_errors + ir_errors=$($KUBECTL get events --all-namespaces --field-selector reason=IngressRouteError --no-headers 2>/dev/null | wc -l | tr -d ' ') + if [ "$ir_errors" -gt 0 ]; then + add_check "traefik-ingressroutes" "warn" "$ir_errors IngressRoute error events found" + fi +} + +check_kyverno() { + if $DRY_RUN; then + add_check "kyverno" "ok" "dry-run: would check Kyverno status" + return + fi + + # Discover Kyverno pods via labels + local kyverno_pods + kyverno_pods=$($KUBECTL get pods -n kyverno -l app.kubernetes.io/name=kyverno -o name 2>/dev/null) + if [ -z "$kyverno_pods" ]; then + kyverno_pods=$($KUBECTL get pods -n kyverno -l app=kyverno -o name 2>/dev/null) + fi + + if [ -z "$kyverno_pods" ]; then + add_check "kyverno" "warn" "No Kyverno pods found" + return + fi + + local total=0 
ready=0 + while IFS= read -r pod; do + [ -z "$pod" ] && continue + total=$((total + 1)) + local phase + phase=$($KUBECTL get "$pod" -n kyverno -o jsonpath='{.status.phase}' 2>/dev/null) + [ "$phase" = "Running" ] && ready=$((ready + 1)) + done <<< "$kyverno_pods" + + if [ "$ready" -eq "$total" ]; then + # Check policy count + local policy_count + policy_count=$($KUBECTL get clusterpolicy --no-headers 2>/dev/null | wc -l | tr -d ' ') + add_check "kyverno" "ok" "$ready/$total Kyverno pods running, $policy_count ClusterPolicies" + else + add_check "kyverno" "warn" "$ready/$total Kyverno pods running" + fi + + # Check for policy violations + local violations + violations=$($KUBECTL get policyreport --all-namespaces -o json 2>/dev/null | \ + python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + fail_count = sum(r.get('summary',{}).get('fail',0) for r in data.get('items',[])) + print(fail_count) +except: print('0') +" 2>/dev/null || echo "0") + + if [ "$violations" -gt 0 ]; then + add_check "kyverno-violations" "warn" "$violations policy violations across namespaces" + fi +} + +check_vpa_goldilocks() { + if $DRY_RUN; then + add_check "vpa-goldilocks" "ok" "dry-run: would check VPA/Goldilocks status" + return + fi + + # Check VPA admission controller + local vpa_pods + vpa_pods=$($KUBECTL get pods -n goldilocks -l app.kubernetes.io/name=goldilocks -o name 2>/dev/null) + if [ -z "$vpa_pods" ]; then + vpa_pods=$($KUBECTL get pods -n goldilocks -o name 2>/dev/null) + fi + + if [ -z "$vpa_pods" ]; then + add_check "vpa-goldilocks" "warn" "No Goldilocks pods found" + return + fi + + local total=0 ready=0 + while IFS= read -r pod; do + [ -z "$pod" ] && continue + total=$((total + 1)) + local phase + phase=$($KUBECTL get "$pod" -n goldilocks -o jsonpath='{.status.phase}' 2>/dev/null) + [ "$phase" = "Running" ] && ready=$((ready + 1)) + done <<< "$vpa_pods" + + if [ "$ready" -eq "$total" ]; then + local vpa_count + vpa_count=$($KUBECTL get vpa --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') + add_check "vpa-goldilocks" "ok" "$ready/$total Goldilocks pods running, $vpa_count VPAs configured" + else + add_check "vpa-goldilocks" "warn" "$ready/$total Goldilocks pods running" + fi + + # Check for VPAs with unexpected updateMode + local auto_vpas + auto_vpas=$($KUBECTL get vpa --all-namespaces -o json 2>/dev/null | \ + python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + auto = [i['metadata']['name'] for i in data.get('items',[]) if i.get('spec',{}).get('updatePolicy',{}).get('updateMode','') == 'Auto'] + print(len(auto)) +except: print('0') +" 2>/dev/null || echo "0") + + if [ "$auto_vpas" -gt 0 ]; then + add_check "vpa-auto-mode" "warn" "$auto_vpas VPAs set to Auto updateMode (may cause unexpected restarts)" + fi +} + +check_pull_through_cache() { + if $DRY_RUN; then + add_check "pull-through-cache" "ok" "dry-run: would check pull-through cache at $REGISTRY_HOST" + return + fi + + if timeout 5 curl -sf "http://${REGISTRY_HOST}:5000/v2/" &>/dev/null; then + add_check "pull-through-cache" "ok" "Pull-through cache registry at $REGISTRY_HOST:5000 is healthy" + elif timeout 5 curl -sf "https://${REGISTRY_HOST}/v2/" &>/dev/null; then + add_check "pull-through-cache" "ok" "Pull-through cache registry at $REGISTRY_HOST is healthy (HTTPS)" + else + add_check "pull-through-cache" "fail" "Pull-through cache registry at $REGISTRY_HOST is unreachable" + fi +} + +check_proxmox() { + if $DRY_RUN; then + add_check "proxmox" "ok" "dry-run: would check Proxmox host resources" + 
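+        # The live path below derives the 1-minute load by parsing `uptime`; on
+        # a Linux host like Proxmox the same figure can be read directly from
+        # /proc/loadavg (equivalent sketch, same $PROXMOX_HOST as above):
+        #   ssh root@192.168.1.127 "cut -d' ' -f1 /proc/loadavg"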
+        return
+    fi
+
+    local cpu_load
+    if cpu_load=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
+        "uptime | awk -F'load average:' '{print \$2}' | awk -F, '{print \$1}' | tr -d ' '" 2>/dev/null); then
+        local cpu_count
+        cpu_count=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
+            "nproc" 2>/dev/null || echo "1")
+
+        # Check memory
+        local mem_info
+        mem_info=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$PROXMOX_HOST" \
+            "free -m | awk '/Mem:/{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}'" 2>/dev/null || echo "unknown")
+
+        add_check "proxmox" "ok" "Proxmox host: load=$cpu_load (${cpu_count} cores), mem=$mem_info"
+    else
+        add_check "proxmox" "fail" "Could not reach Proxmox host via SSH"
+    fi
+}
+
+check_metallb() {
+    if $DRY_RUN; then
+        add_check "metallb" "ok" "dry-run: would check MetalLB status"
+        return
+    fi
+
+    local metallb_pods
+    metallb_pods=$($KUBECTL get pods -n metallb-system -l app.kubernetes.io/name=metallb -o name 2>/dev/null)
+    if [ -z "$metallb_pods" ]; then
+        metallb_pods=$($KUBECTL get pods -n metallb-system -o name 2>/dev/null)
+    fi
+
+    if [ -z "$metallb_pods" ]; then
+        add_check "metallb" "warn" "No MetalLB pods found"
+        return
+    fi
+
+    local total=0 ready=0
+    while IFS= read -r pod; do
+        [ -z "$pod" ] && continue
+        total=$((total + 1))
+        local phase
+        phase=$($KUBECTL get "$pod" -n metallb-system -o jsonpath='{.status.phase}' 2>/dev/null)
+        [ "$phase" = "Running" ] && ready=$((ready + 1))
+    done <<< "$metallb_pods"
+
+    if [ "$ready" -eq "$total" ]; then
+        add_check "metallb" "ok" "$ready/$total MetalLB pods running"
+    else
+        add_check "metallb" "warn" "$ready/$total MetalLB pods running"
+    fi
+}
+
+# Run checks
+check_traefik
+check_kyverno
+check_vpa_goldilocks
+check_pull_through_cache
+check_proxmox
+check_metallb
+
+# Determine overall status
+overall="ok"
+for c in "${checks[@]}"; do
+    if echo "$c" | grep -q '"status": "fail"'; then
+        overall="fail"
+        break
+    elif echo "$c" | grep -q '"status": "warn"'; then
+        overall="warn"
+    fi
+done
+
+# Output JSON
+checks_json=$(IFS=,; echo "${checks[*]}")
+cat <<EOF
+{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
+EOF
diff --git a/.claude/scripts/resource-report.sh b/.claude/scripts/resource-report.sh
new file mode 100755
--- /dev/null
+++ b/.claude/scripts/resource-report.sh
@@ -0,0 +1,190 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+AGENT="resource-report"
+KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
+DRY_RUN=false
+
+for arg in "$@"; do
+    case "$arg" in
+        --dry-run) DRY_RUN=true ;;
+    esac
+done
+
+# Checks accumulate as a JSON array in $CHECKS
+CHECKS="[]"
+
+add_check() {
+    local name="$1" status="$2" message="$3"
+    CHECKS=$(echo "$CHECKS" | python3 -c "
+import sys, json
+checks = json.load(sys.stdin)
+checks.append({'name': sys.argv[1], 'status': sys.argv[2], 'message': sys.argv[3]})
+print(json.dumps(checks))
+" "$name" "$status" "$message")
+}
+
+# Node allocatable capacity vs scheduled requests/limits
+check_node_capacity() {
+    if $DRY_RUN; then
+        add_check "node-capacity" "ok" "DRY RUN: would report node capacity and scheduled usage"
+        return
+    fi
+
+    local report
+    report=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+parts = []
+for node in data.get('items', []):
+    name = node['metadata']['name']
+    alloc = node.get('status', {}).get('allocatable', {})
+    parts.append(f\"{name}: cpu={alloc.get('cpu', '?')} mem={alloc.get('memory', '?')}\")
+print('; '.join(parts))
+" 2>/dev/null) || report="Failed to get node capacity"
+
+    # Get requests/limits per node
+    local usage
+    usage=$($KUBECTL get pods --all-namespaces -o json | python3 -c "
+import sys, json
+
+def parse_cpu(val):
+    if not val: return 0
+    if val.endswith('m'):
+        return int(val[:-1])
+    return int(float(val) * 1000)
+
+def parse_mem(val):
+    if not val: return 0
+    units = {'Ki': 1024, 'Mi': 1024**2, 'Gi': 1024**3, 'Ti': 1024**4}
+    for suffix, mult in units.items():
+        if val.endswith(suffix):
+            return int(float(val[:-len(suffix)]) * mult)
+    return int(val)
+
+def fmt_mem(b):
+    return f'{b / (1024**3):.1f}Gi'
+
+def fmt_cpu(m):
+    return f'{m}m'
+
+data = json.load(sys.stdin)
+per_node = {}
+for pod in data.get('items', []):
+    phase = pod.get('status', {}).get('phase', '')
+    if phase not in ('Running', 'Pending'):
+        continue
+    node = pod.get('spec', {}).get('nodeName', 'unscheduled')
+    if node not in per_node:
+        per_node[node] = {'cpu_req': 0, 'cpu_lim': 0, 'mem_req': 0, 'mem_lim': 0}
+    for c in pod.get('spec', {}).get('containers', []) + pod.get('spec', {}).get('initContainers', []):
+        res = c.get('resources', {})
+        per_node[node]['cpu_req'] += parse_cpu(res.get('requests', {}).get('cpu', ''))
+        per_node[node]['cpu_lim'] += parse_cpu(res.get('limits', {}).get('cpu', ''))
+        per_node[node]['mem_req'] += parse_mem(res.get('requests', {}).get('memory', ''))
+        per_node[node]['mem_lim'] += 
parse_mem(res.get('limits', {}).get('memory', '')) + +for node in sorted(per_node.keys()): + n = per_node[node] + print(f\"{node}: cpu_req={fmt_cpu(n['cpu_req'])} cpu_lim={fmt_cpu(n['cpu_lim'])} mem_req={fmt_mem(n['mem_req'])} mem_lim={fmt_mem(n['mem_lim'])}\") +" 2>/dev/null) || usage="Failed to get pod resource usage" + + add_check "node-capacity" "ok" "Allocatable: ${report} | Usage: ${usage}" +} + +# Per-namespace ResourceQuota usage +check_resource_quotas() { + if $DRY_RUN; then + add_check "resource-quotas" "ok" "DRY RUN: would check ResourceQuota usage per namespace" + return + fi + + local quota_count + quota_count=$($KUBECTL get resourcequota --all-namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') || quota_count=0 + + if [ "$quota_count" -eq 0 ]; then + add_check "resource-quotas" "ok" "No ResourceQuotas defined in the cluster" + return + fi + + local quota_report + quota_report=$($KUBECTL get resourcequota --all-namespaces -o json 2>/dev/null | python3 -c " +import sys, json +data = json.load(sys.stdin) +results = [] +for rq in data.get('items', []): + ns = rq['metadata']['namespace'] + name = rq['metadata']['name'] + hard = rq.get('status', {}).get('hard', {}) + used = rq.get('status', {}).get('used', {}) + for resource in hard: + h = hard[resource] + u = used.get(resource, '0') + results.append(f'{ns}/{name}: {resource} used={u} hard={h}') +if results: + print('; '.join(results[:30])) +else: + print('No quota usage data') +" 2>/dev/null) || quota_report="Failed to read ResourceQuotas" + + add_check "resource-quotas" "ok" "$quota_report" +} + +# Top pods by memory usage +check_top_consumers() { + if $DRY_RUN; then + add_check "top-consumers" "ok" "DRY RUN: would report top memory-consuming pods" + return + fi + + local top_pods + top_pods=$($KUBECTL top pods --all-namespaces --no-headers 2>/dev/null | sort -k4 -h -r | head -10 | awk '{print $1"/"$2": cpu="$3" mem="$4}' | tr '\n' '; ') || top_pods="metrics-server may not be available" + + if [ -z "$top_pods" ]; then + add_check "top-consumers" "warn" "kubectl top returned no data — metrics-server may not be running" + else + add_check "top-consumers" "ok" "Top 10 by memory: ${top_pods}" + fi +} + +# Run all checks +check_node_capacity +check_resource_quotas +check_top_consumers + +# Determine overall status +OVERALL=$(echo "$CHECKS" | python3 -c " +import sys, json +checks = json.load(sys.stdin) +statuses = [c['status'] for c in checks] +if 'fail' in statuses: + print('fail') +elif 'warn' in statuses: + print('warn') +else: + print('ok') +") + +echo "{\"status\": \"$OVERALL\", \"agent\": \"$AGENT\", \"checks\": $CHECKS}" | python3 -m json.tool diff --git a/.claude/scripts/tls-check.sh b/.claude/scripts/tls-check.sh new file mode 100755 index 00000000..e81c49a7 --- /dev/null +++ b/.claude/scripts/tls-check.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -euo pipefail + +KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config" +AGENT="tls-check" +DRY_RUN=false +WARN_DAYS=14 + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +check_tls_secrets() { + if $DRY_RUN; then + add_check "tls-secrets" "ok" "dry-run: would scan all kubernetes.io/tls secrets for expiry" + return + fi + + local secrets_json + secrets_json=$($KUBECTL get secrets -A -o json 2>/dev/null) || { + add_check "tls-secrets" "fail" "Failed to list secrets" + 
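+        # Listing secrets can fail on a stale kubeconfig or missing RBAC; a
+        # quick manual probe (hedged example, same kubeconfig as $KUBECTL):
+        #   kubectl --kubeconfig /Users/viktorbarzin/code/infra/config auth can-i list secrets -A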
return + } + + local tls_secrets + tls_secrets=$(echo "$secrets_json" | jq -r '.items[] | select(.type=="kubernetes.io/tls") | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null) || { + add_check "tls-secrets" "fail" "Failed to parse secrets JSON" + return + } + + if [ -z "$tls_secrets" ]; then + add_check "tls-secrets" "warn" "No TLS secrets found" + return + fi + + local total=0 expiring=0 expired=0 healthy=0 errors=0 + local now_epoch + now_epoch=$(date +%s) + local warn_epoch=$((now_epoch + WARN_DAYS * 86400)) + local expiring_list="" + + while IFS= read -r secret; do + total=$((total + 1)) + local ns="${secret%%/*}" + local name="${secret##*/}" + + local cert_pem + cert_pem=$($KUBECTL get secret "$name" -n "$ns" -o jsonpath='{.data.tls\.crt}' 2>/dev/null | base64 -d 2>/dev/null) || { + errors=$((errors + 1)) + continue + } + + local expiry_str + expiry_str=$(echo "$cert_pem" | openssl x509 -noout -enddate 2>/dev/null | sed 's/notAfter=//') || { + errors=$((errors + 1)) + continue + } + + local expiry_epoch + expiry_epoch=$(date -j -f "%b %d %T %Y %Z" "$expiry_str" +%s 2>/dev/null || date -d "$expiry_str" +%s 2>/dev/null) || { + errors=$((errors + 1)) + continue + } + + if [ "$expiry_epoch" -lt "$now_epoch" ]; then + expired=$((expired + 1)) + expiring_list="${expiring_list}EXPIRED: ${ns}/${name}; " + elif [ "$expiry_epoch" -lt "$warn_epoch" ]; then + local days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + expiring=$((expiring + 1)) + expiring_list="${expiring_list}${days_left}d: ${ns}/${name}; " + else + healthy=$((healthy + 1)) + fi + done <<< "$tls_secrets" + + if [ "$expired" -gt 0 ]; then + add_check "tls-secrets" "fail" "${expired} expired, ${expiring} expiring soon, ${healthy} healthy out of ${total} certs. ${expiring_list}" + elif [ "$expiring" -gt 0 ]; then + add_check "tls-secrets" "warn" "${expiring} expiring within ${WARN_DAYS}d, ${healthy} healthy out of ${total} certs. ${expiring_list}" + else + add_check "tls-secrets" "ok" "All ${healthy} TLS certs healthy (${errors} decode errors skipped)" + fi +} + +check_cert_manager() { + if $DRY_RUN; then + add_check "cert-manager" "ok" "dry-run: would check cert-manager pod health and certificate CRDs" + return + fi + + local cm_pods + cm_pods=$($KUBECTL get pods -n cert-manager -l app.kubernetes.io/instance=cert-manager --no-headers 2>/dev/null) || { + add_check "cert-manager" "fail" "Failed to query cert-manager pods" + return + } + + local not_running + not_running=$(echo "$cm_pods" | grep -v "Running" | grep -v "Completed" | grep -c "." 2>/dev/null || echo "0") + + if [ "$not_running" -gt 0 ]; then + add_check "cert-manager" "fail" "${not_running} cert-manager pod(s) not running" + return + fi + + # Check for failed certificates + local failed_certs + failed_certs=$($KUBECTL get certificates -A -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? 
| select(.type=="Ready" and .status=="False")) | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null) || { + add_check "cert-manager" "warn" "Could not query certificate CRDs" + return + } + + if [ -n "$failed_certs" ]; then + local count + count=$(echo "$failed_certs" | wc -l | tr -d ' ') + add_check "cert-manager" "warn" "${count} certificate(s) not ready: $(echo "$failed_certs" | head -5 | tr '\n' ', ')" + else + add_check "cert-manager" "ok" "cert-manager healthy, all certificates ready" + fi +} + +check_tls_secrets +check_cert_manager + +# Output JSON +overall="ok" +for c in "${checks[@]}"; do + s=$(echo "$c" | jq -r '.status') + if [ "$s" = "fail" ]; then overall="fail"; break; fi + if [ "$s" = "warn" ]; then overall="warn"; fi +done + +printf '{"status": "%s", "agent": "%s", "checks": [%s]}\n' \ + "$overall" "$AGENT" "$(IFS=,; echo "${checks[*]}")" diff --git a/.claude/scripts/truenas-status.sh b/.claude/scripts/truenas-status.sh new file mode 100755 index 00000000..055fe2e7 --- /dev/null +++ b/.claude/scripts/truenas-status.sh @@ -0,0 +1,186 @@ +#!/usr/bin/env bash +set -euo pipefail + +AGENT="truenas-status" +TRUENAS_HOST="root@10.0.10.15" +DRY_RUN=false + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + esac +done + +checks=() + +add_check() { + local name="$1" status="$2" message="$3" + checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}") +} + +ssh_cmd() { + timeout 15 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$TRUENAS_HOST" "$@" 2>/dev/null +} + +check_zfs_pools() { + if $DRY_RUN; then + add_check "zfs-pools" "ok" "dry-run: would check ZFS pool status" + return + fi + + local pool_status + if ! pool_status=$(ssh_cmd "zpool status -x" 2>/dev/null); then + add_check "zfs-pools" "fail" "Could not retrieve ZFS pool status via SSH" + return + fi + + if echo "$pool_status" | grep -q "all pools are healthy"; then + add_check "zfs-pools" "ok" "All ZFS pools are healthy" + else + local degraded_pools + degraded_pools=$(echo "$pool_status" | grep "pool:" | awk '{print $2}' | tr '\n' ', ' | sed 's/,$//') + if [ -n "$degraded_pools" ]; then + add_check "zfs-pools" "fail" "Degraded ZFS pools: $degraded_pools" + else + add_check "zfs-pools" "warn" "ZFS pool status unclear: $(echo "$pool_status" | head -1 | tr '"' "'")" + fi + fi + + # Check pool capacity + local pool_list + if pool_list=$(ssh_cmd "zpool list -H -o name,cap" 2>/dev/null); then + while IFS=$'\t' read -r pool_name cap_pct; do + local cap_num + cap_num=$(echo "$cap_pct" | tr -d '%') + if [ -n "$cap_num" ] && [ "$cap_num" -ge 90 ]; then + add_check "zfs-capacity-$pool_name" "fail" "Pool $pool_name is ${cap_pct} full" + elif [ -n "$cap_num" ] && [ "$cap_num" -ge 80 ]; then + add_check "zfs-capacity-$pool_name" "warn" "Pool $pool_name is ${cap_pct} full" + else + add_check "zfs-capacity-$pool_name" "ok" "Pool $pool_name is ${cap_pct} full" + fi + done <<< "$pool_list" + fi +} + +check_smart_health() { + if $DRY_RUN; then + add_check "smart-health" "ok" "dry-run: would check SMART disk health" + return + fi + + local disk_list + if ! 
disk_list=$(ssh_cmd "smartctl --scan" 2>/dev/null); then
+        add_check "smart-health" "warn" "Could not scan disks for SMART status"
+        return
+    fi
+
+    local fail_count=0
+    local total_count=0
+    local failed_disks=""
+
+    while IFS= read -r line; do
+        local dev
+        dev=$(echo "$line" | awk '{print $1}')
+        [ -z "$dev" ] && continue
+        total_count=$((total_count + 1))
+
+        local health
+        if health=$(ssh_cmd "smartctl -H '$dev'" 2>/dev/null); then
+            if ! echo "$health" | grep -qiE "PASSED|OK"; then
+                fail_count=$((fail_count + 1))
+                failed_disks="$failed_disks $dev"
+            fi
+        fi
+    done <<< "$disk_list"
+
+    if [ "$fail_count" -gt 0 ]; then
+        add_check "smart-health" "fail" "$fail_count/$total_count disks failing SMART:$failed_disks"
+    elif [ "$total_count" -gt 0 ]; then
+        add_check "smart-health" "ok" "All $total_count disks pass SMART health checks"
+    else
+        add_check "smart-health" "warn" "No disks found for SMART check"
+    fi
+}
+
+check_replication() {
+    if $DRY_RUN; then
+        add_check "replication" "ok" "dry-run: would check replication task status"
+        return
+    fi
+
+    # Check for any running/failed replication tasks via midclt if available
+    local repl_status
+    if repl_status=$(ssh_cmd "midclt call replication.query 2>/dev/null" 2>/dev/null); then
+        local failed
+        failed=$(echo "$repl_status" | python3 -c "
+import sys, json
+try:
+    tasks = json.load(sys.stdin)
+    failed = [t.get('name','unknown') for t in tasks if t.get('state',{}).get('state','') == 'ERROR']
+    print(len(failed))
+except: print('error')
+" 2>/dev/null || echo "error")
+
+        if [ "$failed" = "error" ]; then
+            add_check "replication" "warn" "Could not parse replication task status"
+        elif [ "$failed" = "0" ]; then
+            add_check "replication" "ok" "All replication tasks healthy"
+        else
+            add_check "replication" "fail" "$failed replication tasks in ERROR state"
+        fi
+    else
+        # Fallback: check if zfs send/recv processes are stuck
+        # (pgrep needs -f to match a pattern containing a space)
+        local send_procs
+        send_procs=$(ssh_cmd "pgrep -fc 'zfs send' 2>/dev/null || echo 0")
+        add_check "replication" "warn" "midclt unavailable; $send_procs active zfs send processes"
+    fi
+}
+
+check_iscsi() {
+    if $DRY_RUN; then
+        add_check "iscsi-targets" "ok" "dry-run: would check iSCSI target status"
+        return
+    fi
+
+    local target_status
+    if target_status=$(ssh_cmd "ctladm islist 2>/dev/null || targetcli ls 2>/dev/null" 2>/dev/null); then
+        local target_count
+        # Count non-empty lines so empty output yields 0, not 1
+        target_count=$(echo "$target_status" | grep -c '.' || true)
+        if [ "$target_count" -gt 0 ]; then
+            add_check "iscsi-targets" "ok" "iSCSI service active with $target_count entries"
+        else
+            add_check "iscsi-targets" "warn" "iSCSI service active but no targets listed"
+        fi
+    else
+        # Try checking if the service is at least running
+        if ssh_cmd "midclt call iscsi.global.config" &>/dev/null; then
+            add_check "iscsi-targets" "ok" "iSCSI service is configured and running"
+        else
+            add_check "iscsi-targets" "warn" "Could not query iSCSI target status"
+        fi
+    fi
+}
+
+# Run checks
+check_zfs_pools
+check_smart_health
+check_replication
+check_iscsi
+
+# Determine overall status
+overall="ok"
+for c in "${checks[@]}"; do
+    if echo "$c" | grep -q '"status": "fail"'; then
+        overall="fail"
+        break
+    elif echo "$c" | grep -q '"status": "warn"'; then
+        overall="warn"
+    fi
+done
+
+# Output JSON
+checks_json=$(IFS=,; echo "${checks[*]}")
+cat <<EOF
+{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
+EOF
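+
+# Like the other diagnostic scripts above, truenas-status.sh emits a single
+# JSON object on stdout; to surface only the failing checks from any of them
+# (hypothetical local invocation):
+#   bash .claude/scripts/truenas-status.sh | jq '.checks[] | select(.status != "ok")'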