diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index 674c629e..e852f193 100755
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -158,6 +158,13 @@ Choose storage class based on workload type:
 
 **Migration note**: CSI PV `volumeAttributes` are immutable — cannot update NFS server in place. New PV/PVC pairs required (convention: append `-host` to PV name).
 
+**NFS CSI mount option requirements** (learned from [PM-2026-04-14]):
+- **ALWAYS set `nfsvers=4`** in CSI mount options. NFSv3 is disabled on the PVE host (`vers3=n` in `/etc/nfs.conf`). Without this, mounts fail silently if kernel NFS client state is corrupt.
+- **NEVER use `fsid=0`** in `/etc/exports` on `/srv/nfs`. `fsid=0` designates the NFSv4 pseudo-root, which breaks subdirectory path resolution for all CSI mounts. Only `fsid=1` (unique ID) is safe on `/srv/nfs-ssd`.
+- **`/etc/exports` is git-managed** at `infra/scripts/pve-nfs-exports`. Deploy: `scp scripts/pve-nfs-exports root@192.168.1.127:/etc/exports && ssh root@192.168.1.127 exportfs -ra`
+- **Critical services MUST NOT use NFS storage** — circular dependency risk. Alertmanager, Prometheus, and any monitoring that should alert about NFS must use `proxmox-lvm-encrypted`. Technitium DNS primary uses `proxmox-lvm-encrypted` (migrated 2026-04-14).
+- **NFS PV template** (in `modules/kubernetes/nfs_volume/`): always include `mountOptions: ["nfsvers=4", "soft", "actimeo=5", "retrans=3", "timeo=30"]`
+
 **proxmox-lvm PVC template** (Terraform):
 ```hcl
 resource "kubernetes_persistent_volume_claim" "data_proxmox" {
diff --git a/scripts/daily-backup.sh b/scripts/daily-backup.sh
index de7508b9..7813b505 100644
--- a/scripts/daily-backup.sh
+++ b/scripts/daily-backup.sh
@@ -50,6 +50,45 @@ resolve_pvc_name() {
   ' "${MAPPING_CACHE}" 2>/dev/null
 }
 
+# --- NFS Export Health Check ---
+# Verify NFS exports are healthy before starting backup.
+# Detects: missing /etc/exports, dangerous fsid=0 flag, inactive nfs-server, no active exports.
+# Added 2026-04-14 [PM-2026-04-14]: backup script accessed NFS causing stale handle
+# propagation during the fsid=0 outage. Early check prevents cascading failures.
+check_nfs_exports() {
+    local exports_file="/etc/exports"
+    local active_exports
+
+    if [ ! -f "${exports_file}" ]; then
+        log "WARN: ${exports_file} does not exist — NFS exports may be unconfigured"
+        return 1
+    fi
+
+    # Check for dangerous fsid=0 on /srv/nfs or /srv/nfs-ssd (breaks NFSv4 subdirectory path resolution)
+    if grep -qE '^/srv/nfs(-ssd)?[[:space:]].*fsid=0' "${exports_file}" 2>/dev/null; then
+        log "ERROR: /etc/exports contains fsid=0 on a /srv/nfs export — this will break all k8s NFS mounts!"
+        log "ERROR: Remove fsid=0 and run: exportfs -ra && systemctl restart nfs-server"
+        return 1
+    fi
+
+    # Verify NFS server is active
+    if ! systemctl is-active --quiet nfs-server 2>/dev/null; then
+        log "WARN: nfs-server is not running — NFS mounts will fail"
+        return 1
+    fi
+
+    # Verify exports are actually loaded in the kernel
+    # (exportfs -s lists the currently active export table)
+    active_exports=$(exportfs -s 2>/dev/null | grep -c '/srv/nfs' || true)
+    if [ "${active_exports:-0}" -eq 0 ]; then
+        log "WARN: No /srv/nfs exports active in kernel — run: exportfs -ra"
+        return 1
+    fi
+
+    log "NFS export health check passed (${active_exports} /srv/nfs export(s) active)"
+    return 0
+}
+
 # --- Main ---
 
 log "=== Weekly backup starting ==="
 
@@ -57,6 +96,12 @@
 if ! mountpoint -q "${BACKUP_ROOT}"; then
     die "${BACKUP_ROOT} is not mounted"
 fi
 
 STATUS=0
+
+# NFS export health check after STATUS init — warn but don't abort (block-storage PVC backups can proceed)
+check_nfs_exports || {
+    log "WARN: NFS export health check failed — NFS-backed PVC backups may fail"
+    STATUS=1
+}
 TOTAL_BYTES=0
diff --git a/scripts/pve-nfs-exports b/scripts/pve-nfs-exports
new file mode 100644
index 00000000..1f273b15
--- /dev/null
+++ b/scripts/pve-nfs-exports
@@ -0,0 +1,26 @@
+# /etc/exports — NFS export configuration for Proxmox VE host
+# Managed in git: infra/scripts/pve-nfs-exports
+# Deploy: scp scripts/pve-nfs-exports root@192.168.1.127:/etc/exports && ssh root@192.168.1.127 exportfs -ra
+#
+# CRITICAL NOTES (learned from 2026-04-14 outage [PM-2026-04-14]):
+# - NEVER add fsid=0 to /srv/nfs or /srv/nfs-ssd exports. fsid=0 designates the
+#   NFSv4 pseudo-root which changes path resolution for ALL subdirectory mounts.
+#   When CSI mounts use paths like /srv/nfs/technitium, fsid=0 makes them resolve
+#   as the root itself, causing ENOENT on all subdirectory mounts.
+# - fsid=1 is acceptable on /srv/nfs-ssd (unique ID, not root).
+# - The NFS CSI driver mounts subdirectories — never use fsid=0 on any export
+#   that serves dynamic path mounts.
+# - NFSv3 is disabled on this host (vers3=n in /etc/nfs.conf) — all k8s mounts
+#   must use nfsvers=4 mount option.
+#
+# Mount options explanation:
+#   rw               — read/write access (required for PVCs)
+#   async            — async writes safe: UPS protects host + Vault Raft replication +
+#                      databases on block storage. Only NFS metadata at risk.
+#   no_subtree_check — disable subtree checking for performance and reliability
+#   no_root_squash   — k8s CSI driver runs as root; squashing breaks PVC writes
+#   insecure         — allow source ports >1024 (required: pfSense VLAN NAT uses
+#                      unprivileged ports for VLAN 10 → 192.168.1.x traffic)
+#
+/srv/nfs *(rw,async,no_subtree_check,no_root_squash,insecure)
+/srv/nfs-ssd *(rw,sync,no_subtree_check,no_root_squash,insecure,fsid=1)