deprecate TrueNAS: migrate Immich NFS to Proxmox, remove all 10.0.10.15 references [ci skip]

- Migrate Immich (8 NFS PVs, 1.1TB) from TrueNAS to Proxmox host NFS
- Update config.tfvars nfs_server to 192.168.1.127 (Proxmox)
- Update nfs-csi StorageClass share to /srv/nfs
- Update scripts (weekly-backup, cluster-healthcheck) to Proxmox IP
- Delete obsolete TrueNAS scripts (nfs_exports.sh, truenas-status.sh)
- Rewrite nfs-health.sh for Proxmox NFS monitoring
- Update Freedify nfs_music_server default to Proxmox
- Mark CloudSync monitor CronJob as deprecated
- Update Prometheus alert summaries
- Update all architecture docs, AGENTS.md, and reference docs
- Zero PVs remain on TrueNAS — VM ready for decommission

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 14:41:15 +00:00 · 2026-04-13 14:41:15 +00:00 · 38d51ab0af
commit 38d51ab0af
parent 69248eaa7b
20 changed files with 245 additions and 524 deletions
--- a/.claude/scripts/nfs-health.sh
+++ b/.claude/scripts/nfs-health.sh
@@ -3,7 +3,7 @@ set -euo pipefail

 AGENT="nfs-health"
 KUBECTL="kubectl --kubeconfig /Users/viktorbarzin/code/infra/config"
-TRUENAS_HOST="10.0.10.15"
+NFS_HOST="192.168.1.127"
 NODES=("k8s-master:10.0.20.100" "k8s-node1:10.0.20.101" "k8s-node2:10.0.20.102" "k8s-node3:10.0.20.103" "k8s-node4:10.0.20.104")
 SSH_USER="wizard"
 DRY_RUN=false
@@ -21,33 +21,61 @@ add_check() {
  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
 }

-check_truenas_reachable() {
+check_nfs_reachable() {
  if $DRY_RUN; then
-    add_check "truenas-reachable" "ok" "dry-run: would ping $TRUENAS_HOST"
+    add_check "nfs-reachable" "ok" "dry-run: would ping $NFS_HOST"
    return
  fi
-  if timeout 5 ping -c 1 "$TRUENAS_HOST" &>/dev/null; then
-    add_check "truenas-reachable" "ok" "TrueNAS at $TRUENAS_HOST is reachable"
+  if timeout 5 ping -c 1 "$NFS_HOST" &>/dev/null; then
+    add_check "nfs-reachable" "ok" "Proxmox NFS at $NFS_HOST is reachable"
  else
-    add_check "truenas-reachable" "fail" "TrueNAS at $TRUENAS_HOST is unreachable"
+    add_check "nfs-reachable" "fail" "Proxmox NFS at $NFS_HOST is unreachable"
  fi
 }

-check_truenas_nfs_service() {
+check_nfs_exports() {
  if $DRY_RUN; then
-    add_check "truenas-nfs-service" "ok" "dry-run: would check NFS service on TrueNAS"
+    add_check "nfs-exports" "ok" "dry-run: would check NFS exports on Proxmox"
    return
  fi
  local result
-  if result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$TRUENAS_HOST" \
-    "service nfs-server status 2>/dev/null || systemctl is-active nfs-server 2>/dev/null || echo 'unknown'" 2>/dev/null); then
-    if echo "$result" | grep -qiE "running|active|is running"; then
-      add_check "truenas-nfs-service" "ok" "NFS service is running on TrueNAS"
+  if result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$NFS_HOST" \
+    "exportfs -v 2>/dev/null || cat /etc/exports 2>/dev/null" 2>/dev/null); then
+    local export_count
+    export_count=$(echo "$result" | grep -c '/' || echo 0)
+    if [ "$export_count" -gt 0 ]; then
+      add_check "nfs-exports" "ok" "$export_count NFS exports active on Proxmox"
    else
-      add_check "truenas-nfs-service" "warn" "NFS service status unclear: $(echo "$result" | head -1 | tr '"' "'")"
+      add_check "nfs-exports" "warn" "No NFS exports found on Proxmox"
    fi
  else
-    add_check "truenas-nfs-service" "fail" "Could not check NFS service on TrueNAS via SSH"
+    add_check "nfs-exports" "fail" "Could not check NFS exports on Proxmox via SSH"
+  fi
+}
+
+check_nfs_disk_usage() {
+  if $DRY_RUN; then
+    add_check "nfs-disk" "ok" "dry-run: would check NFS disk usage"
+    return
+  fi
+  local result
+  if result=$(timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@$NFS_HOST" \
+    "df -h /srv/nfs /srv/nfs-ssd 2>/dev/null" 2>/dev/null); then
+    while IFS= read -r line; do
+      local mount pct
+      mount=$(echo "$line" | awk '{print $6}')
+      pct=$(echo "$line" | awk '{print $5}' | tr -d '%')
+      [ -z "$pct" ] || ! [[ "$pct" =~ ^[0-9]+$ ]] && continue
+      if [ "$pct" -ge 90 ]; then
+        add_check "nfs-disk-$mount" "fail" "$mount is ${pct}% full"
+      elif [ "$pct" -ge 80 ]; then
+        add_check "nfs-disk-$mount" "warn" "$mount is ${pct}% full"
+      else
+        add_check "nfs-disk-$mount" "ok" "$mount is ${pct}% full"
+      fi
+    done <<< "$result"
+  else
+    add_check "nfs-disk" "warn" "Could not check NFS disk usage"
  fi
 }

@@ -116,8 +144,9 @@ check_nfs_pvcs() {
 }

 # Run checks
-check_truenas_reachable
-check_truenas_nfs_service
+check_nfs_reachable
+check_nfs_exports
+check_nfs_disk_usage

 for node_entry in "${NODES[@]}"; do
  node_name="${node_entry%%:*}"
--- a/.claude/scripts/truenas-status.sh
+++ b/.claude/scripts/truenas-status.sh
@@ -1,186 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-AGENT="truenas-status"
-TRUENAS_HOST="root@10.0.10.15"
-DRY_RUN=false
-
-for arg in "$@"; do
-  case "$arg" in
-    --dry-run) DRY_RUN=true ;;
-  esac
-done
-
-checks=()
-
-add_check() {
-  local name="$1" status="$2" message="$3"
-  checks+=("{\"name\": \"$name\", \"status\": \"$status\", \"message\": \"$message\"}")
-}
-
-ssh_cmd() {
-  timeout 15 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$TRUENAS_HOST" "$@" 2>/dev/null
-}
-
-check_zfs_pools() {
-  if $DRY_RUN; then
-    add_check "zfs-pools" "ok" "dry-run: would check ZFS pool status"
-    return
-  fi
-
-  local pool_status
-  if ! pool_status=$(ssh_cmd "zpool status -x" 2>/dev/null); then
-    add_check "zfs-pools" "fail" "Could not retrieve ZFS pool status via SSH"
-    return
-  fi
-
-  if echo "$pool_status" | grep -q "all pools are healthy"; then
-    add_check "zfs-pools" "ok" "All ZFS pools are healthy"
-  else
-    local degraded_pools
-    degraded_pools=$(echo "$pool_status" | grep "pool:" | awk '{print $2}' | tr '\n' ', ' | sed 's/,$//')
-    if [ -n "$degraded_pools" ]; then
-      add_check "zfs-pools" "fail" "Degraded ZFS pools: $degraded_pools"
-    else
-      add_check "zfs-pools" "warn" "ZFS pool status unclear: $(echo "$pool_status" | head -1 | tr '"' "'")"
-    fi
-  fi
-
-  # Check pool capacity
-  local pool_list
-  if pool_list=$(ssh_cmd "zpool list -H -o name,cap" 2>/dev/null); then
-    while IFS=$'\t' read -r pool_name cap_pct; do
-      local cap_num
-      cap_num=$(echo "$cap_pct" | tr -d '%')
-      if [ -n "$cap_num" ] && [ "$cap_num" -ge 90 ]; then
-        add_check "zfs-capacity-$pool_name" "fail" "Pool $pool_name is ${cap_pct} full"
-      elif [ -n "$cap_num" ] && [ "$cap_num" -ge 80 ]; then
-        add_check "zfs-capacity-$pool_name" "warn" "Pool $pool_name is ${cap_pct} full"
-      else
-        add_check "zfs-capacity-$pool_name" "ok" "Pool $pool_name is ${cap_pct} full"
-      fi
-    done <<< "$pool_list"
-  fi
-}
-
-check_smart_health() {
-  if $DRY_RUN; then
-    add_check "smart-health" "ok" "dry-run: would check SMART disk health"
-    return
-  fi
-
-  local disk_list
-  if ! disk_list=$(ssh_cmd "smartctl --scan" 2>/dev/null); then
-    add_check "smart-health" "warn" "Could not scan disks for SMART status"
-    return
-  fi
-
-  local fail_count=0
-  local total_count=0
-  local failed_disks=""
-
-  while IFS= read -r line; do
-    local dev
-    dev=$(echo "$line" | awk '{print $1}')
-    [ -z "$dev" ] && continue
-    total_count=$((total_count + 1))
-
-    local health
-    if health=$(ssh_cmd "smartctl -H '$dev'" 2>/dev/null); then
-      if ! echo "$health" | grep -qiE "PASSED|OK"; then
-        fail_count=$((fail_count + 1))
-        failed_disks="$failed_disks $dev"
-      fi
-    fi
-  done <<< "$disk_list"
-
-  if [ "$fail_count" -gt 0 ]; then
-    add_check "smart-health" "fail" "$fail_count/$total_count disks failing SMART:$failed_disks"
-  elif [ "$total_count" -gt 0 ]; then
-    add_check "smart-health" "ok" "All $total_count disks pass SMART health checks"
-  else
-    add_check "smart-health" "warn" "No disks found for SMART check"
-  fi
-}
-
-check_replication() {
-  if $DRY_RUN; then
-    add_check "replication" "ok" "dry-run: would check replication task status"
-    return
-  fi
-
-  # Check for any running/failed replication tasks via midclt if available
-  local repl_status
-  if repl_status=$(ssh_cmd "midclt call replication.query 2>/dev/null" 2>/dev/null); then
-    local failed
-    failed=$(echo "$repl_status" | python3 -c "
-import sys, json
-try:
-    tasks = json.load(sys.stdin)
-    failed = [t.get('name','unknown') for t in tasks if t.get('state',{}).get('state','') == 'ERROR']
-    print(len(failed))
-except: print('error')
-" 2>/dev/null || echo "error")
-
-    if [ "$failed" = "error" ]; then
-      add_check "replication" "warn" "Could not parse replication task status"
-    elif [ "$failed" = "0" ]; then
-      add_check "replication" "ok" "All replication tasks healthy"
-    else
-      add_check "replication" "fail" "$failed replication tasks in ERROR state"
-    fi
-  else
-    # Fallback: check if zfs send/recv processes are stuck
-    local send_procs
-    send_procs=$(ssh_cmd "pgrep -c 'zfs send' 2>/dev/null || echo 0")
-    add_check "replication" "warn" "midclt unavailable; $send_procs active zfs send processes"
-  fi
-}
-
-check_iscsi() {
-  if $DRY_RUN; then
-    add_check "iscsi-targets" "ok" "dry-run: would check iSCSI target status"
-    return
-  fi
-
-  local target_status
-  if target_status=$(ssh_cmd "ctladm islist 2>/dev/null || targetcli ls 2>/dev/null" 2>/dev/null); then
-    local target_count
-    target_count=$(echo "$target_status" | wc -l | tr -d ' ')
-    if [ "$target_count" -gt 0 ]; then
-      add_check "iscsi-targets" "ok" "iSCSI service active with $target_count entries"
-    else
-      add_check "iscsi-targets" "warn" "iSCSI service active but no targets listed"
-    fi
-  else
-    # Try checking if the service is at least running
-    if ssh_cmd "midclt call iscsi.global.config" &>/dev/null; then
-      add_check "iscsi-targets" "ok" "iSCSI service is configured and running"
-    else
-      add_check "iscsi-targets" "warn" "Could not query iSCSI target status"
-    fi
-  fi
-}
-
-# Run checks
-check_zfs_pools
-check_smart_health
-check_replication
-check_iscsi
-
-# Determine overall status
-overall="ok"
-for c in "${checks[@]}"; do
-  if echo "$c" | grep -q '"status": "fail"'; then
-    overall="fail"
-    break
-  elif echo "$c" | grep -q '"status": "warn"'; then
-    overall="warn"
-  fi
-done
-
-# Output JSON
-checks_json=$(IFS=,; echo "${checks[*]}")
-cat <<EOF
-{"status": "$overall", "agent": "$AGENT", "checks": [$checks_json]}
-EOF