fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

View file

@ -0,0 +1,12 @@
# Service half of apply-mbps-caps.{service,timer}: one activation runs
# /usr/local/bin/apply-mbps-caps.sh once.
[Unit]
Description=Apply per-VM I/O caps via qm set (idempotent)
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/apply-mbps-caps.sh
# Start after pve-cluster.service and pull it in if it is not already active.
# NOTE(review): presumably because `qm` needs the cluster filesystem — confirm.
After=pve-cluster.service
Wants=pve-cluster.service
[Service]
# oneshot: the unit is "started" only once the script has run to completion.
Type=oneshot
ExecStart=/usr/local/bin/apply-mbps-caps.sh
# Both output streams go to the journal, tagged apply-mbps-caps.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=apply-mbps-caps

74
scripts/apply-mbps-caps.sh Executable file
View file

@ -0,0 +1,74 @@
#!/usr/bin/env bash
# Apply per-VM I/O caps via `qm set` on the PVE host.
#
# - Reads each target VM's current boot-disk options.
# - Appends/normalises `mbps_rd=<N>,mbps_wr=<N>`.
# - Re-applies via `qm set` (live, no reboot needed).
# - Idempotent: re-running with no drift is a no-op at the storage
# level (proxmox config rewrite is cheap).
# - Continues on per-VM failures so one missing/stopped VM doesn't
# skip the rest — designed to be safe under the systemd timer.
#
# Backed by `apply-mbps-caps.{service,timer}` (hourly + 5min-after-boot).
# Why these values: see beads code-9v2j + memory id=2726 (alloy IO storm)
# + memory id=1575 (VMs intentionally out of TF).
# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
set -uo pipefail # NOT -e — keep going if a single VM step fails.
# vmid:disk_slot:mbps_rd:mbps_wr (Linux VMs only — skipping 101 pfsense BSD, 300 Windows)
# Each entry is split on ':' by apply_one; disk_slot is matched against the
# "<slot>:" key at the start of a `qm config` line.
TARGETS=(
"102:scsi0:60:60" # devvm
"103:sata0:40:40" # home-assistant
"200:scsi0:100:60" # k8s-master (alloy storm origin — firmest clip)
"201:scsi1:150:120" # k8s-node1 (GPU + many CSI disks; boots from scsi1)
"202:scsi0:150:120" # k8s-node2
"203:scsi0:150:120" # k8s-node3
"204:scsi0:150:120" # k8s-node4
"220:scsi0:40:40" # docker-registry
)
#######################################
# Apply the bandwidth caps described by one TARGETS entry.
# Arguments: $1 - spec "vmid:disk_slot:mbps_rd:mbps_wr"
# Outputs:   progress lines on stdout
# Returns:   0 if the caps are already in place, were applied, or the VM /
#            disk slot does not exist on this host; 1 if `qm set` fails.
#######################################
apply_one() {
  local spec="$1"
  local vmid slot rd wr
  IFS=: read -r vmid slot rd wr <<<"$spec"

  # Skip non-existent VMs cleanly (e.g. node decommissioned, never rebuilt).
  if ! qm status "$vmid" >/dev/null 2>&1; then
    echo "vmid $vmid: not present on this host — skipping"
    return 0
  fi

  local current cleaned newvalue
  # Value part of the "<slot>: <options>" line, i.e. everything after the key.
  current=$(qm config "$vmid" | awk -v s="$slot:" '$1==s {sub(/^[^ ]+ /, ""); print; exit}')
  if [[ -z "$current" ]]; then
    echo "vmid $vmid: no $slot line in config — skipping"
    return 0
  fi

  # Skip the qm-set call entirely when state already matches — keeps
  # journal noise low under the hourly timer. The test is per option and
  # position-independent: the caps may appear anywhere in the option list
  # (e.g. before size=), so comparing against a string with the caps
  # appended last would never match and would re-apply on every run.
  if [[ ",${current}," == *",mbps_rd=${rd},"* && ",${current}," == *",mbps_wr=${wr},"* ]]; then
    echo "vmid $vmid: $slot already at mbps_rd=${rd},mbps_wr=${wr} — no-op"
    return 0
  fi

  # Drop any existing caps (integer or fractional) before appending ours.
  cleaned=$(sed -E 's/,mbps_(rd|wr)=[0-9.]+//g' <<<"$current")
  newvalue="${cleaned},mbps_rd=${rd},mbps_wr=${wr}"

  echo "vmid $vmid: updating $slot"
  echo " before: $current"
  echo " after: $newvalue"
  if qm set "$vmid" "--$slot" "$newvalue"; then
    echo " ok"
  else
    echo " FAILED: qm set returned non-zero"
    return 1
  fi
}
# Visit every target even if an earlier one failed; the exit status is 1 when
# at least one target failed and 0 otherwise.
rc=0
for spec in "${TARGETS[@]}"; do
  if ! apply_one "$spec"; then
    rc=1
  fi
done
exit "$rc"

View file

@ -0,0 +1,18 @@
# Timer half of apply-mbps-caps.{service,timer}.
[Unit]
Description=Re-apply per-VM I/O caps periodically + after PVE boot
[Timer]
# After every PVE host reboot — caps survive in /etc/pve/qemu-server/<vmid>.conf
# normally, but a config restore from backup can drop them (see 2026-05-26
# incident where we restored 202.conf + 203.conf from /mnt/backup/pve-config/).
OnBootSec=5min
# Hourly during normal operation — catches manual `qm set` drift or fresh
# VM clones that haven't had caps applied yet.
OnCalendar=hourly
# Run a missed OnCalendar activation once the timer is started again.
Persistent=true
# Spread each activation by a random delay of up to two minutes.
RandomizedDelaySec=2min
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""Enforce the inline-comment convention for ingress_factory auth tiers.
Every `auth = "app"` or `auth = "none"` line under a stack must have an
immediately-preceding comment block containing `# auth = "<tier>":`
that documents what gates the app (for "app") or why the endpoint is
intentionally public (for "none").
This is the static guard for the anti-exposure rule documented in
`infra/.claude/CLAUDE.md` "Auth" section. It's invoked by `scripts/tg`
before every plan/apply/destroy/refresh, so it fires regardless of who
or what is running terragrunt: local laptop, CI, or headless agent.
Stack-scoped by design: only checks the .tf files under the stack
being acted on. Other stacks' historical violations don't block work
on the current stack; each stack documents itself the next time it's
edited.
Usage:
check-ingress-auth-comments.py <stack-path> # scan one stack
check-ingress-auth-comments.py --all # scan every stack
"""
import argparse
import os
import re
import sys
# An `auth = "app"` / `auth = "none"` attribute line. A trailing `#` or `//`
# comment is tolerated so that `auth = "none"  # note` is still recognised as
# an auth line and cannot slip past the guard unchecked.
AUTH_LINE = re.compile(r'^\s*auth\s*=\s*"(app|none)"\s*(?:(?:#|//).*)?$')
# A whole-line `#` comment.
COMMENT_LINE = re.compile(r'^\s*#')
# The tier marker expected somewhere inside a documenting comment.
COMMENT_TIER = re.compile(r'auth\s*=\s*"(app|none)"')


def scan_dir(path):
    """Find undocumented `auth = "app"` / `auth = "none"` lines below `path`.

    Walks `path` recursively and inspects every `*.tf` file. An auth line is
    documented when the contiguous block of `#` comment lines directly above
    it contains `auth = "<same tier>"`.

    Args:
        path: directory to walk.

    Returns:
        A list of `(file_path, line_number, tier)` tuples, one per
        undocumented auth line; line numbers are 1-based.
    """
    violations = []
    for root, _, files in os.walk(path):
        for f in files:
            if not f.endswith('.tf'):
                continue
            full = os.path.join(root, f)
            try:
                # Decode explicitly and leniently: the locale default may not
                # be UTF-8, and one undecodable byte in a comment must not
                # crash the guard (UnicodeDecodeError is not an OSError).
                with open(full, encoding='utf-8', errors='replace') as fh:
                    lines = fh.readlines()
            except OSError:
                # Unreadable files are skipped, as before.
                continue
            for i, line in enumerate(lines):
                m = AUTH_LINE.match(line)
                if not m:
                    continue
                tier = m.group(1)
                # Walk backwards through contiguous comment lines.
                # Pass if ANY of them documents the matching tier.
                ok = False
                j = i - 1
                while j >= 0 and COMMENT_LINE.match(lines[j]):
                    cm = COMMENT_TIER.search(lines[j])
                    if cm and cm.group(1) == tier:
                        ok = True
                        break
                    j -= 1
                if not ok:
                    violations.append((full, i + 1, tier))
    return violations
def main():
    """Command-line entry point.

    Takes either one stack directory or `--all`, scans with `scan_dir`, and
    reports violations on stderr.

    Exit status: returns normally when there are no violations, exits 1 when
    at least one violation is found, exits 2 when a directory to scan does
    not exist.
    """
    # The module docstring is absent under `python -OO`; fall back to no
    # description instead of crashing on `None.splitlines()`.
    summary = (__doc__ or "").strip().splitlines()
    ap = argparse.ArgumentParser(description=summary[0] if summary else None)
    g = ap.add_mutually_exclusive_group(required=True)
    g.add_argument('path', nargs='?', help='Stack directory to scan')
    g.add_argument('--all', action='store_true', help='Scan every stack under stacks/')
    args = ap.parse_args()

    scan_paths = ['stacks'] if args.all else [args.path]
    # Validate every root, including the implicit `stacks` of --all: walking
    # a missing directory yields nothing, which would otherwise be reported
    # as "no violations" when the guard is run from the wrong directory.
    for p in scan_paths:
        if not os.path.isdir(p):
            print(f"ERROR: {p} is not a directory", file=sys.stderr)
            sys.exit(2)

    violations = []
    for p in scan_paths:
        violations.extend(scan_dir(p))
    if not violations:
        return

    print(
        "\n"
        "==============================================================\n"
        "ingress_factory auth-comment convention violated\n"
        "==============================================================\n"
        "\n"
        "Every `auth = \"app\"` or `auth = \"none\"` line must have a\n"
        "preceding comment line documenting what gates the app (for\n"
        "\"app\") or why the endpoint is intentionally public (for\n"
        "\"none\"). This guard prevents accidentally exposing private\n"
        "services. See infra/.claude/CLAUDE.md Auth section.\n"
        "\n"
        "Add a comment line directly above the auth line:\n"
        "\n"
        " # auth = \"app\": <what gates the app, e.g. NextAuth + OAuth>\n"
        " auth = \"app\"\n"
        "\n"
        "or:\n"
        "\n"
        " # auth = \"none\": <why public, e.g. webhook receiver, CalDAV>\n"
        " auth = \"none\"\n"
        "\n"
        "Violations:",
        file=sys.stderr,
    )
    for path, line_no, tier in violations:
        print(
            f" {path}:{line_no}: auth = \"{tier}\" missing preceding "
            f"`# auth = \"{tier}\":` comment",
            file=sys.stderr,
        )
    print(file=sys.stderr)
    sys.exit(1)
if __name__ == '__main__':
main()

3258
scripts/cluster_healthcheck.sh Executable file

File diff suppressed because it is too large Load diff

277
scripts/cluster_manager.py Normal file
View file

@ -0,0 +1,277 @@
import asyncio
import click
import logging
import time
from typing import List, Union, Optional
from kubernetes_asyncio import client, config
from kubernetes_asyncio.client.api_client import ApiClient
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
async def wait_for_healthy(
    api_instance: "client.AppsV1Api",
    resource_type: str,
    namespace: str,
    name: str,
    target_replicas: int,
    timeout: int = 300,
) -> None:
    """Poll a workload until it reports `target_replicas` ready replicas.

    A Deployment must also report `target_replicas` updated replicas; any
    other `resource_type` is read as a StatefulSet and only its ready count
    is checked. Status is polled every 5 seconds.

    Args:
        api_instance: apps/v1 API client used for the status reads.
        resource_type: "deployment" (case-insensitive) or anything else for
            a StatefulSet.
        namespace: namespace of the workload.
        name: name of the workload.
        target_replicas: replica count that counts as healthy.
        timeout: seconds to keep polling before giving up.

    Returns:
        None in both outcomes; a timeout is only reported through the log.
    """
    # Monotonic clock: a wall-clock step (NTP sync, manual change) must not
    # expire the timeout early or stretch it.
    deadline = time.monotonic() + timeout
    is_deployment = resource_type.lower() == "deployment"
    logger.info(
        f"Waiting for {resource_type} {name} to reach {target_replicas} replicas..."
    )
    while True:
        if time.monotonic() > deadline:
            logger.error(f"❌ Timeout reached for {resource_type} {name}")
            return
        try:
            if is_deployment:
                res = await api_instance.read_namespaced_deployment_status(
                    name, namespace
                )
                ready = res.status.ready_replicas or 0
                updated = res.status.updated_replicas or 0
                if ready == target_replicas and updated == target_replicas:
                    break
            else:  # StatefulSet
                res = await api_instance.read_namespaced_stateful_set_status(
                    name, namespace
                )
                ready = res.status.ready_replicas or 0
                if ready == target_replicas:
                    break
        except Exception as e:
            # Read errors are retried until the timeout.
            logger.debug(f"Retrying status check for {name}: {e}")
        await asyncio.sleep(5)
    logger.info(f"{resource_type} {name} is now healthy.")
async def wait_for_zero(
    api: "client.AppsV1Api", kind: str, ns: str, name: str, timeout: int
) -> tuple[str, str]:
    """Wait until a workload reports zero ready replicas.

    Polls the Deployment (or, for any other `kind`, StatefulSet) status every
    3 seconds for at most `timeout` seconds.

    Args:
        api: apps/v1 API client used for the status reads.
        kind: "deployment" (case-insensitive) or anything else for a
            StatefulSet.
        ns: namespace of the workload.
        name: name of the workload.
        timeout: seconds to keep polling.

    Returns:
        `(ns, name)` in every case, so the caller can tell which wait
        finished; a timeout is only reported through the log.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout
    is_deployment = kind.lower() == "deployment"
    while loop.time() < deadline:
        try:
            if is_deployment:
                res = await api.read_namespaced_deployment_status(name, ns)
            else:
                res = await api.read_namespaced_stateful_set_status(name, ns)
            if (res.status.ready_replicas or 0) == 0:
                return ns, name
        except Exception as e:
            # Only "not found" means the workload is gone. Any other failure
            # (timeout, 5xx, connection reset) says nothing about the pods,
            # so keep polling rather than letting the next tier proceed.
            if getattr(e, "status", None) == 404:
                return ns, name
            logger.warning(f"Status check failed for {kind} {ns}/{name}, retrying: {e}")
        await asyncio.sleep(3)
    logger.error(f"Timeout: {kind} {ns}/{name} still has running pods.")
    return ns, name
async def scale_resource(
    api_instance: client.AppsV1Api,
    resource_type: str,
    namespace: str,
    name: str,
    replicas: int,
) -> None:
    """Patch the scale of one workload to `replicas`.

    `resource_type` "deployment" (case-insensitive) patches a Deployment;
    anything else patches a StatefulSet. Failures are logged and swallowed,
    so the caller is not told whether the patch was applied.
    """
    patch = {"spec": {"replicas": replicas}}
    targets_deployment = resource_type.lower() == "deployment"
    try:
        if not targets_deployment:
            await api_instance.patch_namespaced_stateful_set_scale(
                name, namespace, patch
            )
        else:
            await api_instance.patch_namespaced_deployment_scale(
                name, namespace, patch
            )
    except Exception as e:
        logger.error(f"Failed to scale {resource_type} {name}: {e}")
async def run_stop_tier(
    api_v1: client.AppsV1Api, label: str, output_file: str, timeout: int
) -> None:
    """Shut down every Deployment and StatefulSet matching one label selector.

    For each match outside the kube-* system namespaces: append
    "Kind Namespace Name Replicas" to `output_file`, scale it to 0, then wait
    (up to `timeout` seconds each) for all of them to report zero ready
    replicas before returning.
    """
    skip_namespaces = ["kube-system", "kube-public", "kube-node-lease"]

    # 1. Discover
    listers = [
        ("Deployment", api_v1.list_deployment_for_all_namespaces),
        ("StatefulSet", api_v1.list_stateful_set_for_all_namespaces),
    ]
    discovered = []
    for kind, lister in listers:
        listing = await lister(label_selector=label)
        for obj in listing.items:
            if obj.metadata.namespace in skip_namespaces:
                continue
            discovered.append((kind, obj))
    if not discovered:
        logger.warning(f"No resources found for label: {label}")
        return

    # 2. Save & Scale
    pending: set[tuple[str, str]] = set()
    waiters = []
    # Append mode: earlier tiers already wrote their lines to this file.
    with open(output_file, "a") as snapshot:
        for kind, obj in discovered:
            ns = obj.metadata.namespace
            name = obj.metadata.name
            saved = obj.spec.replicas or 0
            snapshot.write(f"{kind} {ns} {name} {saved}\n")
            pending.add((ns, name))
            await scale_resource(api_v1, kind, ns, name, 0)
            waiters.append(wait_for_zero(api_v1, kind, ns, name, timeout))

    # 3. Wait for this tier to finish before moving to next
    logger.info(f"Tier [{label}]: Waiting for {len(pending)} resources to stop...")
    for finished in asyncio.as_completed(waiters):
        done_ns, done_name = await finished
        pending.discard((done_ns, done_name))
        if not pending:
            continue
        waiting_ns = sorted({ns for ns, name in pending})
        logger.info(
            f"[{label}] Pending: {len(pending)} | Namespaces: {', '.join(waiting_ns)}"
        )
    logger.info(f"✅ Tier [{label}] successfully shut down.")
# Root command group; `stop` and `start` register themselves on it.
# Deliberately no docstring: click would print it as the group's help text.
@click.group()
def cli():
    return None
@cli.command()
@click.argument("labels", nargs=-1, required=True)
@click.option("--output", "-o", default="resources.txt", help="Output state file")
@click.option("--timeout", "-t", default=3600)
def stop(labels: List[str], output: str, timeout: int):
    """Stop tiers sequentially. Usage: stop 'app=web' 'app=db'"""

    async def _shutdown():
        await config.load_kube_config()
        # Start from an empty state file; each tier then appends to it.
        # NOTE(review): this also discards a state file left by an earlier
        # `stop` run — confirm that is intended.
        with open(output, "w"):
            pass
        async with ApiClient() as api_client:
            apps = client.AppsV1Api(api_client)
            for tier_label in labels:
                logger.info(f"🚀 Processing Shutdown Tier: {tier_label}")
                await run_stop_tier(apps, tier_label, output, timeout)
        logger.info("🏁 Sequence complete. Cluster is gracefully stopped.")

    asyncio.run(_shutdown())
# `start <label>...`: restore the tiers in the order given, using the replica
# counts recorded in the state file. Kept docstring-free so the command's
# --help output is unchanged.
@cli.command()
@click.argument("labels", nargs=-1, required=True)
@click.option("--file", "-f", default="resources.txt")
@click.option("--timeout", "-t", default=3600, help="Seconds to wait per resource")
def start(labels: List[str], file: str, timeout: int):
    restore = run_start_sequence(labels, file, timeout)
    asyncio.run(restore)
async def run_start_sequence(labels: List[str], file_path: str, timeout: int) -> None:
    """Restore tiers, one label at a time, to the replica counts in `file_path`.

    The state file holds one "Kind Namespace Name Replicas" line per
    workload. For each label, the workloads currently carrying that label are
    looked up in the cluster and only those that also appear in the file are
    scaled back up and awaited. A missing file is logged and ends the run.
    """
    await config.load_kube_config()
    async with ApiClient() as api_client:
        apps_v1 = client.AppsV1Api(api_client)

        # 1. Read the whole snapshot up front; blank lines are ignored.
        try:
            with open(file_path, "r") as snapshot:
                # Format: Kind Namespace Name Replicas
                rows = [raw.strip().split() for raw in snapshot if raw.strip()]
        except FileNotFoundError:
            logger.error(f"Snapshot file {file_path} not found.")
            return

        # 2. Tiers are handled strictly in the order the labels were given.
        for label in labels:
            logger.info(f"🚀 Starting Tier: {label}")
            live = await get_resources_by_label(apps_v1, label)
            live_keys = {(entry["ns"], entry["name"]) for entry in live}
            # Keep only snapshot rows whose (namespace, name) is in this tier,
            # so each workload returns to its recorded replica count.
            to_restore = [
                (kind, ns, name, int(reps))
                for kind, ns, name, reps in rows
                if (ns, name) in live_keys
            ]
            if not to_restore:
                logger.warning(f"No resources found in snapshot for tier: {label}")
                continue

            # 3. Scale and wait for this tier before touching the next one.
            await process_start_tier(apps_v1, to_restore, timeout, label)
    logger.info("🏁 All tiers started successfully.")
async def get_resources_by_label(api: client.AppsV1Api, label: str) -> List[dict]:
    """Return `{"ns": ..., "name": ...}` for every Deployment and StatefulSet,
    in any namespace, that matches the label selector `label`.

    Deployments come first, then StatefulSets.
    """
    listers = (
        api.list_deployment_for_all_namespaces,
        api.list_stateful_set_for_all_namespaces,
    )
    matches = []
    for lister in listers:
        listing = await lister(label_selector=label)
        matches.extend(
            {"ns": obj.metadata.namespace, "name": obj.metadata.name}
            for obj in listing.items
        )
    return matches
async def process_start_tier(
    api: client.AppsV1Api, resources: list, timeout: int, label: str
):
    """Scale one tier up and wait for every member to become healthy.

    `resources` is a list of `(kind, namespace, name, replicas)` tuples. All
    scale patches are issued together; the health waits then run concurrently
    and progress is logged as each one finishes.
    """

    async def _await_health(kind, ns, name, target, t_out):
        # Returns the key so the progress loop knows which wait completed.
        await wait_for_healthy(api, kind, ns, name, target, t_out)
        return (ns, name)

    pending = {(ns, name) for _kind, ns, name, _reps in resources}
    scalers = [
        scale_resource(api, kind, ns, name, reps)
        for kind, ns, name, reps in resources
    ]
    watchers = [
        _await_health(kind, ns, name, reps, timeout)
        for kind, ns, name, reps in resources
    ]

    # Trigger all scales for this tier
    await asyncio.gather(*scalers)

    # Monitor health
    for finished in asyncio.as_completed(watchers):
        done_ns, done_name = await finished
        pending.discard((done_ns, done_name))
        if not pending:
            continue
        waiting_ns = sorted({ns for ns, name in pending})
        logger.info(
            f"[{label}] Pending Health: {len(pending)} | Namespaces: {', '.join(waiting_ns)}"
        )
    logger.info(f"✅ Tier [{label}] is healthy.")
if __name__ == "__main__":
cli()

View file

@ -0,0 +1,14 @@
[Unit]
Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
# After= only orders this unit relative to network-online.target; Wants= is
# what pulls the target in. Without it the ordering has no effect unless some
# other unit happens to request the target.
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/daily-backup
StandardOutput=journal
StandardError=journal
SyslogIdentifier=daily-backup
# 4h budget — the snapshot mount + LUKS decrypt + rsync + sqlite scan loop
# scales with the number of PVCs (118 today). Hit the 1h ceiling around week
# 18 of 2026 and silently SIGTERM'd for 10 days. Bumped to 4h with margin.
TimeoutStartSec=14400

424
scripts/daily-backup.sh Normal file
View file

@ -0,0 +1,424 @@
#!/usr/bin/env bash
# daily-backup — 3-2-1 backup: PVC file copy + SQLite + pfsense + PVE config to sda
# Deploy to PVE host at /usr/local/bin/daily-backup
# Schedule: Daily 05:00 via systemd timer
# Abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail
# --- Configuration ---
# Backup disk mount point; every artifact is written below it.
BACKUP_ROOT="/mnt/backup"
# Single mountpoint onto which each PVC snapshot is mounted in turn.
PVC_MOUNT="/tmp/pvc-mount"
# Pushgateway base URL; override with DAILY_BACKUP_PUSHGATEWAY.
PUSHGATEWAY="${DAILY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}"
PUSHGATEWAY_JOB="daily-backup"
# Holds the PID of the running instance; created with noclobber below.
LOCKFILE="/run/daily-backup.lock"
# List of backup-relative paths written by this run.
MANIFEST="${BACKUP_ROOT}/.changed-files"
# Last successful `kubectl get pv -o json`, used to map LV names to PVCs.
MAPPING_CACHE="${BACKUP_ROOT}/.lv-pvc-mapping.json"
# Honour a caller-provided KUBECONFIG, else use root's default.
KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
export KUBECONFIG
# --- Logging ---
# log:  timestamped line on stdout.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
# warn: timestamped "WARN:" line on stderr.
warn() { log "WARN: $*" >&2; }
# die:  timestamped "FATAL:" line on stderr, push a failure metric, exit 1.
# The first caller (lock contention) runs before push_metrics is defined
# further down the file; calling it unguarded there fails with "command not
# found" and, under `set -e`, exits 127 instead of 1.
die() {
  log "FATAL: $*" >&2
  if declare -F push_metrics >/dev/null; then
    push_metrics 1 0
  fi
  exit 1
}
# --- Manifest append helper ---
# daily-backup and nfs-mirror both append to /mnt/backup/.changed-files, and
# their runs can overlap (e.g. nfs-mirror Mon 04:11 still going when
# daily-backup starts Mon 05:00), which would let appends interleave mid-line.
# Every append therefore holds an exclusive flock on a sibling lock file.
MANIFEST_LOCK="${MANIFEST}.lock"
# Append stdin to the manifest while holding the lock.
manifest_append() {
  flock -x "${MANIFEST_LOCK}" sh -c 'cat >> "$1"' manifest_append "${MANIFEST}"
}
# Guard against an ever-growing manifest (e.g. Synology unreachable for many
# days while every daily-backup keeps appending). Past 500k lines,
# `--files-from=` rsync becomes pathological, so instead a marker file tells
# offsite-sync to ignore the manifest this round and do a full Step 1 sync.
MANIFEST_MAX_LINES=500000
# Create ${BACKUP_ROOT}/.force-full-sync when the manifest exceeds the limit.
# Always returns 0 when the manifest is absent or within the limit.
check_manifest_size() {
  local count
  if [ ! -f "${MANIFEST}" ]; then
    return 0
  fi
  count=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
  [ "${count:-0}" -gt "${MANIFEST_MAX_LINES}" ] || return 0
  warn "manifest at ${count} lines (>${MANIFEST_MAX_LINES}) — flagging next offsite-sync as full"
  touch "${BACKUP_ROOT}/.force-full-sync"
}
# --- Locking ---
# Track whether we got SIGTERM/SIGINT so cleanup can push a non-success metric.
# Without this, a systemd timeout-kill leaves WeeklyBackupFailing alerts blind:
# the script never reaches the success push at the end and the metric goes stale
# silently. (Root cause of 2026-04-30 → 2026-05-09 silent-failure run.)
KILLED=""
# Set to 1 only once this process has created ${LOCKFILE}. Until then cleanup
# is a no-op, so an instance that loses the lock race cannot delete the running
# instance's lock file or unmount / close the snapshot it is reading.
LOCK_HELD=""
cleanup() {
  [ -n "${LOCK_HELD}" ] || return 0
  # Recursively unmount /tmp/pvc-mount: previous SIGTERM'd runs left snapshot
  # mounts stacked here, which made every subsequent run start with an
  # already-occupied mountpoint and time out before reaching its own umount.
  while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
    umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
  done
  # Close any LUKS mappers we opened (or that were left over from a prior crash).
  for m in /dev/mapper/pvc-snap-*; do
    [ -e "$m" ] || continue
    cryptsetup close "$(basename "$m")" 2>/dev/null || true
  done
  rm -f "${LOCKFILE}"
  # push_metrics is defined further down the file; skip the push if the signal
  # arrived before the script got that far.
  if [ -n "${KILLED}" ] && declare -F push_metrics >/dev/null; then
    # status=2 = aborted (matches lvm-pvc-snapshot's convention)
    push_metrics 2 "${TOTAL_BYTES:-0}"
  fi
}
trap cleanup EXIT
trap 'KILLED=1; exit 143' TERM INT
# noclobber makes the redirection fail if the lock file already exists, so
# creating it doubles as the atomic "is anyone else running?" test.
if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
  die "Another instance is running (PID $(cat "${LOCKFILE}" 2>/dev/null || echo unknown))"
fi
LOCK_HELD=1
# Belt-and-braces: if a previous run was SIGTERM'd before its trap completed,
# /tmp/pvc-mount may have stacked mounts and stale LUKS mappers. The lock above
# guarantees we're alone, so it's safe to clean these up now.
while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
  umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
done
for m in /dev/mapper/pvc-snap-*; do
  [ -e "$m" ] || continue
  cryptsetup close "$(basename "$m")" 2>/dev/null || true
done
# --- Metrics ---
# Push this run's metrics to the Pushgateway job. Best-effort: a failed or
# slow push never fails the caller.
# Arguments: $1 - status value (default 0)
#            $2 - bytes synced (default 0)
push_metrics() {
  local status="${1:-0}" bytes="${2:-0}"
  printf '%s\n' \
    "daily_backup_last_run_timestamp $(date +%s)" \
    "daily_backup_last_status ${status}" \
    "daily_backup_bytes_synced ${bytes}" \
    | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
}
# --- PVC name resolution ---
# Print "<namespace>/<claim>" for each PV in the mapping cache whose CSI
# volumeHandle ends with the given LV name.
# Arguments: $1 - LV name
# Outputs:   matching names on stdout; nothing when there is no match
# Returns:   always 0. A jq failure (missing binary, unreadable or corrupt
#            cache) yields empty output so the caller's "cannot resolve,
#            skipping" branch handles it, instead of `set -e` aborting the
#            whole backup at the assignment.
resolve_pvc_name() {
  local lv="$1"
  jq -r --arg lv "${lv}" '
    .items[] |
    select(.spec.csi.volumeHandle // "" | endswith($lv)) |
    "\(.spec.claimRef.namespace)/\(.spec.claimRef.name)"
  ' "${MAPPING_CACHE}" 2>/dev/null || true
}
# --- NFS Export Health Check ---
# Pre-flight check that the NFS exports look sane before the backup starts.
# Catches: missing /etc/exports, fsid=0 on /srv/nfs, nfs-server not running,
# no /srv/nfs export loaded in the kernel.
# Added 2026-04-14 [PM-2026-04-14]: backup script accessed NFS causing stale handle
# propagation during the fsid=0 outage. Early check prevents cascading failures.
# Returns: 0 when every check passes, 1 on the first failing check.
check_nfs_exports() {
  local exports_file="/etc/exports"
  local active_exports

  [ -f "${exports_file}" ] || {
    log "WARN: ${exports_file} does not exist — NFS exports may be unconfigured"
    return 1
  }

  # fsid=0 on /srv/nfs is dangerous: it breaks NFSv4 subdirectory path
  # resolution. grep prints the offending line(s) as it matches.
  if grep -E '^/srv/nfs[[:space:]].*fsid=0' "${exports_file}" 2>/dev/null; then
    log "ERROR: /etc/exports contains fsid=0 on /srv/nfs — this will break all k8s NFS mounts!"
    log "ERROR: Remove fsid=0 and run: exportfs -ra && systemctl restart nfs-server"
    return 1
  fi

  # The server itself must be up.
  systemctl is-active --quiet nfs-server 2>/dev/null || {
    log "WARN: nfs-server is not running — NFS mounts will fail"
    return 1
  }

  # exportfs -s lists the exports that are actually loaded.
  active_exports=$(exportfs -s 2>/dev/null | grep -c '/srv/nfs' || true)
  if [ "${active_exports:-0}" -eq 0 ]; then
    log "WARN: No /srv/nfs exports active in kernel — run: exportfs -ra"
    return 1
  fi

  log "NFS export health check passed (${active_exports} /srv/nfs export(s) active)"
  return 0
}
# --- Main ---
# Run-wide accumulators. They are initialised before any check that may set
# them: resetting STATUS after the NFS check would silently discard the
# failure that check had just recorded.
STATUS=0
TOTAL_BYTES=0
log "=== daily-backup starting ==="
if ! mountpoint -q "${BACKUP_ROOT}"; then
  die "${BACKUP_ROOT} is not mounted"
fi
# NFS export health check — warn but don't abort (backup can proceed with block storage PVCs)
check_nfs_exports || {
  log "WARN: NFS export health check failed — NFS-backed PVC backups may fail"
  STATUS=1
}
# DO NOT truncate the manifest here.
#
# Truncation lives in offsite-sync-backup (only on successful sync). If
# offsite-sync failed yesterday — Synology unreachable, transient error —
# the manifest holds yesterday's unconsumed file list. Truncating at the
# start of today's daily-backup would silently lose those entries; they'd
# only reach Synology on the next monthly full sync.
#
# Appending duplicates across multiple runs is harmless — rsync transfers
# each file once. If the manifest grows pathologically (Synology down for
# weeks), the OffsiteBackupSync{Stale,Failing} alerts catch it.
# NFS data is synced to Synology via two paths: nfs-mirror → sda → Step 1
# for the curated subset, and inotify + Step 2 for the sda-bypass list.
# ============================================================
# STEP 1: PVC file-level copy from LVM thin snapshots
# ============================================================
# For each origin LV named vm-9999-pvc-* in VG "pve": take its newest
# *_snap_* snapshot, resolve the LV to namespace/pvc through the cached PV
# list, mount the snapshot read-only (via a read-only LUKS mapping when it is
# encrypted), rsync it into pvc-data/<week>/<namespace>/<pvc>, and copy any
# SQLite databases found up to three levels deep into sqlite-backup/.
log "--- Step 1: PVC file copy from snapshots ---"
# Weekly bucket name: year + week-of-year.
WEEK=$(date +%Y-%W)
# Last existing weekly bucket in name order; used as the rsync --link-dest base.
# NOTE(review): once this week's bucket exists, PREV is that same bucket — confirm intended.
PREV=$(ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | tail -1 || true)
# Cache LV→PVC mapping (fallback if kubectl is down next time)
if kubectl get pv -o json > /tmp/pv-list.json 2>/dev/null; then
cp /tmp/pv-list.json "${MAPPING_CACHE}"
rm -f /tmp/pv-list.json
fi
if [ ! -f "${MAPPING_CACHE}" ]; then
warn "No PV mapping cache and kubectl unavailable — skipping PVC copy"
STATUS=1
else
mkdir -p "${PVC_MOUNT}"
# Per-run counters, reported after the loop.
PVC_COUNT=0
PVC_FAIL=0
# Iterate origin LVs (not snapshots), find latest snapshot for each
for origin_lv in $(lvs --noheadings -o lv_name pve 2>/dev/null | grep 'vm-9999-pvc-' | grep -v '_snap_' | tr -d ' '); do
# Find latest snapshot for this origin
snap=$(lvs --noheadings -o lv_name pve 2>/dev/null | tr -d ' ' | grep "^${origin_lv}_snap_" | sort | tail -1 || true)
# Origins without any snapshot are skipped silently.
[ -z "${snap}" ] && continue
# Resolve human-readable name
ns_pvc=$(resolve_pvc_name "${origin_lv}")
if [ -z "${ns_pvc}" ] || [ "${ns_pvc}" = "null/null" ]; then
warn "Cannot resolve PVC name for ${origin_lv}, skipping"
continue
fi
# Skip-list: PVCs we deliberately don't keep offsite copies of.
# nextcloud-data-proxmox — orphaned pre-encryption PV (Released,
# Retain). Nextcloud moved to nextcloud-data-encrypted on 2026-04-13;
# this old unencrypted PV lingers (Retain) and was still being backed
# up weekly, filling the offsite Synology. Stop copying it (2026-06-01).
case "${ns_pvc}" in
nextcloud/nextcloud-data-proxmox)
log " skip ${ns_pvc} (orphaned pre-encryption PVC)"
continue ;;
esac
# Detect LUKS-encrypted volumes and set up mount device
# Defaults describe the unencrypted case: mount the snapshot LV itself.
LUKS_NAME=""
MOUNT_DEV="/dev/pve/${snap}"
MOUNT_OPTS="ro"
if blkid -o value -s TYPE "/dev/pve/${snap}" 2>/dev/null | grep -q 'crypto_LUKS'; then
# Clean up any stale LUKS mapping for this snapshot from a previous crashed run
# The mapper name is derived from the snapshot name (first 12 hex chars of
# its md5), so a leftover mapping has the name we are about to use.
STALE_LUKS="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
if [ -e "/dev/mapper/${STALE_LUKS}" ]; then
umount "/dev/mapper/${STALE_LUKS}" 2>/dev/null || true
cryptsetup close "${STALE_LUKS}" 2>/dev/null || true
fi
LUKS_KEY="/root/.luks-backup-key"
LUKS_NAME="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
if [ -f "${LUKS_KEY}" ] && cryptsetup open --type luks --key-file "${LUKS_KEY}" --readonly "/dev/pve/${snap}" "${LUKS_NAME}" 2>&1; then
MOUNT_DEV="/dev/mapper/${LUKS_NAME}"
MOUNT_OPTS="ro,noload" # noload skips ext4 journal replay on read-only LUKS
log " LUKS: decrypted ${snap}${LUKS_NAME}"
else
# A missing key file and a failed open both end up here.
warn "Failed to decrypt LUKS snapshot ${snap}"
PVC_FAIL=$((PVC_FAIL + 1))
continue
fi
fi
# Mount snapshot read-only, rsync files
if timeout 30 mount -o "${MOUNT_OPTS}" "${MOUNT_DEV}" "${PVC_MOUNT}" 2>&1; then
dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
mkdir -p "${dst}"
rsync_rc=0
# Per-PVC rsync timeout (30 min). Without this, a single hung
# PVC blocks the entire backup until systemd's TimeoutStartSec
# kills the script (4h ceiling), leaving every later PVC
# unbacked and silently triggering WeeklyBackupFailing. Picked
# 30 min as well above the largest PVC's normal copy time
# (immich-postgres ~10 GiB, ~3 min on local ext4) and well
# below the unit-level budget so we still have headroom to
# finish the rest.
# --link-dest is passed only when PREV is non-empty.
timeout 1800 rsync -a --delete \
${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
"${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
if [ "$rsync_rc" -eq 0 ]; then
PVC_COUNT=$((PVC_COUNT + 1))
elif [ "$rsync_rc" -eq 23 ] && [ -n "${LUKS_NAME}" ]; then
# rsync 23 = partial transfer; expected for LUKS noload mounts
# (in-flight writes have corrupt metadata from skipped journal replay)
PVC_COUNT=$((PVC_COUNT + 1))
log " partial rsync (LUKS noload) for ${ns_pvc} — OK"
elif [ "$rsync_rc" -eq 124 ]; then
# `timeout` exit 124 = wall-clock killed the rsync. Track
# separately so the next run still produces a metric and
# doesn't pretend nothing happened.
warn "rsync timed out for ${ns_pvc} after 30 min — moving on"
PVC_FAIL=$((PVC_FAIL + 1))
else
warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
PVC_FAIL=$((PVC_FAIL + 1))
fi
# Auto-detect and safely backup SQLite databases from snapshot
if command -v sqlite3 &>/dev/null; then
find "${PVC_MOUNT}" -maxdepth 3 \
\( -name '*.db' -o -name '*.sqlite' -o -name '*.sqlite3' \) \
-size +0 -type f 2>/dev/null | while read -r dbfile; do
# Verify it's actually SQLite (magic number check)
if head -c 15 "$dbfile" 2>/dev/null | grep -q 'SQLite format 3'; then
relpath="${dbfile#${PVC_MOUNT}/}"
dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
mkdir -p "$(dirname "${dest_file}")"
# 5-min sqlite timeout — same hang-prevention idea
# as rsync above. A corrupted SQLite or one held
# open by a writer in the snapshot can otherwise
# block .backup indefinitely.
if timeout 300 sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
log " SQLite: ${ns_pvc}/${relpath}"
else
# Fall back to a plain file copy; failures here are ignored.
cp "${dbfile}" "${dest_file}" 2>/dev/null || true
fi
fi
done
fi
# Normal unmount first, lazy unmount as fallback; never fatal.
umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || true
else
warn "Failed to mount snapshot ${snap}"
PVC_FAIL=$((PVC_FAIL + 1))
fi
# Close LUKS device if we opened one
if [ -n "${LUKS_NAME}" ]; then
cryptsetup close "${LUKS_NAME}" 2>/dev/null || true
fi
done
log " PVC copy: ${PVC_COUNT} OK, ${PVC_FAIL} failed"
# Any per-PVC failure marks the whole run as failed.
[ "${PVC_FAIL}" -gt 0 ] && STATUS=1
# Add PVC files to manifest (locked append)
if [ -d "${BACKUP_ROOT}/pvc-data/${WEEK}" ]; then
find "${BACKUP_ROOT}/pvc-data/${WEEK}" -type f 2>/dev/null | \
sed "s|^${BACKUP_ROOT}/||" | manifest_append
fi
# Prune old weekly versions (keep 4)
ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
ls -1d "${BACKUP_ROOT}/sqlite-backup"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
# Size of this week's bucket feeds the bytes metric.
PVC_BYTES=$(du -sb "${BACKUP_ROOT}/pvc-data/${WEEK}" 2>/dev/null | cut -f1 || true)
TOTAL_BYTES=$((TOTAL_BYTES + ${PVC_BYTES:-0}))
fi
# ============================================================
# STEP 3: pfsense backup (config.xml + full tar)
# ============================================================
log "--- Step 3: pfsense backup ---"
PFSENSE_DEST="${BACKUP_ROOT}/pfsense"
DATE=$(date +%Y%m%d)
PFSENSE_STATUS=0
mkdir -p "${PFSENSE_DEST}"
# Both downloads are written to "<final name>.partial" and renamed only after
# the transfer succeeded. A failed or interrupted transfer therefore never
# leaves a truncated file under the final name, where the retention below
# would count it and could evict a good copy, and it never clobbers a good
# file produced by an earlier run on the same day.
PFSENSE_CONFIG="${PFSENSE_DEST}/config-${DATE}.xml"
PFSENSE_TAR="${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz"
if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/dev/null; then
  # config.xml — primary restore artifact
  if scp -o ConnectTimeout=10 root@10.0.20.1:/cf/conf/config.xml "${PFSENSE_CONFIG}.partial" 2>/dev/null \
    && mv -f "${PFSENSE_CONFIG}.partial" "${PFSENSE_CONFIG}"; then
    log " OK: config.xml"
    echo "pfsense/config-${DATE}.xml" | manifest_append
  else
    rm -f "${PFSENSE_CONFIG}.partial"
    warn "Failed to copy pfsense config.xml"
    STATUS=1
    PFSENSE_STATUS=1
  fi
  # Full filesystem tar — Sundays only (weekly).
  # config.xml is the primary restore artifact and runs daily above; the
  # full filesystem tar is for forensic / package-state recovery only and
  # rarely-needed. Re-tarring 100M+ daily writes ~3G/month to sda + Synology
  # for unchanged content. Keep one fresh tarball per week instead.
  if [ "$(date +%u)" = "7" ]; then
    if ssh -o ConnectTimeout=10 root@10.0.20.1 \
      "tar czf - --exclude=/dev --exclude=/proc --exclude=/tmp --exclude=/var/run /" \
      > "${PFSENSE_TAR}.partial" 2>/dev/null \
      && mv -f "${PFSENSE_TAR}.partial" "${PFSENSE_TAR}"; then
      log " OK: weekly full tar ($(du -sh "${PFSENSE_TAR}" | cut -f1))"
      echo "pfsense/pfsense-full-${DATE}.tar.gz" | manifest_append
    else
      rm -f "${PFSENSE_TAR}.partial"
      warn "Failed to tar pfsense filesystem"
      STATUS=1
      PFSENSE_STATUS=1
    fi
  else
    log " skip weekly full tar (only runs Sundays)"
  fi
  # Retention: keep the 4 newest files of each kind. Tarballs are weekly, so
  # that is four weeks; config.xml is written daily, so that is four days.
  # NOTE(review): confirm four days of config.xml history is what is wanted.
  ls -t "${PFSENSE_DEST}"/config-*.xml 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
  ls -t "${PFSENSE_DEST}"/pfsense-full-*.tar.gz 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
else
  warn "Cannot SSH to pfsense (10.0.20.1) — skipping"
  STATUS=1
  PFSENSE_STATUS=1
fi
# Push pfsense-backup metrics in BOTH success and failure paths so
# PfsenseBackupStale + PfsenseBackupFailing alerts can fire instead of going
# silent when ssh-to-pfsense is broken.
{
  echo "backup_last_run_timestamp $(date +%s)"
  echo "backup_last_status ${PFSENSE_STATUS}"
  [ "${PFSENSE_STATUS}" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
  "${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
# ============================================================
# STEP 4: PVE host config backup
# ============================================================
log "--- Step 4: PVE host config ---"
mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
# Mirror /etc/pve (5 min cap); a failure is recorded but does not stop the run.
if ! timeout 300 rsync -a --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1; then
  warn "Failed to sync /etc/pve"
  STATUS=1
fi
# Keep copies of the deployed backup scripts; missing ones and copy errors
# are ignored.
for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
  if [ -f "${script}" ]; then
    cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
  fi
done
find "${BACKUP_ROOT}/pve-config" -type f 2>/dev/null \
  | sed "s|^${BACKUP_ROOT}/||" \
  | manifest_append
log " OK: PVE config"
check_manifest_size

# ============================================================
# STEP 5: Prune LVM snapshots older than 7 days
# ============================================================
log "--- Step 5: Snapshot pruning (7-day retention) ---"
if ! /usr/local/bin/lvm-pvc-snapshot prune 2>&1; then
  warn "Snapshot prune failed"
  STATUS=1
fi

# ============================================================
# Done
# ============================================================
# Report the manifest length (0 if it cannot be read), push the final
# metrics, and exit with the accumulated status.
MANIFEST_LINES=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
log "=== daily-backup complete (status=${STATUS}, ${TOTAL_BYTES} bytes, ${MANIFEST_LINES} files in manifest) ==="
push_metrics "${STATUS}" "${TOTAL_BYTES}"
exit "${STATUS}"

View file

@ -0,0 +1,10 @@
[Unit]
Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
[Timer]
OnCalendar=*-*-* 05:00:00
Persistent=true
RandomizedDelaySec=300
[Install]
WantedBy=timers.target

372
scripts/extend_vm_storage.sh Executable file
View file

@ -0,0 +1,372 @@
#!/usr/bin/env bash
# Extend disk storage on a Kubernetes node VM.
# Drains the node, shuts down the VM, resizes the disk in Proxmox,
# boots the VM, expands the filesystem, and uncordons the node.
#
# Usage: ./scripts/extend_vm_storage.sh <node-name> <size-increment>
# Example: ./scripts/extend_vm_storage.sh k8s-node2 +64G
# --- Constants ---
PROXMOX_HOST="root@192.168.1.127"
VM_SSH_USER="wizard"
KUBECTL="kubectl --kubeconfig $(pwd)/config"
SHUTDOWN_TIMEOUT=300
SSH_WAIT_TIMEOUT=300
POLL_INTERVAL=5
# --- Colors ---
# ANSI escape sequences; stored as literal backslash text and interpreted by
# `echo -e` when a message is printed.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'

# _say <color> <tag> <message...>
# Shared formatter: prints "<color>[TAG]<reset> message" on stdout.
_say() {
  local color="$1" tag="$2"
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}

# Levelled log helpers; all arguments are joined into one message.
info()  { _say "$BLUE" "INFO" "$@"; }
ok()    { _say "$GREEN" "OK" "$@"; }
warn()  { _say "$YELLOW" "WARN" "$@"; }
error() { _say "$RED" "ERROR" "$@"; }
# --- Node-to-VMID mapping ---
declare -A NODE_VMID=(
[k8s-master]=200
[k8s-node1]=201
[k8s-node2]=202
[k8s-node3]=203
[k8s-node4]=204
)
# --- Cleanup trap ---
# DRAINED_NODE holds the name of the node while it is drained; it is empty
# before the drain starts and again after a successful uncordon. The EXIT
# handler uses it to decide whether recovery instructions are needed.
DRAINED_NODE=""

# cleanup: EXIT handler. Silent when no node is left drained; otherwise prints
# the manual recovery steps for the node recorded in DRAINED_NODE.
cleanup() {
  [[ -z "$DRAINED_NODE" ]] && return 0

  local vmid="${NODE_VMID[$DRAINED_NODE]}"
  echo ""
  error "Script exited unexpectedly!"
  warn "The node '$DRAINED_NODE' may still be cordoned/drained."
  warn "Recovery steps:"
  warn " 1. Check VM status: ssh $PROXMOX_HOST 'qm status ${vmid}'"
  warn " 2. Start VM if stopped: ssh $PROXMOX_HOST 'qm start ${vmid}'"
  warn " 3. Uncordon node: $KUBECTL uncordon $DRAINED_NODE"
}
trap cleanup EXIT
# --- Input validation ---
# usage: print the command-line help on stdout and exit with status 1.
# The list of valid node names is taken from the keys of NODE_VMID.
usage() {
  printf '%s\n' \
    "Usage: $0 <node-name> <size-increment>" \
    "" \
    "Arguments:" \
    " node-name One of: ${!NODE_VMID[*]}" \
    " size-increment Disk size increase, e.g. +64G, +128G" \
    "" \
    "Example:" \
    " $0 k8s-node2 +64G"
  exit 1
}
if [[ $# -ne 2 ]]; then
usage
fi
NODE_NAME="$1"
SIZE_INCREMENT="$2"
if [[ -z "${NODE_VMID[$NODE_NAME]+x}" ]]; then
error "Unknown node: '$NODE_NAME'"
echo "Valid nodes: ${!NODE_VMID[*]}"
exit 1
fi
if [[ ! "$SIZE_INCREMENT" =~ ^\+[0-9]+G$ ]]; then
error "Invalid size increment: '$SIZE_INCREMENT'"
echo "Must match pattern +<number>G, e.g. +64G"
exit 1
fi
VMID="${NODE_VMID[$NODE_NAME]}"
# --- Resolve node IP via kubectl ---
info "Resolving IP for node '$NODE_NAME'..."
NODE_IP=$($KUBECTL get node "$NODE_NAME" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}' 2>/dev/null)
if [[ -z "$NODE_IP" ]]; then
error "Could not resolve IP for node '$NODE_NAME'. Is the cluster reachable?"
exit 1
fi
ok "Node IP: $NODE_IP"
# --- Query current disk size ---
info "Querying current disk size for VM $VMID..."
SCSI0_LINE=$(ssh "$PROXMOX_HOST" "qm config $VMID" 2>/dev/null | grep '^scsi0:')
if [[ -z "$SCSI0_LINE" ]]; then
error "Could not read scsi0 config for VM $VMID."
exit 1
fi
# Extract size value, e.g. "size=64G" from the config line
CURRENT_SIZE=$(echo "$SCSI0_LINE" | sed -n 's/.*size=\([0-9]*G\).*/\1/p')
if [[ -z "$CURRENT_SIZE" ]]; then
error "Could not parse current disk size from: $SCSI0_LINE"
exit 1
fi
# Strip the unit/sign and normalise both numbers to base 10. Without the 10#
# prefix bash arithmetic treats a leading zero as octal, so an increment such
# as "+08G" (accepted by the validation regex) fails with "value too great for
# base" and "+010G" would silently be summed as 8.
CURRENT_SIZE_NUM=${CURRENT_SIZE%G}
CURRENT_SIZE_NUM=$((10#${CURRENT_SIZE_NUM:-0}))
INCREMENT_NUM=${SIZE_INCREMENT//[+G]/}
INCREMENT_NUM=$((10#${INCREMENT_NUM:-0}))
NEW_SIZE_NUM=$((CURRENT_SIZE_NUM + INCREMENT_NUM))
ok "Current disk size: ${CURRENT_SIZE_NUM}G → New size: ${NEW_SIZE_NUM}G (${SIZE_INCREMENT})"
# Rejects a zero increment (e.g. "+0G"), which would make the whole
# drain/shutdown cycle pointless.
if [[ $NEW_SIZE_NUM -le $CURRENT_SIZE_NUM ]]; then
  error "New size (${NEW_SIZE_NUM}G) must be greater than current size (${CURRENT_SIZE_NUM}G)."
  exit 1
fi
# --- Confirmation ---
echo ""
echo "========================================="
echo " Extend VM Storage"
echo "========================================="
echo " Node: $NODE_NAME"
echo " VMID: $VMID"
echo " Node IP: $NODE_IP"
echo " Current: ${CURRENT_SIZE_NUM}G"
echo " Increment: $SIZE_INCREMENT"
echo " New size: ${NEW_SIZE_NUM}G"
echo " Proxmox: $PROXMOX_HOST"
echo "========================================="
echo ""
echo "This will:"
echo " 1. Drain the node (evict pods)"
echo " 2. Shut down the VM"
echo " 3. Resize disk (scsi0) from ${CURRENT_SIZE_NUM}G to ${NEW_SIZE_NUM}G"
echo " 4. Start the VM"
echo " 5. Expand the filesystem inside the guest"
echo " 6. Uncordon the node"
echo ""
read -rp "Proceed? [y/N] " confirm
if [[ ! "$confirm" =~ ^[yY]$ ]]; then
echo "Aborted."
exit 0
fi
# --- Step 1: Drain node ---
info "Step 1/7: Draining node '$NODE_NAME'..."
DRAINED_NODE="$NODE_NAME"
if ! $KUBECTL drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=300s; then
error "Failed to drain node '$NODE_NAME'."
exit 1
fi
ok "Node drained."
# --- Step 2: Shutdown VM ---
info "Step 2/7: Shutting down VM $VMID..."
if ! ssh "$PROXMOX_HOST" "qm shutdown $VMID"; then
error "Failed to send shutdown command to VM $VMID."
exit 1
fi
info "Waiting for VM to stop (timeout: ${SHUTDOWN_TIMEOUT}s)..."
elapsed=0
while true; do
status=$(ssh "$PROXMOX_HOST" "qm status $VMID" 2>/dev/null)
if [[ "$status" == *"stopped"* ]]; then
break
fi
if [[ $elapsed -ge $SHUTDOWN_TIMEOUT ]]; then
error "VM $VMID did not stop within ${SHUTDOWN_TIMEOUT}s. Current status: $status"
exit 1
fi
sleep "$POLL_INTERVAL"
elapsed=$((elapsed + POLL_INTERVAL))
done
ok "VM stopped."
# --- Step 3: Resize disk ---
info "Step 3/7: Resizing disk scsi0 by $SIZE_INCREMENT..."
if ! ssh "$PROXMOX_HOST" "qm resize $VMID scsi0 $SIZE_INCREMENT"; then
error "Failed to resize disk on VM $VMID."
exit 1
fi
ok "Disk resized."
# --- Step 4: Start VM ---
info "Step 4/7: Starting VM $VMID..."
if ! ssh "$PROXMOX_HOST" "qm start $VMID"; then
error "Failed to start VM $VMID."
exit 1
fi
info "Waiting for SSH to become available at $NODE_IP (timeout: ${SSH_WAIT_TIMEOUT}s)..."
elapsed=0
while true; do
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "true" 2>/dev/null; then
break
fi
if [[ $elapsed -ge $SSH_WAIT_TIMEOUT ]]; then
error "SSH not reachable on $NODE_IP within ${SSH_WAIT_TIMEOUT}s."
exit 1
fi
sleep "$POLL_INTERVAL"
elapsed=$((elapsed + POLL_INTERVAL))
done
ok "VM is up and SSH is reachable."
info "Waiting 10s for system stabilization..."
sleep 10
# --- Step 5: Expand filesystem ---
info "Step 5/7: Expanding filesystem inside the guest..."
ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" 'bash -s' <<'REMOTE_SCRIPT'
set -o pipefail
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
error() { echo -e "${RED}[ERROR]${NC} $*"; }
ROOT_DEV=$(findmnt -n -o SOURCE /)
ROOT_FSTYPE=$(findmnt -n -o FSTYPE /)
info "Root device: $ROOT_DEV"
info "Root filesystem: $ROOT_FSTYPE"
# Ensure growpart is available
if ! command -v growpart &>/dev/null; then
info "Installing growpart (cloud-guest-utils)..."
sudo apt-get update -qq && sudo apt-get install -y -qq cloud-guest-utils
fi
# resize_fs <device> <fstype>
# Grows the root filesystem to fill its (already enlarged) block device.
#   ext2/ext3/ext4 -> resize2fs on the given device
#   xfs            -> xfs_growfs on the / mount point
# Returns 0 on success, 1 on tool failure or an unsupported filesystem type.
resize_fs() {
  local target="$1" kind="$2"
  case "$kind" in
    ext4|ext3|ext2)
      info "Running resize2fs on $target..."
      if sudo resize2fs "$target"; then
        return 0
      fi
      error "resize2fs failed on $target"
      return 1
      ;;
    xfs)
      info "Running xfs_growfs on /..."
      if sudo xfs_growfs /; then
        return 0
      fi
      error "xfs_growfs failed"
      return 1
      ;;
    *)
      error "Unsupported filesystem type: $kind"
      return 1
      ;;
  esac
}
# Check if root is on LVM (device-mapper)
if [[ "$ROOT_DEV" == /dev/mapper/* || "$ROOT_DEV" == /dev/dm-* ]]; then
info "LVM layout detected."
# Find the PV device
PV_DEV=$(sudo pvs --noheadings -o pv_name | head -1 | tr -d ' ')
if [[ -z "$PV_DEV" ]]; then
error "Could not determine PV device."
exit 1
fi
info "PV device: $PV_DEV"
# Parse disk and partition number (handles /dev/sdaX and /dev/nvmeXnXpX)
if [[ "$PV_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
DISK="${BASH_REMATCH[1]}"
PARTNUM="${BASH_REMATCH[2]}"
elif [[ "$PV_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
DISK="${BASH_REMATCH[1]}"
PARTNUM="${BASH_REMATCH[2]}"
else
error "Could not parse disk/partition from PV: $PV_DEV"
exit 1
fi
info "Disk: $DISK, Partition: $PARTNUM"
# Grow partition
info "Growing partition $DISK partition $PARTNUM..."
sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
# Resize PV
info "Resizing PV $PV_DEV..."
if ! sudo pvresize "$PV_DEV"; then
error "pvresize failed on $PV_DEV"
exit 1
fi
# Resolve LV path if using /dev/dm-*
if [[ "$ROOT_DEV" == /dev/dm-* ]]; then
LV_PATH=$(sudo lvs --noheadings -o lv_path | head -1 | tr -d ' ')
else
LV_PATH="$ROOT_DEV"
fi
info "LV path: $LV_PATH"
# Extend LV
info "Extending LV $LV_PATH to use all free space..."
if ! sudo lvextend -l +100%FREE "$LV_PATH"; then
warn "lvextend reported no change (LV may already use all space)."
fi
# Resize filesystem
resize_fs "$LV_PATH" "$ROOT_FSTYPE"
if [[ $? -ne 0 ]]; then
exit 1
fi
else
info "Direct partition layout detected."
# Parse disk and partition number
if [[ "$ROOT_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
DISK="${BASH_REMATCH[1]}"
PARTNUM="${BASH_REMATCH[2]}"
elif [[ "$ROOT_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
DISK="${BASH_REMATCH[1]}"
PARTNUM="${BASH_REMATCH[2]}"
else
error "Could not parse disk/partition from: $ROOT_DEV"
exit 1
fi
info "Disk: $DISK, Partition: $PARTNUM"
# Grow partition
info "Growing partition $DISK partition $PARTNUM..."
sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
# Resize filesystem
resize_fs "$ROOT_DEV" "$ROOT_FSTYPE"
if [[ $? -ne 0 ]]; then
exit 1
fi
fi
ok "Filesystem expansion complete."
df -h /
REMOTE_SCRIPT
if [[ $? -ne 0 ]]; then
error "Filesystem expansion failed on the guest."
exit 1
fi
ok "Filesystem expanded."
# --- Step 6: Uncordon node ---
info "Step 6/7: Uncordoning node '$NODE_NAME'..."
if ! $KUBECTL uncordon "$NODE_NAME"; then
error "Failed to uncordon node '$NODE_NAME'."
exit 1
fi
DRAINED_NODE=""
ok "Node uncordoned."
# --- Step 7: Verify ---
info "Step 7/7: Verification"
echo ""
info "Disk usage on $NODE_NAME:"
ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "df -h /"
echo ""
info "Node status:"
$KUBECTL get node "$NODE_NAME"
echo ""
ok "Storage extension complete for $NODE_NAME."

View file

@ -0,0 +1,21 @@
# /etc/fan-control.env — config for the fan-control daemon (chmod 600).
# Deployed manually to the PVE host; the real file holds a secret token and is
# NOT committed. Copy this template, fill HA_TOKEN, scp to /etc/fan-control.env.
# Long-lived ha-sofia access token (Home Assistant -> Profile -> Security ->
# Long-lived access tokens). Empty => presence disabled, daemon runs COOL-only.
HA_TOKEN=
# --- optional overrides (defaults shown) ---
# HA_URL=http://192.168.1.8:8123
# GARAGE_ENTITY=sensor.garage_door_state_bg
# GARAGE_OPEN_STATE=Отворена
# HOLD_SECS=900 # quiet-mode hold after last garage activity (15 min)
# LOOP_INTERVAL=15
# PRESENCE_INTERVAL=30
# DEADBAND=3
# CEILING=83 # degC: hand back to Dell auto at/above this
# RESUME_BELOW=75
# RESUME_STABLE=120
# MAX_IPMI_FAILS=3
PUSHGATEWAY_URL=http://10.0.20.100:30091

View file

@ -0,0 +1,21 @@
[Unit]
Description=Presence-aware IPMI fan controller (Dell R730, garage)
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/fan-control.sh
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=-/etc/fan-control.env
ExecStart=/usr/local/bin/fan-control
# Belt-and-suspenders: whatever happens to the daemon, hand the fans back to
# the iDRAC's own automatic curve so the box is never stuck in manual mode.
ExecStopPost=/usr/bin/ipmitool raw 0x30 0x30 0x01 0x01
Restart=on-failure
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fan-control
[Install]
WantedBy=multi-user.target

262
scripts/fan-control.sh Normal file
View file

@ -0,0 +1,262 @@
#!/usr/bin/env bash
# Presence-aware IPMI fan controller for the Dell R730 PVE host (192.168.1.127).
#
# The server lives in the GARAGE (memory id=1723). Two curves, picked by
# whether someone is physically in the garage:
# - COOL : garage empty -> minimise CPU temp, noise is free.
# - QUIET : someone in the garage -> minimise noise, accept a warmer CPU.
# Presence comes from the ha-sofia garage-door sensor: door open now, OR it
# last changed within HOLD_SECS, => QUIET. Otherwise COOL.
#
# Safety (manual fan mode bypasses the iDRAC's own curve, so we backstop it):
# - On ANY exit (crash/stop/TERM) the EXIT trap hands fans back to Dell
# automatic control (raw 0x30 0x30 0x01 0x01). systemd ExecStopPost
# repeats this belt-and-suspenders.
# - CPU >= CEILING -> hand back to Dell auto until it recovers (RESUME_BELOW
# held for RESUME_STABLE s). The firmware's own emergency cooling takes over.
# - IPMI read failures (>= MAX_IPMI_FAILS) -> hand back to Dell auto.
#
# Deploy: scp to /usr/local/bin/fan-control (strip .sh) + install
# fan-control.service + /etc/fan-control.env. Same pattern as apply-mbps-caps.
# Tests: test-fan-control.sh (sources this file, exercises the pure functions).
# Design: infra/docs/plans/2026-06-04-pve-fan-control-design.md
# Runbook: infra/docs/runbooks/fan-control.md
set -uo pipefail
# ---- configuration (override via /etc/fan-control.env) ----
: "${IPMITOOL:=ipmitool}"
: "${LOOP_INTERVAL:=15}" # seconds between temperature decisions
: "${PRESENCE_INTERVAL:=30}" # seconds between ha-sofia garage-door polls
: "${DEADBAND:=3}" # degC hysteresis applied to downward fan steps
: "${CEILING:=83}" # degC: hand back to Dell auto at/above this
: "${RESUME_BELOW:=75}" # degC: eligible to resume manual below this...
: "${RESUME_STABLE:=120}" # ...once held that long
: "${HOLD_SECS:=900}" # quiet-mode hold after last garage activity (15 min)
: "${HA_URL:=http://192.168.1.8:8123}"
: "${HA_TOKEN:=}" # long-lived ha-sofia token; empty => presence disabled (COOL only)
: "${GARAGE_ENTITY:=sensor.garage_door_state_bg}"
: "${GARAGE_OPEN_STATE:=Отворена}" # ha state string meaning "open"
# HA control: a mode select + manual % the user drives from Home Assistant.
# auto => garage-presence curve (default); cool/quiet => force that curve;
# manual => hold MANUAL_ENTITY %. Empty HA_TOKEN or unreachable HA => auto.
: "${MODE_ENTITY:=input_select.r730_fan_mode}"
: "${MANUAL_ENTITY:=input_number.r730_fan_manual_pct}"
: "${PUSHGATEWAY_URL:=}" # optional Prometheus Pushgateway base URL
: "${MAX_IPMI_FAILS:=3}"
: "${DRY_RUN:=0}" # 1 => log IPMI actions instead of executing
: "${RUN_ONCE:=0}" # 1 => one iteration then exit (testing)
# Continuous LINEAR fan curve (2026-06-05): fan% ramps proportionally with CPU
# temp between (T_LO,P_LO) and (T_HI,P_HI), clamped flat outside. Replaces the old
# discrete step-bands (which flapped at band edges — e.g. 45<->65%). Both modes
# reach 100% right at the 83°C ceiling. Anchors are env-tunable.
# COOL (garage empty): 30% @50°C .. 100% @83°C (~2.1%/°C; equilibrium ~60°C/~51%)
# QUIET (someone there): 20% @68°C .. 100% @83°C (near-silent until ~70°C)
# Web-researched: a linear curve + 2-3°C hysteresis is the homelab standard; PID is
# overkill for this slow thermal loop. See docs/plans/2026-06-04-pve-fan-control-design.md.
: "${COOL_T_LO:=50}"; : "${COOL_P_LO:=30}"; : "${COOL_T_HI:=83}"; : "${COOL_P_HI:=100}"
: "${QUIET_T_LO:=68}"; : "${QUIET_P_LO:=20}"; : "${QUIET_T_HI:=83}"; : "${QUIET_P_HI:=100}"
: "${MIN_STEP:=3}" # min fan-% change worth an IPMI write (anti-jitter on the smooth curve)
# log <message...>: print the message on stdout prefixed with a local
# ISO-8601-style timestamp (with numeric UTC offset).
log() {
  local stamp
  stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  printf '%s %s\n' "$stamp" "$*"
}
# ---- pure functions (no side effects; unit-tested) ----
# fc_curve <mode> <temp> -> fan percent
# Linear interpolation between the mode's two anchors (T_LO,P_LO)..(T_HI,P_HI),
# flat below T_LO and flat above T_HI. "quiet" selects the QUIET_* anchors;
# any other mode uses the COOL_* anchors. Integer math, rounded to nearest.
fc_curve() {
  local mode="$1" temp="$2"
  local t_min p_min t_max p_max
  case "$mode" in
    quiet) t_min=$QUIET_T_LO; p_min=$QUIET_P_LO; t_max=$QUIET_T_HI; p_max=$QUIET_P_HI ;;
    *)     t_min=$COOL_T_LO;  p_min=$COOL_P_LO;  t_max=$COOL_T_HI;  p_max=$COOL_P_HI ;;
  esac

  if (( temp <= t_min )); then
    echo "$p_min"
  elif (( temp >= t_max )); then
    echo "$p_max"
  else
    # Adding half the span before dividing rounds instead of truncating.
    local span=$(( t_max - t_min ))
    echo $(( p_min + ( (temp - t_min) * (p_max - p_min) + span / 2 ) / span ))
  fi
}
# fc_decide <mode> <temp> <current_pct> <deadband> -> fan percent
# Asymmetric hysteresis around fc_curve:
#   - current < 0 (no value applied yet) or the curve asks for >= current:
#     follow the curve immediately.
#   - the curve asks for less: only step down if the curve evaluated DEADBAND
#     degrees hotter is still below the current percent; otherwise hold.
fc_decide() {
  local mode="$1" temp="$2" current="$3" deadband="$4"
  local want
  want="$(fc_curve "$mode" "$temp")"

  if (( current < 0 || want >= current )); then
    echo "$want"
    return 0
  fi

  local hotter
  hotter="$(fc_curve "$mode" "$((temp + deadband))")"
  if (( hotter < current )); then
    echo "$want"
  else
    echo "$current"
  fi
}
# fc_presence_mode <state> <last_changed_epoch> <now_epoch> <hold_secs> <open_state> -> quiet|cool
# "quiet" when the sensor state equals <open_state>, or when the last change
# happened less than <hold_secs> seconds before <now_epoch>; "cool" otherwise.
fc_presence_mode() {
  local state="$1" changed_at="$2" now="$3" hold="$4" open_state="$5"
  if [[ "$state" == "$open_state" ]] || (( now - changed_at < hold )); then
    echo "quiet"
  else
    echo "cool"
  fi
}
# fc_parse_temp <ipmitool 'Temp' line> -> integer degC
# Prints the digits of the first "<digits> degrees C" token found in $1.
# Prints nothing when no such token is present; main() treats an empty
# temperature as an unreadable sensor.
# NOTE: this file enables `pipefail`, so the function's exit status is
# non-zero when either grep finds no match — callers here only use the output.
fc_parse_temp() {
echo "$1" | grep -oE '[0-9]+ degrees C' | grep -oE '^[0-9]+' | head -1
}
# fc_json_str_field <json> <key> -> string value (first match; jq-free)
# Regex-based extraction, not a JSON parser:
#   - grep finds the first `"<key>" : "<value>"` pair anywhere in the text
#     (nesting is ignored), then sed strips everything but <value>.
#   - only string values match (the value must be double-quoted).
#   - the value pattern is [^"]*, so a value containing an escaped quote (\")
#     is cut at that quote.
#   - <key> is inserted into the regex as-is; pass plain identifiers only.
# Prints nothing when the key is absent.
fc_json_str_field() {
printf '%s' "$1" | grep -oE "\"$2\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" | head -1 \
| sed -E "s/.*:[[:space:]]*\"(.*)\"\$/\1/"
}
# fc_pct_to_hex <pct> -> 0xNN
# Formats an integer percent as a zero-padded, lowercase hex byte literal
# (no trailing newline), the form passed to the ipmitool raw command.
fc_pct_to_hex() {
  local pct="$1"
  printf '0x%02x' "$pct"
}
# fc_clamp <pct> -> 0..100
# Limits an integer percent to the valid fan range.
fc_clamp() {
  local pct="$1"
  if (( pct < 0 )); then
    pct=0
  elif (( pct > 100 )); then
    pct=100
  fi
  echo "$pct"
}
# fc_fan_watts <rpm> -> estimated TOTAL fan power (W), integer.
# This is a MODEL, not a measurement: the iDRAC exposes only total DCMI watts
# and RPM, so fan power is estimated as proportional to RPM cubed (fan affinity
# law), calibrated to the 2026-06-05 power sweep (fits within ~3W:
# ~2W @4800rpm · ~17W @9360 · ~42W @12720 · ~99W @16920).
# Integer form of 0.0205·(rpm/1e3)³, i.e. rpm³ · 205 / 1e13 (truncating).
fc_fan_watts() {
  local rpm="$1"
  echo $(( rpm * rpm * rpm * 205 / 10000000000000 ))
}
# fc_resolve <ha_mode> <temp> <manual_pct> <presence> <current> <deadband> -> pct
# Maps the Home Assistant mode to a fan percent (the hard temperature ceiling
# is enforced by the caller, not here):
#   manual      -> fc_clamp(manual_pct); no curve, no hysteresis
#   auto        -> fc_decide on the curve named by <presence>
#   anything else (cool|quiet) -> fc_decide on the curve named by <ha_mode>
fc_resolve() {
  local ha_mode="$1" temp="$2" manual_pct="$3" presence="$4" current="$5" deadband="$6"
  local curve_mode
  case "$ha_mode" in
    manual)
      fc_clamp "$manual_pct"
      return 0
      ;;
    auto) curve_mode="$presence" ;;
    *)    curve_mode="$ha_mode" ;;
  esac
  fc_decide "$curve_mode" "$temp" "$current" "$deadband"
}
# ---- side-effecting wrappers ----
# 1 once manual fan mode has been switched on via IPMI (or simulated in
# DRY_RUN); restore_auto resets it to 0.
ipmi_manual_on=0

# set_manual <pct>
# Applies a fixed fan percent. Switches the controller to manual mode first if
# that has not been done yet, then sends the percent as a hex byte.
# DRY_RUN=1 only logs the action. Returns 1 if enabling manual mode fails,
# otherwise the status of the final ipmitool call.
set_manual() {
  local pct="$1"
  local hex
  hex="$(fc_pct_to_hex "$pct")"

  if (( DRY_RUN == 1 )); then
    log "DRY set fan ${pct}% (${hex})"
    ipmi_manual_on=1
    return 0
  fi

  if (( ipmi_manual_on == 0 )); then
    if ! "$IPMITOOL" raw 0x30 0x30 0x01 0x00 >/dev/null 2>&1; then
      return 1
    fi
    ipmi_manual_on=1
  fi

  "$IPMITOOL" raw 0x30 0x30 0x02 0xff "$hex" >/dev/null 2>&1
}
# restore_auto: hand fan control back to the Dell automatic curve and mark
# manual mode as off. DRY_RUN=1 only logs. Always returns 0, even when the
# ipmitool call fails (best-effort safety action).
restore_auto() {
  if (( DRY_RUN == 1 )); then
    log "DRY restore Dell auto fan control"
  else
    "$IPMITOOL" raw 0x30 0x30 0x01 0x01 >/dev/null 2>&1
  fi
  ipmi_manual_on=0
}
# read_cpu_temp -> integer degC, or empty output when it cannot be read.
# Takes the first line of `ipmitool sdr type temperature` that starts with
# "Temp " and parses it with fc_parse_temp. ipmitool errors are silenced, so
# any failure shows up only as empty output.
read_cpu_temp() {
fc_parse_temp "$("$IPMITOOL" sdr type temperature 2>/dev/null | grep -E '^Temp ' | head -1)"
}
# read_fan_rpm -> integer RPM of the first "Fan1..." sensor row.
# Uses the 5th '|'-separated column of `ipmitool sdr type fan`, stripped to
# digits; `$5+0` forces a numeric result (0 if the column had no digits).
# Prints nothing when no row matches or ipmitool fails.
read_fan_rpm() { # Fan1 RPM — representative (all 6 fans are set together)
"$IPMITOOL" sdr type fan 2>/dev/null | awk -F'|' '/^Fan1/{gsub(/[^0-9]/,"",$5); print $5+0; exit}'
}
# Presence cache: last resolved mode and the epoch of the last poll attempt.
presence_cache="cool"; presence_ts=0
# get_presence -> quiet|cool (echoed; the same value is left in presence_cache)
# Polls the garage-door entity in Home Assistant at most once every
# PRESENCE_INTERVAL seconds and converts it to a mode via fc_presence_mode.
# Every failure path (no token, curl error, missing state field) echoes the
# cached value instead of failing.
# NOTE: presence_cache / presence_ts are plain shell variables, so the throttle
# and the last-known-value fallback only persist when this function runs in the
# caller's shell. Inside a command substitution `$(get_presence)` the updates
# are made in a subshell and discarded.
get_presence() {
local now; now="$(date +%s)"
# Throttle: within the poll interval, answer from the cache.
if (( now - presence_ts < PRESENCE_INTERVAL )); then echo "$presence_cache"; return 0; fi
# Stamp the attempt before polling so failures are throttled too.
presence_ts="$now"
[[ -z "$HA_TOKEN" ]] && { echo "$presence_cache"; return 0; }
local resp state lc_iso lc_epoch
resp="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
"$HA_URL/api/states/$GARAGE_ENTITY" 2>/dev/null)" || { echo "$presence_cache"; return 0; }
state="$(fc_json_str_field "$resp" state)"
[[ -z "$state" ]] && { echo "$presence_cache"; return 0; }
lc_iso="$(fc_json_str_field "$resp" last_changed)"
# An unparseable/missing timestamp falls back to "now", i.e. it is treated as
# a change that just happened (which fc_presence_mode maps to quiet while
# HOLD_SECS > 0). `date -d` is the GNU date form.
lc_epoch="$(date -d "$lc_iso" +%s 2>/dev/null || echo "$now")"
presence_cache="$(fc_presence_mode "$state" "$lc_epoch" "$now" "$HOLD_SECS" "$GARAGE_OPEN_STATE")"
echo "$presence_cache"
}
# ha_entity_state <entity> -> state string (empty if HA disabled/unreachable)
# Fetches /api/states/<entity> from Home Assistant with the bearer token and
# prints its "state" field. An empty HA_TOKEN or a failed request prints
# nothing and still returns 0, so callers only need to test for empty output.
ha_entity_state() {
  if [[ -z "$HA_TOKEN" ]]; then
    return 0
  fi

  local body
  if ! body="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
    "$HA_URL/api/states/$1" 2>/dev/null)"; then
    return 0
  fi

  fc_json_str_field "$body" state
}
# push_metrics <temp> <pct> <mode> <ha_ok> <fallback> [fan_rpm] [fan_watts_est]
# Best-effort push of one sample to the Prometheus Pushgateway under
# job=fan_control, instance=pve-r730. No-op when PUSHGATEWAY_URL is empty;
# curl failures are ignored (`|| true`) so metrics can never stall the loop.
# <mode> is encoded as a number: quiet=1, cool=2, manual=3, anything else
# (e.g. "fallback") = 0. The optional RPM / watts arguments default to 0.
# The heredoc below is the request body: its "# TYPE" lines are payload in the
# Prometheus text format, not shell comments.
push_metrics() { # <temp> <pct> <mode> <ha_ok> <fallback> [fan_rpm] [fan_watts_est]
[[ -z "$PUSHGATEWAY_URL" ]] && return 0
local mode_num; case "$3" in quiet) mode_num=1;; cool) mode_num=2;; manual) mode_num=3;; *) mode_num=0;; esac
curl -fsS --max-time 5 --data-binary @- \
"$PUSHGATEWAY_URL/metrics/job/fan_control/instance/pve-r730" >/dev/null 2>&1 <<EOF || true
# TYPE pve_fan_control_cpu_temp_celsius gauge
pve_fan_control_cpu_temp_celsius $1
# TYPE pve_fan_control_fan_percent gauge
pve_fan_control_fan_percent $2
# TYPE pve_fan_control_mode gauge
pve_fan_control_mode $mode_num
# TYPE pve_fan_control_ha_reachable gauge
pve_fan_control_ha_reachable $4
# TYPE pve_fan_control_fallback gauge
pve_fan_control_fallback $5
# TYPE pve_fan_control_fan_rpm gauge
pve_fan_control_fan_rpm ${6:-0}
# TYPE pve_fan_control_fan_watts_est gauge
pve_fan_control_fan_watts_est ${7:-0}
EOF
}
# main: the daemon's control loop. Each iteration:
#   1. reads the CPU temperature; after MAX_IPMI_FAILS consecutive unreadable
#      samples it hands the fans back to Dell auto;
#   2. at/above CEILING it hands back to Dell auto and stays in "fallback"
#      until the temperature has been below RESUME_BELOW for RESUME_STABLE s;
#   3. otherwise resolves the wanted percent from the HA mode (auto / cool /
#      quiet / manual) and garage presence, writes it via IPMI when it is the
#      first write or differs from the applied value by >= MIN_STEP, and
#      pushes metrics.
# RUN_ONCE=1 performs a single iteration and returns (used by tests).
# An EXIT trap restores Dell auto fan control however the script ends.
main() {
  log "fan-control start (loop=${LOOP_INTERVAL}s presence=${PRESENCE_INTERVAL}s hold=${HOLD_SECS}s ceiling=${CEILING}C dry_run=${DRY_RUN})"
  trap 'log "exit — restoring Dell auto fan control"; restore_auto' EXIT

  # current: last percent successfully applied (-1 = none / Dell auto active)
  # cool_since: epoch when temp first dropped below RESUME_BELOW (0 = not yet)
  local current=-1 fails=0 in_fallback=0 cool_since=0
  local temp ha_mode ha_ok manual_pct presence eff pct rpm fan_w

  while true; do
    temp="$(read_cpu_temp)"
    if [[ -z "$temp" ]]; then
      fails=$((fails + 1)); log "WARN cannot read CPU temp ($fails/$MAX_IPMI_FAILS)"
      if (( fails >= MAX_IPMI_FAILS )); then log "ERR temp unreadable — Dell auto"; restore_auto; current=-1; fi
      if (( RUN_ONCE == 1 )); then break; fi
      sleep "$LOOP_INTERVAL"; continue
    fi
    fails=0

    # --- hard ceiling: firmware takes over ---
    if (( temp >= CEILING )); then
      if (( in_fallback == 0 )); then
        log "CEILING temp=${temp}C >= ${CEILING}C — Dell auto"
        restore_auto; current=-1; in_fallback=1
      fi
      # A new ceiling hit invalidates any recovery countdown in progress, so the
      # RESUME_STABLE window must start over once the temperature drops again.
      cool_since=0
      push_metrics "$temp" 0 fallback 1 1
      if (( RUN_ONCE == 1 )); then break; fi
      sleep "$LOOP_INTERVAL"; continue
    fi

    # --- recovery from fallback: require RESUME_STABLE s below RESUME_BELOW ---
    if (( in_fallback == 1 )); then
      if (( temp < RESUME_BELOW )); then
        if (( cool_since == 0 )); then cool_since="$(date +%s)"; fi
        if (( $(date +%s) - cool_since < RESUME_STABLE )); then
          push_metrics "$temp" 0 fallback 1 1
          if (( RUN_ONCE == 1 )); then break; fi
          sleep "$LOOP_INTERVAL"; continue
        fi
        log "recovered (temp<${RESUME_BELOW}C ${RESUME_STABLE}s) — resuming manual"
        in_fallback=0; cool_since=0
      else
        cool_since=0
        push_metrics "$temp" 0 fallback 1 1
        if (( RUN_ONCE == 1 )); then break; fi
        sleep "$LOOP_INTERVAL"; continue
      fi
    fi

    # HA-desired mode (auto/cool/quiet/manual); unreachable/unset => auto.
    # ha_entity_state prints nothing when the token is unset or the request
    # fails, so an empty answer is what marks HA as not reachable in metrics.
    ha_ok=1
    ha_mode="$(ha_entity_state "$MODE_ENTITY")"
    if [[ -z "$ha_mode" ]]; then
      ha_ok=0
      ha_mode="auto"
    fi
    case "$ha_mode" in auto|cool|quiet|manual) ;; *) ha_mode="auto" ;; esac

    manual_pct=0
    if [[ "$ha_mode" == "manual" ]]; then
      # HA numbers may arrive as "40.0": drop the fraction, reject non-numeric.
      manual_pct="$(ha_entity_state "$MANUAL_ENTITY")"; manual_pct="${manual_pct%%.*}"
      [[ "$manual_pct" =~ ^[0-9]+$ ]] || manual_pct=0
    fi

    presence="cool"
    if [[ "$ha_mode" == "auto" ]]; then
      # Call get_presence in THIS shell (not via command substitution): it keeps
      # its poll throttle and last-known value in shell variables, which a
      # subshell would throw away. The result is read back from presence_cache.
      get_presence >/dev/null
      presence="$presence_cache"
    fi

    # eff: the mode label reported in logs/metrics.
    if [[ "$ha_mode" == "manual" ]]; then eff="manual"
    elif [[ "$ha_mode" == "auto" ]]; then eff="$presence"
    else eff="$ha_mode"; fi

    pct="$(fc_resolve "$ha_mode" "$temp" "$manual_pct" "$presence" "$current" "$DEADBAND")"

    # Only write when first-run or the change clears MIN_STEP (kills 1-2% jitter
    # on the continuous curve; fc_decide already gives asymmetric hysteresis).
    if (( current < 0 || pct - current >= MIN_STEP || current - pct >= MIN_STEP )); then
      if set_manual "$pct"; then
        log "temp=${temp}C ha_mode=${ha_mode} eff=${eff} fan=${pct}% (was ${current}%)"
        current="$pct"
      else
        log "WARN set_manual ${pct}% failed"
      fi
    fi

    rpm="$(read_fan_rpm)"; rpm="${rpm:-0}"; fan_w="$(fc_fan_watts "$rpm")"
    push_metrics "$temp" "$current" "$eff" "$ha_ok" 0 "$rpm" "$fan_w"
    if (( RUN_ONCE == 1 )); then break; fi
    sleep "$LOOP_INTERVAL"
  done
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@"; fi

View file

@ -0,0 +1,76 @@
#!/usr/bin/env bash
# One-shot migration of every private image on registry.viktorbarzin.me to
# Forgejo. Used as a stop-gap when the dual-push CI pipelines aren't
# producing Forgejo images on their own (Forgejo-Woodpecker forge driver
# context-deadline-exceeded issue, see bd code-d3y / 2026-05-07).
#
# Pulls each image from registry.viktorbarzin.me, retags, pushes to
# forgejo.viktorbarzin.me/viktor/<name>:<tag> — preserving the blob bytes
# verbatim so the cluster can flip image= without a rebuild.
#
# Run from any host with docker + network reach to BOTH registries. Auth
# from `docker login` (~/.docker/config.json) — make sure both registries
# are logged in:
# docker login registry.viktorbarzin.me -u viktorbarzin
# docker login forgejo.viktorbarzin.me -u viktor # use viktor PAT, not ci-pusher
#
# (ci-pusher CANNOT push to viktor/<image> — Forgejo container packages
# are scoped to the pushing user. Only viktor's PAT can write to viktor/*.)
#
# After the script, the new image lives at
# forgejo.viktorbarzin.me/viktor/<name>:<tag>
# Phase 3 of the consolidation flips infra/stacks/<svc>/main.tf image=
# to that path.
set -euo pipefail
OLD_REG=registry.viktorbarzin.me
NEW_REG=forgejo.viktorbarzin.me/viktor
# Image list: <name>:<tag>. Generated 2026-05-07 from `grep -rEn 'image\s*=\s*
# "registry\.viktorbarzin\.me'` across infra/stacks/.
#
# Excluded:
# - wealthfolio-sync: registry repo exists but has 0 tags (CronJob has been
# broken for 36+ days, separate decision needed). User to triage before
# migration.
# - fire-planner: registry repo exists but has 0 tags. Dockerfile + CI added
# in this session (commit 8b53d99e); rebuild via Woodpecker before flipping.
IMAGES=(
"chrome-service-novnc:v4"
"chrome-service-novnc:latest"
"payslip-ingest:latest"
"job-hunter:latest"
"claude-agent-service:latest"
"freedify:latest"
"beadboard:latest"
"infra-ci:latest"
)
# Scratch file for `docker pull` output. Created with mktemp (no predictable
# /tmp name) and removed on any exit path.
pull_log="$(mktemp)"
trap 'rm -f -- "$pull_log"' EXIT

# For each image: pull from the old registry, retag for Forgejo, push, then
# drop the local copies.
for img in "${IMAGES[@]}"; do
  echo "=== $img ==="
  src="$OLD_REG/$img"
  dst="$NEW_REG/$img"

  # Decide on the pull's own exit status rather than on matching its output.
  if ! docker pull "$src" >"$pull_log" 2>&1; then
    if grep -q 'not found' "$pull_log"; then
      echo " SKIP — image not present in source registry"
      continue
    fi
    # Any other failure (auth, network, ...) is fatal: report it here instead
    # of letting `docker tag` fail later with a misleading message.
    echo " ERROR — pull failed for $src:" >&2
    cat "$pull_log" >&2
    exit 1
  fi

  echo " tag → $dst"
  docker tag "$src" "$dst"
  echo " push $dst"
  # pipefail is set for this script, so a failed push still aborts the run.
  docker push "$dst" 2>&1 | tail -2
  echo " cleanup local copy"
  # Best-effort: a failed local cleanup must not stop the migration.
  docker rmi "$src" "$dst" 2>&1 | tail -1 || true
done
echo ""
echo "Done. Verify in Forgejo Web UI: https://forgejo.viktorbarzin.me/viktor/-/packages?type=container"
echo "Phase 3 of the plan flips infra/stacks/{wealthfolio,fire-planner}/main.tf image= references."

View file

@ -0,0 +1,698 @@
// Frigate Bulk Classification Labeler
// Paste this into the browser console on the Frigate /classification page
// while viewing a model's training images.
//
// Image URL pattern: /clips/{modelName}/train/{filename}
// Categorize API: POST /api/classification/{modelName}/dataset/categorize
// body: { category: "...", training_file: "..." }
// Delete API: POST /api/classification/{modelName}/train/delete
// body: { ids: ["..."] }
// Dataset API: GET /api/classification/{modelName}/dataset
// returns: { categories: { catName: [files...] }, training_metadata: {...} }
(async () => {
"use strict";
// --- Configuration ---
const API_BASE = window.location.origin + "/api";
const TOOLBAR_ID = "bulk-classify-toolbar";
// Frigate's axios instance sends these headers on every request.
// X-CSRF-TOKEN is required for state-modifying (POST/PUT/DELETE) requests.
const API_HEADERS = {
"Content-Type": "application/json",
"X-CSRF-TOKEN": "1",
"X-CACHE-BYPASS": "1",
};
// Abort if already injected
if (document.getElementById(TOOLBAR_ID)) {
console.log("Bulk classifier already active. Refresh page to re-inject.");
return;
}
// --- Extract model name from page ---
// Training images use src="/clips/{modelName}/train/{filename}"
let modelName = null;
// Method 1: Extract from training image src on the page
for (const img of document.querySelectorAll("img")) {
const src = img.getAttribute("src") || "";
const m = src.match(/\/clips\/([^/]+)\/train\//);
if (m) { modelName = decodeURIComponent(m[1]); break; }
}
// Method 2: List all custom models from config and let the user pick
if (!modelName) {
try {
const resp = await fetch(`${API_BASE}/config`);
const config = await resp.json();
// Custom classification models are under config.classification.custom
const models = Object.keys(config.classification?.custom || {});
if (models.length === 1) {
modelName = models[0];
} else if (models.length > 1) {
modelName = prompt(
`Multiple classification models found. Enter the model name:\n\n${models.join(", ")}`,
);
}
} catch (_) {}
}
if (!modelName) {
alert(
"Could not detect model name.\nMake sure you are on the /classification page with training images visible.",
);
return;
}
console.log(`[bulk-classify] Detected model: "${modelName}"`);
// --- Fetch categories from the dataset API ---
let categories = [];
try {
const resp = await fetch(`${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset`);
const data = await resp.json();
// Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
categories = Object.keys(data.categories || data);
} catch (e) {
console.error("Failed to fetch categories:", e);
}
// Deduplicate
categories = [...new Set(categories)];
console.log("[bulk-classify] Categories:", categories);
// --- Fetch all training filenames and build event groups ---
// Frigate groups training images by eventId (first two segments of the filename).
// Filename format: {timestamp}-{randomId}-{timestamp2}-{label}-{score}.webp
// EventId = "{timestamp}-{randomId}"
let allTrainFiles = [];
const eventGroups = {}; // eventId -> [filename, ...]
function parseEventId(filename) {
const base = filename.replace(/\.webp$/, "");
const parts = base.split("-");
if (parts.length >= 2) return `${parts[0]}-${parts[1]}`;
return filename; // fallback: treat as its own group
}
try {
const resp = await fetch(
`${API_BASE}/classification/${encodeURIComponent(modelName)}/train`,
{ headers: API_HEADERS },
);
allTrainFiles = await resp.json();
for (const f of allTrainFiles) {
const eid = parseEventId(f);
if (!eventGroups[eid]) eventGroups[eid] = [];
eventGroups[eid].push(f);
}
console.log(
`[bulk-classify] Loaded ${allTrainFiles.length} training files in ${Object.keys(eventGroups).length} event groups.`,
);
} catch (e) {
console.error("[bulk-classify] Failed to fetch training files:", e);
}
// Get all filenames in the same event group as the given filename
function getGroupFiles(filename) {
const eid = parseEventId(filename);
return eventGroups[eid] || [filename];
}
// --- State ---
const selected = new Set();
// --- Inject styles ---
const style = document.createElement("style");
style.textContent = `
#${TOOLBAR_ID} {
position: fixed;
bottom: 20px;
left: 50%;
transform: translateX(-50%);
z-index: 99999;
background: #1e1e2e;
border: 1px solid #444;
border-radius: 12px;
padding: 12px 20px;
display: flex;
align-items: center;
gap: 12px;
box-shadow: 0 8px 32px rgba(0,0,0,0.5);
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
font-size: 14px;
color: #cdd6f4;
}
#${TOOLBAR_ID} button {
padding: 6px 14px;
border: 1px solid #555;
border-radius: 6px;
background: #313244;
color: #cdd6f4;
cursor: pointer;
font-size: 13px;
white-space: nowrap;
}
#${TOOLBAR_ID} button:hover {
background: #45475a;
}
#${TOOLBAR_ID} button.primary {
background: #89b4fa;
color: #1e1e2e;
border-color: #89b4fa;
font-weight: 600;
}
#${TOOLBAR_ID} button.primary:hover {
background: #74c7ec;
}
#${TOOLBAR_ID} button.primary:disabled {
opacity: 0.5;
cursor: not-allowed;
}
#${TOOLBAR_ID} button.danger {
background: #f38ba8;
color: #1e1e2e;
border-color: #f38ba8;
font-weight: 600;
}
#${TOOLBAR_ID} button.danger:hover {
background: #eba0ac;
}
.bulk-classify-dropdown {
position: relative;
display: inline-block;
}
.bulk-classify-dropdown-btn {
padding: 6px 14px;
border: 1px solid #555;
border-radius: 6px;
background: #313244;
color: #cdd6f4;
cursor: pointer;
font-size: 13px;
white-space: nowrap;
min-width: 140px;
text-align: left;
}
.bulk-classify-dropdown-btn::after {
content: " ▾";
float: right;
margin-left: 8px;
}
.bulk-classify-dropdown-menu {
display: none;
position: absolute;
bottom: 100%;
left: 0;
margin-bottom: 4px;
background: #313244;
border: 1px solid #555;
border-radius: 6px;
max-height: 250px;
overflow-y: auto;
min-width: 180px;
box-shadow: 0 -4px 16px rgba(0,0,0,0.4);
z-index: 100000;
}
.bulk-classify-dropdown-menu.open {
display: block;
}
.bulk-classify-dropdown-item {
padding: 8px 14px;
cursor: pointer;
font-size: 13px;
color: #cdd6f4;
white-space: nowrap;
}
.bulk-classify-dropdown-item:hover {
background: #45475a;
}
.bulk-classify-dropdown-item.active {
background: #89b4fa;
color: #1e1e2e;
}
#${TOOLBAR_ID} .count {
font-weight: 600;
min-width: 30px;
text-align: center;
}
#${TOOLBAR_ID} .separator {
width: 1px;
height: 24px;
background: #555;
}
#${TOOLBAR_ID} .progress {
font-size: 12px;
color: #a6adc8;
}
.bulk-classify-checkbox {
position: absolute;
top: 6px;
left: 6px;
z-index: 9999;
width: 22px;
height: 22px;
cursor: pointer;
accent-color: #89b4fa;
pointer-events: auto;
}
.bulk-classify-selected {
outline: 3px solid #89b4fa !important;
outline-offset: -3px;
}
.bulk-classify-overlay {
position: fixed;
inset: 0;
z-index: 99998;
background: rgba(0,0,0,0.6);
display: flex;
align-items: center;
justify-content: center;
}
.bulk-classify-dialog {
background: #1e1e2e;
border: 1px solid #444;
border-radius: 12px;
padding: 24px;
min-width: 350px;
color: #cdd6f4;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
.bulk-classify-dialog h3 {
margin: 0 0 16px;
font-size: 16px;
}
.bulk-classify-dialog .progress-bar {
width: 100%;
height: 8px;
background: #313244;
border-radius: 4px;
overflow: hidden;
margin: 12px 0;
}
.bulk-classify-dialog .progress-fill {
height: 100%;
background: #89b4fa;
transition: width 0.2s;
}
.bulk-classify-dialog .status {
font-size: 13px;
color: #a6adc8;
}
`;
document.head.appendChild(style);
// --- Helper: find all training image cards ---
// Returns one { element, filename, img, groupFiles } record per distinct
// training filename found among the page's <img> elements.
function getImageCards() {
  // Training images use src="/clips/{modelName}/train/{filename}"
  // Filenames are like: 1770573871.602803-in4y00-1770573889.027752-none-1.0.webp
  const trainSrc = /\/clips\/[^/]+\/train\/([^/?#]+)/;
  const visited = new Set();
  const found = [];
  document.querySelectorAll("img").forEach((img) => {
    const hit = (img.getAttribute("src") || "").match(trainSrc);
    if (!hit) return;
    const filename = hit[1];
    // Only the first <img> for a given filename yields a card.
    if (visited.has(filename)) return;
    visited.add(filename);
    // Walk up to find the card container (Frigate uses aspect-square divs)
    const element =
      img.closest("[class*='aspect-']") ||
      img.closest("[class*='card']") ||
      img.parentElement?.parentElement ||
      img.parentElement;
    // Resolve the full group of filenames for this card
    const groupFiles = getGroupFiles(filename);
    found.push({ element, filename, img, groupFiles });
  });
  return found;
}
// --- Debug: log what images we found ---
const debugImgs = document.querySelectorAll("img");
const debugSrcs = Array.from(debugImgs)
.map((i) => i.getAttribute("src"))
.filter(Boolean);
console.log(
`[bulk-classify] Found ${debugSrcs.length} <img> elements. Sample srcs:`,
debugSrcs.slice(0, 5),
);
const initialCards = getImageCards();
console.log(
`[bulk-classify] Matched ${initialCards.length} training image cards.`,
);
// --- Add checkboxes to all cards ---
// Safe to call repeatedly: cards that already carry a checkbox are skipped.
// When a card has lost its checkbox (e.g. its children were re-rendered) a
// fresh one is injected and its visual state is re-synced with `selected`.
function injectCheckboxes() {
  for (const { element, filename, groupFiles } of getImageCards()) {
    if (!element || element.querySelector(".bulk-classify-checkbox")) continue;
    // Ensure relative positioning for absolute checkbox
    element.style.position = "relative";
    const cb = document.createElement("input");
    cb.type = "checkbox";
    cb.className = "bulk-classify-checkbox";
    cb.dataset.filename = filename;
    cb.checked = selected.has(filename);
    // Keep the outline in step with the restored checkbox state; otherwise a
    // re-injected card shows a ticked box without (or a stale) highlight.
    element.classList.toggle("bulk-classify-selected", cb.checked);
    // Drop a badge left over from an earlier injection so counts never stack.
    element.querySelector(".bulk-classify-badge")?.remove();
    // Show group count badge next to checkbox if group has >1 image
    let badge = null;
    if (groupFiles.length > 1) {
      badge = document.createElement("span");
      badge.className = "bulk-classify-badge";
      badge.textContent = groupFiles.length;
      badge.style.cssText =
        "position:absolute;top:6px;left:32px;z-index:9999;background:#89b4fa;color:#1e1e2e;" +
        "font-size:11px;font-weight:700;padding:1px 5px;border-radius:8px;pointer-events:none;";
    }
    cb.addEventListener("change", (e) => {
      e.stopPropagation();
      if (cb.checked) {
        // Select ALL files in this event group
        for (const f of groupFiles) selected.add(f);
        element.classList.add("bulk-classify-selected");
      } else {
        for (const f of groupFiles) selected.delete(f);
        element.classList.remove("bulk-classify-selected");
      }
      updateCount();
    });
    // Also allow clicking the image to toggle. The listener is bound once per
    // element and looks up the *current* checkbox, so a re-injection on the
    // same element neither stacks listeners nor toggles a detached checkbox.
    if (element.dataset.bulkClassifyBound !== "1") {
      element.dataset.bulkClassifyBound = "1";
      element.addEventListener("click", (e) => {
        const current = element.querySelector(".bulk-classify-checkbox");
        // Don't intercept if clicking the checkbox itself or a button
        if (
          !current ||
          e.target === current ||
          e.target.closest("button") ||
          e.target.closest("a")
        )
          return;
        e.preventDefault();
        e.stopPropagation();
        current.checked = !current.checked;
        current.dispatchEvent(new Event("change"));
      });
    }
    element.prepend(cb);
    if (badge) element.appendChild(badge);
  }
}
// --- Toolbar ---
const toolbar = document.createElement("div");
toolbar.id = TOOLBAR_ID;
const countLabel = document.createElement("span");
countLabel.className = "count";
countLabel.textContent = "0";
const countText = document.createElement("span");
countText.textContent = "selected";
const sep1 = document.createElement("div");
sep1.className = "separator";
const selectAllBtn = document.createElement("button");
selectAllBtn.textContent = "Select All";
selectAllBtn.addEventListener("click", () => {
const cards = getImageCards();
for (const { element, groupFiles } of cards) {
for (const f of groupFiles) selected.add(f);
element.classList.add("bulk-classify-selected");
const cb = element.querySelector(".bulk-classify-checkbox");
if (cb) cb.checked = true;
}
updateCount();
});
const deselectBtn = document.createElement("button");
deselectBtn.textContent = "Deselect All";
deselectBtn.addEventListener("click", () => {
const cards = getImageCards();
for (const { element, groupFiles } of cards) {
for (const f of groupFiles) selected.delete(f);
element.classList.remove("bulk-classify-selected");
const cb = element.querySelector(".bulk-classify-checkbox");
if (cb) cb.checked = false;
}
updateCount();
});
const sep2 = document.createElement("div");
sep2.className = "separator";
// --- Custom dropdown (replaces native <select> which React intercepts) ---
let selectedCategory = "";
const dropdown = document.createElement("div");
dropdown.className = "bulk-classify-dropdown";
const dropdownBtn = document.createElement("div");
dropdownBtn.className = "bulk-classify-dropdown-btn";
dropdownBtn.textContent = "-- pick category --";
const dropdownMenu = document.createElement("div");
dropdownMenu.className = "bulk-classify-dropdown-menu";
// Rebuild the dropdown entries from `categories`, marking the entry that
// matches `selectedCategory` as active.
function buildMenuItems() {
  dropdownMenu.innerHTML = "";
  categories.forEach((cat) => {
    const entry = document.createElement("div");
    entry.className = "bulk-classify-dropdown-item";
    entry.classList.toggle("active", cat === selectedCategory);
    entry.textContent = cat;
    // mousedown (not click) with propagation stopped, like the dropdown button.
    const choose = (e) => {
      e.preventDefault();
      e.stopPropagation();
      selectedCategory = cat;
      dropdownBtn.textContent = cat;
      dropdownMenu.classList.remove("open");
      buildMenuItems(); // refresh active state
    };
    entry.addEventListener("mousedown", choose);
    dropdownMenu.appendChild(entry);
  });
}
buildMenuItems();
dropdownBtn.addEventListener("mousedown", (e) => {
e.preventDefault();
e.stopPropagation();
dropdownMenu.classList.toggle("open");
});
// Close dropdown when clicking outside
document.addEventListener("mousedown", (e) => {
if (!dropdown.contains(e.target)) {
dropdownMenu.classList.remove("open");
}
});
dropdown.appendChild(dropdownBtn);
dropdown.appendChild(dropdownMenu);
// Allow typing a new category
const newCatInput = document.createElement("input");
newCatInput.type = "text";
newCatInput.placeholder = "or type new...";
newCatInput.style.cssText =
"padding:6px 10px;border:1px solid #555;border-radius:6px;background:#313244;color:#cdd6f4;font-size:13px;width:120px;";
const categorizeBtn = document.createElement("button");
categorizeBtn.className = "primary";
categorizeBtn.textContent = "Categorize Selected";
const deleteBtn = document.createElement("button");
deleteBtn.className = "danger";
deleteBtn.textContent = "Delete Selected";
toolbar.append(
countLabel,
countText,
sep1,
selectAllBtn,
deselectBtn,
sep2,
dropdown,
newCatInput,
categorizeBtn,
deleteBtn,
);
// Prevent events from bubbling out of toolbar to React's root handler
for (const evt of ["click", "mousedown", "mouseup", "pointerdown", "pointerup", "focus", "blur"]) {
toolbar.addEventListener(evt, (e) => e.stopPropagation());
}
document.body.appendChild(toolbar);
// Mirror the current selection size into the toolbar: the counter label, and
// the Categorize button, which is disabled while nothing is selected.
function updateCount() {
  const howMany = selected.size;
  countLabel.textContent = howMany;
  categorizeBtn.disabled = howMany === 0;
}
// --- Progress dialog ---
// Shows a modal overlay with a title, an "x / total" status line, a progress
// bar and an error list.
//   title - heading text; inserted as plain text, never parsed as HTML
//   total - number of steps the bar represents
// Returns { update(current, errorMsg), close() }.
function showProgress(title, total) {
  const overlay = document.createElement("div");
  overlay.className = "bulk-classify-overlay";
  const dialog = document.createElement("div");
  dialog.className = "bulk-classify-dialog";
  // Only static markup goes through innerHTML. The title can contain a
  // user-typed category name, so it is assigned via textContent below.
  // white-space:pre-wrap lets the "\n"-separated error lines render one per row.
  dialog.innerHTML = `
    <h3></h3>
    <div class="status"></div>
    <div class="progress-bar"><div class="progress-fill" style="width:0%"></div></div>
    <div class="errors" style="color:#f38ba8;font-size:12px;margin-top:8px;white-space:pre-wrap"></div>
  `;
  const statusEl = dialog.querySelector(".status");
  const fillEl = dialog.querySelector(".progress-fill");
  const errorsEl = dialog.querySelector(".errors");
  dialog.querySelector("h3").textContent = title;
  statusEl.textContent = `0 / ${total}`;
  overlay.appendChild(dialog);
  document.body.appendChild(overlay);
  return {
    update(current, errorMsg) {
      // Guard against total === 0, which would otherwise yield "NaN%".
      const pct = total > 0 ? Math.round((current / total) * 100) : 0;
      statusEl.textContent = `${current} / ${total}`;
      fillEl.style.width = pct + "%";
      if (errorMsg) {
        errorsEl.textContent += errorMsg + "\n";
      }
    },
    close() {
      overlay.remove();
    },
  };
}
// --- Categorize handler ---
// POST /api/classification/{modelName}/dataset/categorize
// body: { category: "...", training_file: "..." }
// One request per selected file, sent sequentially. A typed category takes
// precedence over the dropdown choice. Failures are shown in the progress
// dialog AND logged to the console, because the dialog is closed before the
// final alert tells the user to check the console.
categorizeBtn.addEventListener("click", async () => {
  const category = newCatInput.value.trim() || selectedCategory;
  if (!category) {
    alert("Select a category or type a new one.");
    return;
  }
  if (selected.size === 0) {
    alert("No images selected.");
    return;
  }
  const files = Array.from(selected);
  if (
    !confirm(
      `Categorize ${files.length} image(s) as "${category}"?`,
    )
  )
    return;
  const progress = showProgress(
    `Categorizing as "${category}"`,
    files.length,
  );
  const endpoint = `${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset/categorize`;
  let errors = 0;
  // Record one failure: count it, show it in the dialog, keep it in the console.
  const fail = (step, message) => {
    errors++;
    console.error(`[bulk-classify] ${message}`);
    progress.update(step, message);
  };
  for (let i = 0; i < files.length; i++) {
    try {
      const resp = await fetch(endpoint, {
        method: "POST",
        headers: API_HEADERS,
        body: JSON.stringify({
          category: category,
          training_file: files[i],
        }),
      });
      if (!resp.ok) {
        const text = await resp.text();
        fail(i + 1, `Failed: ${files[i]} - ${text}`);
      } else {
        progress.update(i + 1);
      }
    } catch (e) {
      fail(i + 1, `Error: ${files[i]} - ${e.message}`);
    }
  }
  // Short delay so the final progress state is visible before the dialog closes.
  setTimeout(() => {
    progress.close();
    if (errors === 0) {
      selected.clear();
      updateCount();
      alert(
        `Done! ${files.length} image(s) categorized as "${category}".\nRefreshing the training view...`,
      );
      window.location.reload();
    } else {
      // Selection is kept so the user can inspect and retry.
      alert(
        `Completed with ${errors} error(s). Check console for details.`,
      );
    }
  }, 500);
});
// --- Delete handler ---
// POST /api/classification/{modelName}/train/delete
// body: { ids: ["filename1", "filename2", ...] }
// All selected files are sent in a single request. Success is only reported
// (and the page only reloaded) when that request actually succeeded; on
// failure the selection is kept so the user can retry.
deleteBtn.addEventListener("click", async () => {
  if (selected.size === 0) {
    alert("No images selected.");
    return;
  }
  const files = Array.from(selected);
  if (
    !confirm(
      `DELETE ${files.length} training image(s)? This cannot be undone.`,
    )
  )
    return;
  const progress = showProgress("Deleting training images", 1);
  // Stays null on success; otherwise holds the message shown and logged.
  let failure = null;
  try {
    const resp = await fetch(
      `${API_BASE}/classification/${encodeURIComponent(modelName)}/train/delete`,
      {
        method: "POST",
        headers: API_HEADERS,
        body: JSON.stringify({ ids: files }),
      },
    );
    if (!resp.ok) {
      const text = await resp.text();
      failure = `Failed: ${text}`;
    }
  } catch (e) {
    failure = `Error: ${e.message}`;
  }
  if (failure) {
    console.error(`[bulk-classify] Delete request failed. ${failure}`);
    progress.update(1, failure);
  } else {
    progress.update(1);
  }
  // Short delay so the final progress state is visible before the dialog closes.
  setTimeout(() => {
    progress.close();
    if (failure) {
      alert("Delete failed. Check console for details.");
      return;
    }
    selected.clear();
    updateCount();
    alert(`Deleted ${files.length} training image(s).\nRefreshing...`);
    window.location.reload();
  }, 500);
});
// --- Initial injection + MutationObserver for dynamic loading ---
injectCheckboxes();
const observer = new MutationObserver(() => {
injectCheckboxes();
});
observer.observe(document.body, { childList: true, subtree: true });
updateCount();
console.log(
`Bulk classifier active for model "${modelName}". ${categories.length} categories found: [${categories.join(", ")}]`,
);
})();

305
scripts/frigate-inspect.mjs Normal file
View file

@ -0,0 +1,305 @@
#!/usr/bin/env node
// Frigate Classification Page Inspector
// Phase 1: Fetch API data via HTTP to understand the data model
// Phase 2: Fetch the classification page HTML and parse its DOM structure
// No browser needed — uses plain HTTP requests.
import { spawn } from "child_process";
import http from "http";
const KUBE_CONFIG = `${process.cwd()}/config`;
const LOCAL_PORT = 15000;
const FRIGATE_NS = "frigate";
const FRIGATE_SVC = "svc/frigate";
const FRIGATE_PORT = 80;
const BASE_URL = `http://localhost:${LOCAL_PORT}`;
/**
 * Spawn `kubectl port-forward` to the Frigate service and wait until kubectl
 * prints "Forwarding from" on stdout.
 *
 * @returns {Promise<import("child_process").ChildProcess>} the running
 *   kubectl process; the caller is responsible for killing it.
 * @throws {Error} if kubectl cannot be spawned, exits before it is ready, or
 *   is not ready within 15 seconds. The child is terminated before the error
 *   propagates, since the caller never receives a handle in that case.
 */
async function startPortForward() {
  console.log(
    `[port-forward] Starting: kubectl port-forward ${FRIGATE_SVC} ${LOCAL_PORT}:${FRIGATE_PORT} -n ${FRIGATE_NS}`,
  );
  const proc = spawn(
    "kubectl",
    [
      "--kubeconfig",
      KUBE_CONFIG,
      "port-forward",
      FRIGATE_SVC,
      `${LOCAL_PORT}:${FRIGATE_PORT}`,
      "-n",
      FRIGATE_NS,
    ],
    { stdio: ["ignore", "pipe", "pipe"] },
  );
  try {
    await new Promise((resolve, reject) => {
      const timer = setTimeout(
        () => reject(new Error("Port-forward timed out")),
        15000,
      );
      proc.stdout.on("data", (data) => {
        if (data.toString().includes("Forwarding from")) {
          clearTimeout(timer);
          resolve();
        }
      });
      proc.stderr.on("data", (data) => {
        console.error(`[port-forward stderr] ${data.toString().trim()}`);
      });
      proc.on("error", (err) => {
        clearTimeout(timer);
        reject(err);
      });
      // Any exit before readiness is a failure, including code 0 and death by
      // signal (code === null); fail fast instead of waiting for the timeout.
      // Once the promise has resolved this reject() is a no-op.
      proc.on("exit", (code, signal) => {
        clearTimeout(timer);
        reject(
          new Error(
            code !== null
              ? `port-forward exited with code ${code}`
              : `port-forward terminated by signal ${signal}`,
          ),
        );
      });
    });
  } catch (err) {
    // Do not leave an orphaned kubectl behind.
    proc.kill("SIGTERM");
    throw err;
  }
  console.log("[port-forward] Ready");
  return proc;
}
/**
 * GET `${BASE_URL}${path}` and buffer the whole response.
 *
 * @param {string} path - request path, starting with "/"
 * @returns {Promise<{status: number, body: string, headers: object}>}
 *   resolves for any HTTP status, with the body decoded as UTF-8
 * @throws rejects on request errors and on response stream errors
 */
function httpGet(path) {
  return new Promise((resolve, reject) => {
    const url = `${BASE_URL}${path}`;
    http
      .get(url, (res) => {
        // Collect raw chunks and decode once at the end: decoding chunk by
        // chunk can split a multi-byte UTF-8 sequence across two chunks.
        const chunks = [];
        res.on("data", (chunk) => chunks.push(chunk));
        // A failing response stream would otherwise leave the promise pending.
        res.on("error", reject);
        res.on("end", () =>
          resolve({
            status: res.statusCode,
            body: Buffer.concat(chunks).toString("utf8"),
            headers: res.headers,
          }),
        );
      })
      .on("error", reject);
  });
}
/**
 * Entry point: open a port-forward to Frigate, then dump
 *   1. the classification config and, per custom model, its dataset and train
 *      listings plus a few candidate thumbnail URLs,
 *   2. the /classification page HTML and matches for classification-related
 *      strings in its first JS bundle,
 *   3. the Frigate version.
 * Everything is written to the console. A fatal error is logged and sets a
 * non-zero exit code; the port-forward is always torn down.
 */
async function main() {
  let portForwardProc = null;
  // Print a section header framed by two 80-character rules.
  const banner = (title) => {
    console.log("\n" + "=".repeat(80));
    console.log(title);
    console.log("=".repeat(80));
  };
  // For each term, print the first occurrence in `js` with `radius`
  // characters of context on both sides.
  const reportFirstMatches = (js, terms, radius) => {
    for (const term of terms) {
      const idx = js.indexOf(term);
      if (idx !== -1) {
        const context = js.slice(Math.max(0, idx - radius), idx + radius);
        console.log(`\n[JS] Found "${term}" at offset ${idx}:`);
        console.log(` ...${context}...`);
      }
    }
  };
  try {
    portForwardProc = await startPortForward();
    // ================================================================
    // API INSPECTION
    // ================================================================
    banner("API INSPECTION");
    // Get config to find model names
    const configResp = await httpGet("/api/config");
    let modelNames = [];
    if (configResp.status === 200) {
      try {
        const config = JSON.parse(configResp.body);
        // Custom classification models are under config.classification.custom
        const classificationModels = config.classification?.custom || {};
        modelNames = Object.keys(classificationModels);
        console.log(
          `\n[API] /api/config - Classification models: ${JSON.stringify(modelNames)}`,
        );
        console.log(
          `[API] Classification config:\n${JSON.stringify(config.classification, null, 2)}`,
        );
      } catch (e) {
        console.log(`[API] /api/config - Failed to parse: ${e.message}`);
        console.log(
          `[API] Raw (first 500): ${configResp.body.slice(0, 500)}`,
        );
      }
    } else {
      console.log(`[API] /api/config - HTTP ${configResp.status}`);
    }
    for (const model of modelNames) {
      console.log(`\n--- Model: ${model} ---`);
      const encodedModel = encodeURIComponent(model);
      // Dataset endpoint
      const datasetResp = await httpGet(
        `/api/classification/${encodedModel}/dataset`,
      );
      if (datasetResp.status === 200) {
        try {
          const dataset = JSON.parse(datasetResp.body);
          // Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
          const cats = dataset.categories || dataset;
          const categories = Object.keys(cats);
          console.log(`[API] /api/classification/${model}/dataset`);
          console.log(` Categories: ${JSON.stringify(categories)}`);
          for (const cat of categories) {
            const items = Array.isArray(cats[cat]) ? cats[cat] : [];
            console.log(
              ` "${cat}": ${items.length} items, first 3: ${JSON.stringify(items.slice(0, 3))}`,
            );
          }
          if (dataset.training_metadata) {
            console.log(` Training metadata: ${JSON.stringify(dataset.training_metadata, null, 2)}`);
          }
        } catch (e) {
          console.log(` Failed to parse dataset: ${e.message}`);
        }
      } else {
        console.log(
          `[API] /api/classification/${model}/dataset - HTTP ${datasetResp.status}: ${datasetResp.body.slice(0, 200)}`,
        );
      }
      // Train endpoint. The payload is parsed once and reused for the
      // thumbnail probe below.
      const trainResp = await httpGet(
        `/api/classification/${encodedModel}/train`,
      );
      let train = null;
      if (trainResp.status === 200) {
        try {
          train = JSON.parse(trainResp.body);
          const entries = Array.isArray(train) ? train : Object.entries(train);
          console.log(`[API] /api/classification/${model}/train`);
          console.log(
            ` Type: ${Array.isArray(train) ? "array" : typeof train}, length/keys: ${Array.isArray(train) ? train.length : Object.keys(train).length}`,
          );
          console.log(
            ` First 5 entries:\n${JSON.stringify(entries.slice(0, 5), null, 2)}`,
          );
        } catch (e) {
          console.log(` Failed to parse train: ${e.message}`);
        }
      } else {
        console.log(
          `[API] /api/classification/${model}/train - HTTP ${trainResp.status}: ${trainResp.body.slice(0, 200)}`,
        );
      }
      // Try to get a thumbnail URL to understand the image src pattern
      const firstFile = Array.isArray(train) ? train[0] : null;
      if (typeof firstFile === "string" && firstFile) {
        // Encode the filename like the model name so the URL stays valid.
        const encodedFile = encodeURIComponent(firstFile);
        // Try various thumbnail URL patterns
        const patterns = [
          `/api/classification/${encodedModel}/train/${encodedFile}/thumbnail.jpg`,
          `/api/classification/${encodedModel}/train/${encodedFile}`,
          `/clips/${encodedModel}/train/${encodedFile}`,
        ];
        for (const p of patterns) {
          // A failing probe is reported and the remaining patterns are still
          // tried, rather than being swallowed silently.
          try {
            const resp = await httpGet(p);
            console.log(
              ` Thumbnail URL test: ${p} -> HTTP ${resp.status} (content-type: ${resp.headers["content-type"]}, size: ${resp.body.length})`,
            );
          } catch (e) {
            console.log(` Thumbnail URL test: ${p} -> request failed: ${e.message}`);
          }
        }
      }
    }
    // ================================================================
    // HTML/DOM INSPECTION
    // ================================================================
    banner("HTML / DOM INSPECTION");
    // Fetch the main classification page HTML
    const classifPageResp = await httpGet("/classification");
    console.log(
      `\n[HTML] /classification - HTTP ${classifPageResp.status} (${classifPageResp.body.length} bytes)`,
    );
    // This is likely a React SPA, so the HTML will be minimal. Let's check.
    const html = classifPageResp.body;
    console.log(`[HTML] First 2000 chars:\n${html.slice(0, 2000)}`);
    // Check for any JS bundle references (to find source maps or component names)
    const scriptMatches = html.match(/<script[^>]*src="([^"]+)"[^>]*>/g) || [];
    console.log(`\n[HTML] Script tags: ${scriptMatches.length}`);
    for (const s of scriptMatches) {
      console.log(` ${s}`);
    }
    // Fetch the main JS bundle to look for classification component code
    const jsMatch = html.match(/src="(\/assets\/[^"]+\.js)"/);
    if (jsMatch) {
      console.log(`\n[JS] Fetching main bundle: ${jsMatch[1]}`);
      const jsResp = await httpGet(jsMatch[1]);
      if (jsResp.status === 200) {
        const js = jsResp.body;
        console.log(`[JS] Bundle size: ${js.length} bytes`);
        // Search for classification-related code patterns
        reportFirstMatches(
          js,
          [
            "classify image as",
            "Classify image as",
            "categorize",
            "/classification/",
            "dataset/categorize",
            "training_file",
            "train/delete",
            "ModelTraining",
            "classification",
          ],
          200,
        );
        // Look for the dropdown/select implementation
        reportFirstMatches(
          js,
          [
            "combobox",
            "listbox",
            "SelectTrigger",
            "SelectContent",
            "SelectItem",
            "Select>",
            "DropdownMenu",
          ],
          150,
        );
      }
    }
    // Also check if there are multiple JS chunks
    const allJsMatches =
      html.match(/src="(\/assets\/[^"]+\.js)"/g) || [];
    console.log(`\n[JS] All JS assets: ${allJsMatches.length}`);
    for (const m of allJsMatches) {
      const path = m.match(/src="([^"]+)"/)?.[1];
      if (path) console.log(` ${path}`);
    }
    // Report the running Frigate version
    banner("FRIGATE VERSION");
    const versionResp = await httpGet("/api/version");
    if (versionResp.status === 200) {
      console.log(`[API] Frigate version: ${versionResp.body}`);
    }
    banner("INSPECTION COMPLETE");
  } catch (err) {
    console.error(`\n[ERROR] ${err.message}`);
    console.error(err.stack);
    // Signal failure to the calling shell instead of exiting 0.
    process.exitCode = 1;
  } finally {
    if (portForwardProc) {
      console.log("\n[cleanup] Killing port-forward...");
      portForwardProc.kill("SIGTERM");
    }
  }
}
main().catch(console.error);

View file

@ -0,0 +1,511 @@
#!/usr/bin/env python3
"""Generate Terragrunt service stack files for all app-level services."""
import os
import textwrap
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Each service: (module_name, source_dir, [(arg_name, var_expr), ...])
# var_expr is what goes on the right side of = in the module call.
# If var_expr starts with "var.", it's a variable passthrough and we declare the variable.
# Any other expression is inlined verbatim.
# Special: "LOCAL_TIER:<tier>" is emitted as local.tiers.<tier> and pulls in the tiers locals block.
SERVICES = [
("blog", "blog", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("descheduler", "descheduler", []),
("f1-stream", "f1-stream", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("turn_secret", "var.coturn_turn_secret"),
("public_ip", "var.public_ip"),
]),
("coturn", "coturn", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
("turn_secret", "var.coturn_turn_secret"),
("public_ip", "var.public_ip"),
]),
("hackmd", "hackmd", [
("hackmd_db_password", "var.hackmd_db_password"),
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
]),
("kms", "kms", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("k8s-dashboard", "k8s-dashboard", [
("tier", "LOCAL_TIER:cluster"),
("tls_secret_name", "var.tls_secret_name"),
("client_certificate_secret_name", "var.client_certificate_secret_name"),
]),
("privatebin", "privatebin", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
]),
("reloader", "reloader", [
("tier", "LOCAL_TIER:aux"),
]),
("shadowsocks", "shadowsocks", [
("password", "var.shadowsocks_password"),
("tier", "LOCAL_TIER:edge"),
]),
("city-guesser", "city-guesser", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("echo", "echo", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
]),
("url", "url-shortener", [
("tls_secret_name", "var.tls_secret_name"),
("geolite_license_key", "var.url_shortener_geolite_license_key"),
("api_key", "var.url_shortener_api_key"),
("mysql_password", "var.url_shortener_mysql_password"),
("tier", "LOCAL_TIER:aux"),
]),
("webhook_handler", "webhook_handler", [
("tls_secret_name", "var.tls_secret_name"),
("webhook_secret", "var.webhook_handler_secret"),
("fb_verify_token", "var.webhook_handler_fb_verify_token"),
("fb_page_token", "var.webhook_handler_fb_page_token"),
("fb_app_secret", "var.webhook_handler_fb_app_secret"),
("git_user", "var.webhook_handler_git_user"),
("git_token", "var.webhook_handler_git_token"),
("ssh_key", "var.webhook_handler_ssh_key"),
("tier", "LOCAL_TIER:aux"),
]),
("excalidraw", "excalidraw", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("travel_blog", "travel_blog", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("dashy", "dashy", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("send", "send", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("ytdlp", "youtube_dl", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("openrouter_api_key", "var.openrouter_api_key"),
("slack_bot_token", "var.slack_bot_token"),
("slack_channel", "var.slack_channel"),
]),
("immich", "immich", [
("tls_secret_name", "var.tls_secret_name"),
("postgresql_password", "var.immich_postgresql_password"),
("frame_api_key", "var.immich_frame_api_key"),
("homepage_token", 'var.homepage_credentials["immich"]["token"]'),
("tier", "LOCAL_TIER:gpu"),
]),
("resume", "resume", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("database_url", "var.resume_database_url"),
("auth_secret", "var.resume_auth_secret"),
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
]),
("frigate", "frigate", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:gpu"),
]),
("paperless-ngx", "paperless-ngx", [
("tls_secret_name", "var.tls_secret_name"),
("db_password", "var.paperless_db_password"),
("homepage_username", 'var.homepage_credentials["paperless-ngx"]["username"]'),
("homepage_password", 'var.homepage_credentials["paperless-ngx"]["password"]'),
("tier", "LOCAL_TIER:edge"),
]),
("jsoncrack", "jsoncrack", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("servarr", "servarr", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("aiostreams_database_connection_string", "var.aiostreams_database_connection_string"),
]),
("ollama", "ollama", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:gpu"),
("ollama_api_credentials", "var.ollama_api_credentials"),
]),
("ntfy", "ntfy", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("cyberchef", "cyberchef", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("diun", "diun", [
("tls_secret_name", "var.tls_secret_name"),
("diun_nfty_token", "var.diun_nfty_token"),
("diun_slack_url", "var.diun_slack_url"),
("tier", "LOCAL_TIER:aux"),
]),
("meshcentral", "meshcentral", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("netbox", "netbox", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("nextcloud", "nextcloud", [
("tls_secret_name", "var.tls_secret_name"),
("db_password", "var.nextcloud_db_password"),
("tier", "LOCAL_TIER:edge"),
]),
("homepage", "homepage", [
("tier", "LOCAL_TIER:aux"),
("tls_secret_name", "var.tls_secret_name"),
]),
("matrix", "matrix", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("linkwarden", "linkwarden", [
("tls_secret_name", "var.tls_secret_name"),
("postgresql_password", "var.linkwarden_postgresql_password"),
("authentik_client_id", "var.linkwarden_authentik_client_id"),
("authentik_client_secret", "var.linkwarden_authentik_client_secret"),
("tier", "LOCAL_TIER:aux"),
]),
("actualbudget", "actualbudget", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
("credentials", "var.actualbudget_credentials"),
]),
("owntracks", "owntracks", [
("tls_secret_name", "var.tls_secret_name"),
("owntracks_credentials", "var.owntracks_credentials"),
("tier", "LOCAL_TIER:aux"),
]),
("dawarich", "dawarich", [
("tls_secret_name", "var.tls_secret_name"),
("database_password", "var.dawarich_database_password"),
("geoapify_api_key", "var.geoapify_api_key"),
("tier", "LOCAL_TIER:edge"),
]),
("changedetection", "changedetection", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("tandoor", "tandoor", [
("tls_secret_name", "var.tls_secret_name"),
("tandoor_database_password", "var.tandoor_database_password"),
("tandoor_email_password", "var.tandoor_email_password"),
("tier", "LOCAL_TIER:aux"),
]),
("n8n", "n8n", [
("tls_secret_name", "var.tls_secret_name"),
("postgresql_password", "var.n8n_postgresql_password"),
("tier", "LOCAL_TIER:aux"),
]),
("real-estate-crawler", "real-estate-crawler", [
("tls_secret_name", "var.tls_secret_name"),
("db_password", "var.realestate_crawler_db_password"),
("notification_settings", "var.realestate_crawler_notification_settings"),
("tier", "LOCAL_TIER:aux"),
]),
("osm_routing", "osm-routing", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("tor-proxy", "tor-proxy", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("onlyoffice", "onlyoffice", [
("tls_secret_name", "var.tls_secret_name"),
("db_password", "var.onlyoffice_db_password"),
("jwt_token", "var.onlyoffice_jwt_token"),
("tier", "LOCAL_TIER:edge"),
]),
("forgejo", "forgejo", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:edge"),
]),
("freshrss", "freshrss", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("navidrome", "navidrome", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("networking-toolbox", "networking-toolbox", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("tuya-bridge", "tuya-bridge", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:cluster"),
("tiny_tuya_api_key", "var.tiny_tuya_api_key"),
("tiny_tuya_api_secret", "var.tiny_tuya_api_secret"),
("tiny_tuya_service_secret", "var.tiny_tuya_service_secret"),
("slack_url", "var.tiny_tuya_slack_url"),
]),
("stirling-pdf", "stirling-pdf", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("isponsorblocktv", "isponsorblocktv", [
("tier", "LOCAL_TIER:edge"),
]),
("ebook2audiobook", "ebook2audiobook", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:gpu"),
]),
("rybbit", "rybbit", [
("tls_secret_name", "var.tls_secret_name"),
("clickhouse_password", "var.clickhouse_password"),
("postgres_password", "var.clickhouse_postgres_password"),
("tier", "LOCAL_TIER:aux"),
]),
("wealthfolio", "wealthfolio", [
("tls_secret_name", "var.tls_secret_name"),
("wealthfolio_password_hash", "var.wealthfolio_password_hash"),
("tier", "LOCAL_TIER:aux"),
]),
("speedtest", "speedtest", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("db_password", "var.speedtest_db_password"),
]),
("freedify", "freedify", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
("additional_credentials", "var.freedify_credentials"),
]),
("affine", "affine", [
("tls_secret_name", "var.tls_secret_name"),
("postgresql_password", "var.affine_postgresql_password"),
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
("tier", "LOCAL_TIER:aux"),
]),
("plotting-book", "plotting-book", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:aux"),
]),
("health", "health", [
("tls_secret_name", "var.tls_secret_name"),
("postgresql_password", "var.health_postgresql_password"),
("secret_key", "var.health_secret_key"),
("tier", "LOCAL_TIER:aux"),
]),
("whisper", "whisper", [
("tls_secret_name", "var.tls_secret_name"),
("tier", "LOCAL_TIER:gpu"),
]),
("grampsweb", "grampsweb", [
("tls_secret_name", "var.tls_secret_name"),
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
("tier", "LOCAL_TIER:aux"),
]),
("openclaw", "openclaw", [
("tls_secret_name", "var.tls_secret_name"),
("ssh_key", "var.openclaw_ssh_key"),
("skill_secrets", "var.openclaw_skill_secrets"),
("gemini_api_key", "var.gemini_api_key"),
("llama_api_key", "var.llama_api_key"),
("brave_api_key", "var.brave_api_key"),
("modal_api_key", "var.modal_api_key"),
("tier", "LOCAL_TIER:aux"),
]),
]
# Variable type overrides (var_name -> type declaration)
VAR_TYPES = {
"tls_secret_name": "string",
"client_certificate_secret_name": "string",
"public_ip": "string",
"hackmd_db_password": "string",
"shadowsocks_password": "string",
"openrouter_api_key": "string",
"slack_bot_token": "string",
"slack_channel": "string",
"ollama_api_credentials": "string",
"clickhouse_password": "string",
"clickhouse_postgres_password": "string",
"wealthfolio_password_hash": "string",
"speedtest_db_password": "string",
"affine_postgresql_password": "string",
"health_postgresql_password": "string",
"health_secret_key": "string",
"gemini_api_key": "string",
"llama_api_key": "string",
"brave_api_key": "string",
"modal_api_key": "string",
"coturn_turn_secret": "string",
"onlyoffice_db_password": "string",
"onlyoffice_jwt_token": "string",
"resume_database_url": "string",
"resume_auth_secret": "string",
"nextcloud_db_password": "string",
"paperless_db_password": "string",
"diun_nfty_token": "string",
"diun_slack_url": "string",
"dawarich_database_password": "string",
"geoapify_api_key": "string",
"tandoor_database_password": "string",
"tandoor_email_password": "string",
"n8n_postgresql_password": "string",
"realestate_crawler_db_password": "string",
"immich_postgresql_password": "string",
"immich_frame_api_key": "string",
"linkwarden_postgresql_password": "string",
"linkwarden_authentik_client_id": "string",
"linkwarden_authentik_client_secret": "string",
"aiostreams_database_connection_string": "string",
"tiny_tuya_api_key": "string",
"tiny_tuya_api_secret": "string",
"tiny_tuya_service_secret": "string",
"tiny_tuya_slack_url": "string",
"url_shortener_geolite_license_key": "string",
"url_shortener_api_key": "string",
"url_shortener_mysql_password": "string",
"webhook_handler_secret": "string",
"webhook_handler_fb_verify_token": "string",
"webhook_handler_fb_page_token": "string",
"webhook_handler_fb_app_secret": "string",
"webhook_handler_git_user": "string",
"webhook_handler_git_token": "string",
"webhook_handler_ssh_key": "string",
"openclaw_ssh_key": "string",
"openclaw_skill_secrets": "map(string)",
"actualbudget_credentials": "map(any)",
"freedify_credentials": "map(any)",
"realestate_crawler_notification_settings": "map(string)",
"homepage_credentials": "map(any)",
"mailserver_accounts": "map(any)",
"owntracks_credentials": "string",
}
TERRAGRUNT_HCL = """\
include "root" {
path = find_in_parent_folders()
}
dependency "platform" {
config_path = "../platform"
skip_outputs = true
}
"""
TIERS_BLOCK = """\
locals {
tiers = {
core = "0-core"
cluster = "1-cluster"
gpu = "2-gpu"
edge = "3-edge"
aux = "4-aux"
}
}
"""
def extract_var_name(expr):
    """Return the base Terraform variable name referenced by ``expr``.

    Handles ``var.xxx``, indexed access (``var.xxx["yyy"]["zzz"]``) and
    attribute access (``var.xxx.yyy``); all of these yield ``"xxx"``.

    Returns ``None`` when ``expr`` does not start with ``var.``.
    """
    if not expr.startswith("var."):
        return None
    # Drop the "var." prefix, then cut at the first index or attribute access
    # so only the declared variable name remains.
    name = expr[4:]
    for separator in ("[", "."):
        cut = name.find(separator)
        if cut != -1:
            name = name[:cut]
    return name
def gen_main_tf(mod_name, source_dir, args):
    """Render the main.tf text for one service stack.

    ``args`` is a sequence of ``(argument_name, expression)`` pairs. An
    expression of the form ``LOCAL_TIER:<tier>`` is rendered as
    ``local.tiers.<tier>`` and pulls in the shared tiers ``locals`` block;
    any ``var.*`` expression gets a matching ``variable`` declaration, typed
    from VAR_TYPES when a type is known there.
    """
    tier_prefix = "LOCAL_TIER:"
    uses_tiers = any(expr.startswith(tier_prefix) for _, expr in args)

    # Variables referenced by the module call, in first-seen order.
    declared = {}
    for _, expr in args:
        if expr.startswith(tier_prefix):
            continue
        base = extract_var_name(expr)
        if base and base not in declared:
            declared[base] = VAR_TYPES.get(base, None)

    out = []
    for base, type_expr in declared.items():
        if type_expr:
            out.append(f'variable "{base}" {{ type = {type_expr} }}')
        else:
            out.append(f'variable "{base}" {{}}')
    if declared:
        out.append("")

    if uses_tiers:
        out.append(TIERS_BLOCK)

    out.append(f'module "{mod_name}" {{')
    out.append(f' source = "../../modules/kubernetes/{source_dir}"')
    for arg_name, expr in args:
        if expr.startswith(tier_prefix):
            rendered = f"local.tiers.{expr.split(':')[1]}"
        else:
            rendered = expr
        # Argument names are left-padded to 30 columns so the "=" signs align.
        out.append(f" {arg_name:30s} = {rendered}")
    out.extend(["}", ""])
    return "\n".join(out)
def main():
    """Generate ``stacks/<module>/`` for every entry in SERVICES.

    Each stack directory receives a ``terragrunt.hcl``, a generated
    ``main.tf`` and a ``secrets`` symlink pointing at ``../../secrets``.
    Existing files are overwritten; an existing ``secrets`` entry is kept.
    """
    stacks_dir = os.path.join(REPO_ROOT, "stacks")
    for mod_name, source_dir, args in SERVICES:
        # The stack directory is named after the module, not the source dir,
        # because some modules are named differently from their source dirs.
        stack_dir = os.path.join(stacks_dir, mod_name)
        os.makedirs(stack_dir, exist_ok=True)

        # terragrunt.hcl
        tg_path = os.path.join(stack_dir, "terragrunt.hcl")
        with open(tg_path, "w") as f:
            f.write(TERRAGRUNT_HCL)

        # main.tf
        main_path = os.path.join(stack_dir, "main.tf")
        with open(main_path, "w") as f:
            f.write(gen_main_tf(mod_name, source_dir, args))

        # secrets symlink. lexists() is also True for a dangling symlink
        # (exists() follows the link and reports False), so a re-run does not
        # crash in os.symlink() with FileExistsError when the link target is
        # missing.
        secrets_link = os.path.join(stack_dir, "secrets")
        if not os.path.lexists(secrets_link):
            os.symlink("../../secrets", secrets_link)

        print(f" Created stacks/{mod_name}/")
    print(f"\nGenerated {len(SERVICES)} service stacks")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,143 @@
#!/usr/bin/env bash
# graceful-db-maintenance.sh — Scale down/up dependents of a service
# based on the dependency.kyverno.io/wait-for pod annotation.
#
# Usage:
# ./scripts/graceful-db-maintenance.sh shutdown mysql.dbaas
# # ... perform maintenance ...
# ./scripts/graceful-db-maintenance.sh startup mysql.dbaas
#
# The shutdown action saves original replica counts to a state file
# so startup can restore them exactly.
set -euo pipefail
ACTION="${1:-}"
SERVICE="${2:-}"
STATE_DIR="/tmp"
# Print command-line help to stdout and terminate the script with status 1.
usage() {
  cat <<EOF
Usage: $0 <shutdown|startup> <service>

Examples:
 $0 shutdown mysql.dbaas # Scale down all MySQL dependents
 $0 startup mysql.dbaas # Restore all MySQL dependents
 $0 shutdown postgresql.dbaas # Scale down all PostgreSQL dependents
 $0 shutdown redis.redis # Scale down all Redis dependents
EOF
  exit 1
}
[[ -z "$ACTION" || -z "$SERVICE" ]] && usage
[[ "$ACTION" != "shutdown" && "$ACTION" != "startup" ]] && usage
STATE_FILE="${STATE_DIR}/dep-maintenance-$(echo "$SERVICE" | tr '.' '-').json"
KUBECONFIG="${KUBECONFIG:-$(dirname "$0")/../config}"
export KUBECONFIG
# Find all pods with the dependency annotation containing our service
# Print the workloads that own pods whose dependency.kyverno.io/wait-for
# annotation mentions a service.
#
# Arguments: $1 - service string to look for (matched literally)
# Outputs:   one "Kind/name/namespace" line per owner on stdout, sorted and
#            de-duplicated; Kind is Deployment or StatefulSet. Pods owned by
#            anything else are ignored.
# Returns:   0 when the listing worked (even with no matches); non-zero if
#            the kubectl pod listing fails.
find_dependent_owners() {
  local service="$1"
  local ns annotation owner_kind owner_name deploy_name

  # grep exits 1 when nothing matches; with `set -o pipefail` that would fail
  # the whole pipeline and abort the caller before it can report "no
  # dependents", so a no-match is mapped to success. -F matches the service
  # literally (the "." in "mysql.dbaas" is not a regex wildcard).
  kubectl get pods --all-namespaces \
    -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.annotations.dependency\.kyverno\.io/wait-for}{"\t"}{.metadata.ownerReferences[0].kind}{"\t"}{.metadata.ownerReferences[0].name}{"\n"}{end}' \
    2>/dev/null \
    | { grep -F -- "$service" || true; } \
    | while IFS=$'\t' read -r ns annotation owner_kind owner_name; do
        if [[ -z "$owner_kind" || -z "$owner_name" ]]; then
          continue
        fi
        if [[ "$owner_kind" == "ReplicaSet" ]]; then
          # Resolve ReplicaSet -> Deployment
          deploy_name=$(kubectl get replicaset "$owner_name" -n "$ns" \
            -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)
          if [[ -n "$deploy_name" ]]; then
            echo "Deployment/${deploy_name}/${ns}"
          fi
        elif [[ "$owner_kind" == "StatefulSet" ]]; then
          echo "StatefulSet/${owner_name}/${ns}"
        fi
      done \
    | sort -u
}
# Scale every dependent of $SERVICE to zero replicas.
# The replica count each workload had beforehand is written to $STATE_FILE as
# a JSON array so that the startup action can restore it. Exits 0 without
# touching anything when no dependents are found.
do_shutdown() {
  echo "Finding dependents of $SERVICE..."
  local dependents
  dependents=$(find_dependent_owners "$SERVICE")
  if [[ -z "$dependents" ]]; then
    echo "No dependents found for $SERVICE"
    exit 0
  fi

  # One "Kind/name/namespace" entry per array element.
  local -a targets
  mapfile -t targets <<< "$dependents"
  local entry kind name ns replicas

  echo "Dependents found:"
  for entry in "${targets[@]}"; do
    IFS='/' read -r kind name ns <<< "$entry"
    echo " $ns/$kind/$name"
  done

  # Record current replica counts before changing anything.
  local state="[]"
  for entry in "${targets[@]}"; do
    IFS='/' read -r kind name ns <<< "$entry"
    replicas=$(kubectl get "$kind" "$name" -n "$ns" \
      -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
    state=$(echo "$state" | jq --arg kind "$kind" --arg name "$name" \
      --arg ns "$ns" --argjson replicas "${replicas:-1}" \
      '. + [{"kind": $kind, "name": $name, "namespace": $ns, "replicas": $replicas}]')
  done
  echo "$state" > "$STATE_FILE"
  echo "Saved replica state to $STATE_FILE"

  for entry in "${targets[@]}"; do
    IFS='/' read -r kind name ns <<< "$entry"
    echo "Scaling $ns/$kind/$name to 0..."
    kubectl scale "$kind" "$name" -n "$ns" --replicas=0
  done

  echo ""
  echo "Waiting for pods to terminate..."
  for entry in "${targets[@]}"; do
    IFS='/' read -r kind name ns <<< "$entry"
    # Best effort: a rollout timeout does not abort the shutdown.
    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=120s 2>/dev/null || true
  done

  echo ""
  echo "All dependents of $SERVICE scaled to 0."
  echo "Run '$0 startup $SERVICE' after maintenance to restore."
}
# Restore the replica counts recorded in $STATE_FILE by the shutdown action,
# wait (best effort) for the rollouts, then delete the state file.
# Exits 1 when no state file exists for $SERVICE.
do_startup() {
  if [[ ! -f "$STATE_FILE" ]]; then
    echo "Error: No state file found at $STATE_FILE"
    echo "Did you run '$0 shutdown $SERVICE' first?"
    exit 1
  fi

  echo "Restoring dependents of $SERVICE from $STATE_FILE..."
  local total
  total=$(jq length "$STATE_FILE")

  local idx=0 kind name ns replicas
  while (( idx < total )); do
    kind=$(jq -r ".[$idx].kind" "$STATE_FILE")
    name=$(jq -r ".[$idx].name" "$STATE_FILE")
    ns=$(jq -r ".[$idx].namespace" "$STATE_FILE")
    replicas=$(jq -r ".[$idx].replicas" "$STATE_FILE")
    echo "Scaling $ns/$kind/$name to $replicas..."
    kubectl scale "$kind" "$name" -n "$ns" --replicas="$replicas"
    idx=$((idx + 1))
  done

  echo ""
  echo "Waiting for rollouts..."
  idx=0
  while (( idx < total )); do
    kind=$(jq -r ".[$idx].kind" "$STATE_FILE")
    name=$(jq -r ".[$idx].name" "$STATE_FILE")
    ns=$(jq -r ".[$idx].namespace" "$STATE_FILE")
    # Best effort: a slow rollout does not fail the restore.
    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=300s 2>/dev/null || true
    idx=$((idx + 1))
  done

  rm -f "$STATE_FILE"
  echo ""
  echo "All dependents of $SERVICE restored."
}
case "$ACTION" in
shutdown) do_shutdown ;;
startup) do_startup ;;
esac

10
scripts/image_pull.sh Executable file
View file

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# For every node whose `kubectl get nodes -o wide` line contains "node":
# drain it, run image_pull_remote.sh on it over ssh (as user wizard), print
# the kubelet's resulting image-pull settings, and uncordon it again.

# Resolve the remote script next to this file so the run does not depend on
# the caller's working directory.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

mapfile -t nodes < <(kubectl get nodes -o wide | grep node | awk '{print $1}')

for n in "${nodes[@]}"; do
  echo "$n"
  # The remote step only runs if the drain succeeded; the node is uncordoned
  # below either way.
  kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data \
    && ssh "wizard@$n" < "$script_dir/image_pull_remote.sh"
  # Check result
  kubectl get --raw "/api/v1/nodes/$n/proxy/configz" \
    | jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
  kubectl uncordon "$n"
done

14
scripts/image_pull_remote.sh Executable file
View file

@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Node-side half of image_pull.sh (fed to `ssh` on stdin): raises image-pull
# parallelism in containerd and the kubelet, restarting each service.

kubelet_cfg=/var/lib/kubelet/config.yaml

# Containerd: rewrite any existing max_concurrent_downloads line to 5.
sudo sed -i 's/.*max_concurrent_downloads.*/max_concurrent_downloads = 5/g' /etc/containerd/config.toml
sudo systemctl restart containerd

# Kubelet: drop any existing settings, then append the desired ones.
#   serializeImagePulls: false  -> images may be pulled in parallel
#   maxParallelImagePulls: 5    -> cap on the number of parallel pulls
sudo sed -i -e '/serializeImagePulls:/d' -e '/maxParallelImagePulls:/d' "$kubelet_cfg" \
  && printf '%s\n' 'serializeImagePulls: false' 'maxParallelImagePulls: 5' | sudo tee -a "$kubelet_cfg"
sudo systemctl restart kubelet

View file

@ -0,0 +1,57 @@
# kube-apiserver audit policy -- k8s-master (10.0.20.100), single control-plane.
#
# Goal: a durable "who/when/what" trail for MUTATIONS (create/update/patch/
# delete) so resource deletions can be attributed even though direct
# kubectl-to-apiserver calls otherwise leave no trace (see the 2026-06-06
# novelapp incident: a dashboard delete was attributable, a direct-kubectl
# recreate was not). Deployed OUTSIDE Terraform (the k8s VMs are not TF-managed,
# see memory id=1575); this file is the source of truth, scp'd to
# /etc/kubernetes/audit-policy.yaml and wired into the apiserver static-pod
# manifest + the kubeadm-config ConfigMap (so "kubeadm upgrade" preserves it).
#
# Tuned for LOW WRITE VOLUME (the cluster's sdc HDD is write-sensitive, see
# memory id=559): reads are dropped entirely, high-churn resources and probe
# endpoints are dropped, and the verbose RequestReceived stage is omitted, so
# only one Metadata-level line is written per mutating request.
apiVersion: audit.k8s.io/v1
kind: Policy
# Only emit the post-execution stage -- halves volume vs logging both stages.
omitStages:
- RequestReceived
rules:
# 1. Never log read-only verbs -- the overwhelming majority of traffic and
# irrelevant to "who changed/deleted X".
- level: None
verbs: ["get", "list", "watch"]
# 2. Drop high-churn / low-value resources even on writes.
- level: None
resources:
- group: ""
resources: ["events", "endpoints", "nodes/status", "pods/status"]
- group: "coordination.k8s.io"
resources: ["leases"]
- group: "discovery.k8s.io"
resources: ["endpointslices"]
- group: "metrics.k8s.io"
- group: "authentication.k8s.io"
resources: ["tokenreviews"]
- group: "authorization.k8s.io"
resources: ["subjectaccessreviews", "selfsubjectaccessreviews"]
# 3. Drop noisy non-resource probe / discovery URLs.
- level: None
nonResourceURLs:
- "/healthz*"
- "/readyz*"
- "/livez*"
- "/version"
- "/metrics"
- "/openapi/*"
- "/swagger*"
# 4. Everything else (every create/update/patch/delete on real resources):
# record WHO (user + sourceIP + userAgent), WHAT (resource/namespace/name),
# WHEN, and the verb -- at Metadata level (no request/response bodies, so
# each entry stays small).
- level: Metadata

12
scripts/kill_ns.sh Executable file
View file

@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Force-finish a namespace by emptying spec.finalizers through the
# namespace's "finalize" subresource.
#
# Usage: kill_ns.sh <namespace>
#
# The update is sent with `kubectl replace --raw`, so no background
# `kubectl proxy`, no fixed local port and no temp file in /tmp are needed,
# and nothing is left running if a step fails.
set -euo pipefail

NAMESPACE="${1:-}"
if [[ -z "$NAMESPACE" ]]; then
  echo "Pass in parameter namespace"
  exit 1
fi

kubectl get namespace "$NAMESPACE" -o json \
  | jq '.spec = {"finalizers":[]}' \
  | kubectl replace --raw "/api/v1/namespaces/${NAMESPACE}/finalize" -f -

469
scripts/lvm-pvc-snapshot.sh Executable file
View file

@ -0,0 +1,469 @@
#!/usr/bin/env bash
# lvm-pvc-snapshot — LVM thin snapshot management for Proxmox CSI PVCs
# Deploy to PVE host at /usr/local/bin/lvm-pvc-snapshot
set -euo pipefail
# --- Configuration ---
VG="pve"
THINPOOL="data"
SNAP_SUFFIX_FORMAT="%Y%m%d_%H%M"
RETENTION_DAYS=7
MIN_FREE_PCT=10
PUSHGATEWAY="${LVM_SNAP_PUSHGATEWAY:-http://10.0.20.100:30091}"
PUSHGATEWAY_JOB="lvm-pvc-snapshot"
LOCKFILE="/run/lvm-pvc-snapshot.lock"
KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
export KUBECONFIG
# Namespaces to exclude from snapshots (high-churn, have app-level dumps)
# These PVCs cause significant CoW write amplification (~36% overhead)
EXCLUDE_NAMESPACES="${LVM_SNAP_EXCLUDE_NS:-dbaas,monitoring}"
# --- Logging ---
# log: print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# warn: timestamped "WARN:" message on stderr.
warn() {
  log "WARN: $*" >&2
}

# die: timestamped "FATAL:" message on stderr, then exit with status 1.
die() {
  log "FATAL: $*" >&2
  exit 1
}
# --- Helpers ---
# Print the free percentage of the thin pool ${VG}/${THINPOOL}, computed as
# 100 minus the data_percent reported by lvs, with two decimals.
#
# Outputs: the percentage on stdout (e.g. "57.50"; values below 1 keep their
#          leading zero, e.g. "0.50").
# Returns: 0 on success; 1 (after a warning, with nothing on stdout) when lvs
#          fails or does not report a number, instead of handing an empty
#          operand to the arithmetic.
get_thinpool_free_pct() {
  local data_pct
  data_pct=$(lvs --noheadings --nosuffix -o data_percent "${VG}/${THINPOOL}" 2>/dev/null | tr -d ' ') || true
  if [[ ! "${data_pct}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    warn "Could not read data_percent for ${VG}/${THINPOOL} (got: '${data_pct}')"
    return 1
  fi
  LC_ALL=C awk -v used="${data_pct}" 'BEGIN { printf "%.2f\n", 100 - used }'
}
# Print, one per line, the last "/"-separated component of the CSI
# volumeHandle of every Proxmox-CSI PV claimed from a namespace listed in
# EXCLUDE_NAMESPACES (comma-separated).
# Prints nothing when the list is empty or kubectl is not installed; kubectl
# and jq failures are swallowed, so the result is best effort.
build_exclude_lv_list() {
  if [[ -z "${EXCLUDE_NAMESPACES}" ]]; then
    return 0
  fi
  if ! command -v kubectl &>/dev/null; then
    return 0
  fi

  local pv_filter='
($ns | split(",")) as $excl |
.items[] |
select(.spec.csi.driver == "csi.proxmox.sinextra.dev") |
select(.spec.claimRef.namespace as $n | $excl | index($n)) |
.spec.csi.volumeHandle | split("/") | last
'
  kubectl get pv -o json 2>/dev/null \
    | jq -r --arg ns "${EXCLUDE_NAMESPACES}" "${pv_filter}" 2>/dev/null \
    || true
}
# Print the names of the thin LVs in ${VG}/${THINPOOL} that back PVCs
# (vm-<id>-pvc-*), one per line. Snapshots (_snap_), pre-restore backups
# (_pre_restore_) and LVs reported by build_exclude_lv_list are left out.
# Prints nothing when no LV qualifies.
discover_pvc_lvs() {
  local all_lvs exclude_lvs
  # An empty result (grep exit 1) is not an error here; callers check for an
  # empty list themselves.
  all_lvs=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
    | grep -E '^vm-[0-9]+-pvc-' \
    | grep -v '_snap_' \
    | grep -v '_pre_restore_') || true

  exclude_lvs=$(build_exclude_lv_list)

  if [[ -n "${exclude_lvs}" ]]; then
    # -x -F: drop only LVs whose whole name equals an excluded name. A
    # substring/regex match would also drop any LV whose name merely starts
    # with (or contains) an excluded name.
    grep -vxF -f <(printf '%s\n' "${exclude_lvs}") <<< "${all_lvs}" || true
  else
    echo "${all_lvs}"
  fi
}
# Print the names of all LVs in ${VG}/${THINPOOL} whose name contains
# "_snap_", one per line. Never fails: an lvs error yields empty output.
list_snapshots() {
  {
    lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
      | awk -v pool="${THINPOOL}" '$2 == pool && $1 ~ /_snap_/ { print $1 }'
  } || true
}
# Convert the trailing YYYYMMDD_HHMM stamp of an LV name to epoch seconds.
# Arguments: $1 - LV name
# Outputs:   epoch seconds on stdout, or "0" when the name has no such
#            suffix or `date -d` cannot parse it.
parse_snap_timestamp() {
  local snap_name="$1"
  if [[ ! "${snap_name}" =~ ([0-9]{4})([0-9]{2})([0-9]{2})_([0-9]{2})([0-9]{2})$ ]]; then
    echo "0"
    return
  fi
  # Capture groups: 1=year 2=month 3=day 4=hour 5=minute.
  local -a part=("${BASH_REMATCH[@]}")
  date -d "${part[1]}-${part[2]}-${part[3]} ${part[4]}:${part[5]}" +%s 2>/dev/null || echo "0"
}
# Strip a trailing "_snap_YYYYMMDD_HHMM" from an LV name, e.g.
#   vm-200-pvc-abc_snap_20260403_1200 -> vm-200-pvc-abc
# Names without that suffix are printed unchanged.
get_original_lv_from_snap() {
  local lv_name="$1"
  if [[ "${lv_name}" =~ ^(.*)_snap_[0-9]{8}_[0-9]{4}$ ]]; then
    lv_name="${BASH_REMATCH[1]}"
  fi
  echo "${lv_name}"
}
# Push the result of a snapshot run to the Prometheus Pushgateway.
# Arguments:
#   $1 - status (0=success, 1=partial failure, 2=aborted)
#   $2 - number of snapshots created
#   $3 - number of snapshot failures
#   $4 - number of snapshots pruned
# The current thin-pool free percentage and the current time are added to the
# payload. A failed push only logs a warning; it does not fail the caller.
push_metrics() {
local status="$1" created="$2" failed="$3" pruned="$4"
local free_pct
free_pct=$(get_thinpool_free_pct)
# The heredoc below is the request body, in Prometheus text exposition
# format; it is streamed to curl on stdin (--data-binary @-).
cat <<METRICS | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
"${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || warn "Failed to push metrics to Pushgateway"
# HELP lvm_snapshot_last_run_timestamp Unix timestamp of last snapshot run
# TYPE lvm_snapshot_last_run_timestamp gauge
lvm_snapshot_last_run_timestamp $(date +%s)
# HELP lvm_snapshot_last_status Exit status (0=success, 1=partial failure, 2=aborted)
# TYPE lvm_snapshot_last_status gauge
lvm_snapshot_last_status ${status}
# HELP lvm_snapshot_created_total Number of snapshots created in last run
# TYPE lvm_snapshot_created_total gauge
lvm_snapshot_created_total ${created}
# HELP lvm_snapshot_failed_total Number of snapshot failures in last run
# TYPE lvm_snapshot_failed_total gauge
lvm_snapshot_failed_total ${failed}
# HELP lvm_snapshot_pruned_total Number of snapshots pruned in last run
# TYPE lvm_snapshot_pruned_total gauge
lvm_snapshot_pruned_total ${pruned}
# HELP lvm_snapshot_thinpool_free_pct Thin pool free percentage
# TYPE lvm_snapshot_thinpool_free_pct gauge
lvm_snapshot_thinpool_free_pct ${free_pct}
METRICS
}
# --- Subcommands ---
# `snapshot` subcommand: create one thin snapshot per discovered PVC LV,
# prune expired snapshots, and push run metrics.
# Aborts (metrics status 2, exit 1) when the thin pool has less than
# MIN_FREE_PCT percent free or when no PVC LV is found.
cmd_snapshot() {
  log "Starting PVC LVM thin snapshot run"

  # Guard: refuse to add snapshots to a nearly full thin pool.
  local pool_free
  pool_free=$(get_thinpool_free_pct)
  log "Thin pool free space: ${pool_free}%"
  if (( $(echo "${pool_free} < ${MIN_FREE_PCT}" | bc -l) )); then
    warn "Thin pool has only ${pool_free}% free (minimum: ${MIN_FREE_PCT}%). Aborting."
    push_metrics 2 0 0 0
    exit 1
  fi

  # Guard: nothing to snapshot.
  local targets
  targets=$(discover_pvc_lvs)
  if [[ -z "${targets}" ]]; then
    warn "No PVC LVs found matching pattern"
    push_metrics 2 0 0 0
    exit 1
  fi

  local created=0 errors=0 total stamp
  total=$(echo "${targets}" | wc -l | tr -d ' ')
  # One shared timestamp so every snapshot of this run carries the same suffix.
  stamp=$(date +"${SNAP_SUFFIX_FORMAT}")
  log "Found ${total} PVC LVs to snapshot"

  local lv snap
  while IFS= read -r lv; do
    snap="${lv}_snap_${stamp}"
    if ! lvcreate -s -kn -n "${snap}" "${VG}/${lv}" >/dev/null 2>&1; then
      warn " Failed to create snapshot for ${lv}"
      errors=$((errors + 1))
      continue
    fi
    log " Created: ${snap}"
    created=$((created + 1))
  done <<< "${targets}"
  log "Snapshot run complete: ${created} created, ${errors} failed out of ${total}"

  log "Running auto-prune..."
  local pruned
  pruned=$(cmd_prune_count)

  # 0 = all good, 1 = some snapshots failed, 2 = every snapshot failed.
  local status=0
  if (( errors > 0 )); then
    if (( created > 0 )); then
      status=1
    else
      status=2
    fi
  fi

  push_metrics "${status}" "${created}" "${errors}" "${pruned}"
  log "Done"
}
# `list` subcommand: print a table of snapshot and pre-restore LVs in ${VG}
# with the LV they derive from, their age and their data%.
# Age is shown as "<days>d<hours>h", or "unknown" when the LV name carries no
# parseable YYYYMMDD_HHMM suffix.
cmd_list() {
  printf "%-45s %-50s %8s %8s\n" "ORIGINAL LV" "SNAPSHOT" "AGE" "DATA%"
  printf "%-45s %-50s %8s %8s\n" "-----------" "--------" "---" "-----"

  local now
  now=$(date +%s)

  local snap_lines
  snap_lines=$(lvs --noheadings --nosuffix -o lv_name,lv_size,data_percent "${VG}" 2>/dev/null \
    | grep -E '_snap_|_pre_restore_' || true)
  if [[ -z "${snap_lines}" ]]; then
    echo "(no snapshots found)"
    return
  fi

  # The timestamp is obtained only through parse_snap_timestamp. An extra
  # `grep -oE` assignment here would exit non-zero for names without a
  # timestamp suffix and, under `set -e`, end the listing at the first such
  # LV instead of reporting its age as "unknown".
  local name _size data_pct original age_str epoch age_s days hours
  while read -r name _size data_pct; do
    if [[ "${name}" == *"_pre_restore_"* ]]; then
      original=$(echo "${name}" | sed 's/_pre_restore_[0-9]\{8\}_[0-9]\{4\}$//')
    else
      original=$(get_original_lv_from_snap "${name}")
    fi

    epoch=$(parse_snap_timestamp "${name}")
    if (( epoch > 0 )); then
      age_s=$(( now - epoch ))
      days=$(( age_s / 86400 ))
      hours=$(( (age_s % 86400) / 3600 ))
      age_str="${days}d${hours}h"
    else
      age_str="unknown"
    fi

    printf "%-45s %-50s %8s %7s%%\n" "${original}" "${name}" "${age_str}" "${data_pct}"
  done <<< "${snap_lines}"
}
# `prune` subcommand: delete expired snapshots via cmd_prune_count and log
# how many were removed.
cmd_prune() {
  local removed_count
  removed_count=$(cmd_prune_count)
  log "Pruned ${removed_count} expired snapshots"
}
# Remove snapshot and pre-restore LVs in ${VG}/${THINPOOL} whose name
# timestamp is older than RETENTION_DAYS days.
# Outputs: ONLY the number of removed LVs on stdout — callers capture it with
#          $(cmd_prune_count), so every log/warn line goes to stderr.
# LVs without a parseable timestamp are never removed.
cmd_prune_count() {
  local removed=0 current threshold
  current=$(date +%s)
  threshold=$(( current - RETENTION_DAYS * 86400 ))

  local candidates
  candidates=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
    | grep -E '_snap_|_pre_restore_' || true)

  if [[ -n "${candidates}" ]]; then
    local candidate created_at
    while IFS= read -r candidate; do
      created_at=$(parse_snap_timestamp "${candidate}")
      # Keep anything undated (0) or not yet past the retention threshold.
      if (( created_at <= 0 || created_at >= threshold )); then
        continue
      fi
      if lvremove -f "${VG}/${candidate}" >/dev/null 2>&1; then
        log " Pruned: ${candidate}" >&2
        removed=$((removed + 1))
      else
        warn " Failed to prune: ${candidate}"
      fi
    done <<< "${candidates}"
  fi

  echo "${removed}"
}
# `restore` subcommand: swap a PVC's LV with one of its snapshots.
# Arguments:
#   $1 - name of the PVC LV in ${VG} to be replaced
#   $2 - name of the snapshot LV in ${VG} to restore from
# Flow: validate both LVs -> find the PV/PVC bound to the LV -> find the
# StatefulSet or Deployment using that PVC -> show a summary and ask for
# confirmation -> scale the workload to 0 -> rename the current LV to
# <lv>_pre_restore_<timestamp> and the snapshot to <lv> -> scale back up.
# Interactive: reads confirmations from stdin. Exits via die() on any
# validation or rename failure.
cmd_restore() {
local pvc_lv="${1:-}" snapshot_lv="${2:-}"
if [[ -z "${pvc_lv}" || -z "${snapshot_lv}" ]]; then
die "Usage: $0 restore <pvc-lv-name> <snapshot-lv-name>"
fi
# Validate LVs exist
if ! lvs "${VG}/${pvc_lv}" >/dev/null 2>&1; then
die "PVC LV '${pvc_lv}' not found in VG '${VG}'"
fi
if ! lvs "${VG}/${snapshot_lv}" >/dev/null 2>&1; then
die "Snapshot LV '${snapshot_lv}' not found in VG '${VG}'"
fi
# Discover K8s context
log "Discovering Kubernetes context for LV '${pvc_lv}'..."
# NOTE(review): assumes the CSI volumeHandle is exactly "local-lvm:<lv>";
# build_exclude_lv_list splits volumeHandle on "/" instead — confirm which
# handle format the cluster actually uses.
local volume_handle="local-lvm:${pvc_lv}"
local pv_info
pv_info=$(kubectl get pv -o json 2>/dev/null | jq -r \
--arg vh "${volume_handle}" \
'.items[] | select(.spec.csi.volumeHandle == $vh) | "\(.metadata.name) \(.spec.claimRef.namespace) \(.spec.claimRef.name)"' \
) || die "Failed to query PVs (is kubectl configured?)"
if [[ -z "${pv_info}" ]]; then
die "No PV found with volumeHandle '${volume_handle}'"
fi
local pv_name pvc_ns pvc_name
read -r pv_name pvc_ns pvc_name <<< "${pv_info}"
log "Found: PV=${pv_name}, PVC=${pvc_ns}/${pvc_name}"
# Find the workload (Deployment or StatefulSet) that uses this PVC
local workload_type="" workload_name="" original_replicas=""
# Check StatefulSets first (databases use these)
# NOTE(review): in the second half of this select(), `.spec.replicas` and
# `.metadata.name` are evaluated after `.[]` has descended into a
# volumeClaimTemplate, so they probably do not refer to the StatefulSet.
# Any jq error is hidden by `2>/dev/null` and `|| true`; the fallback query
# below handles the volumeClaimTemplate naming case. Confirm and simplify.
local sts_info
sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
--arg pvc "${pvc_name}" \
'.items[] | select(
(.spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc) or
(.spec.volumeClaimTemplates // [] | .[].metadata.name as $vct |
.spec.replicas as $r | range($r) | "\($vct)-\(.metadata.name)-\(.)" ) == $pvc
) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
) || true
# If not found via simple volume check, try matching VCT naming pattern
# (<volumeClaimTemplate>-<statefulset>-<ordinal>, for ordinals below replicas).
if [[ -z "${sts_info}" ]]; then
sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
--arg pvc "${pvc_name}" \
'.items[] | .metadata.name as $sts | .spec.replicas as $r |
select(.spec.volumeClaimTemplates != null) |
.spec.volumeClaimTemplates[].metadata.name as $vct |
[range($r)] | map("\($vct)-\($sts)-\(.)") |
if any(. == $pvc) then "\($sts) \($r)" else empty end' 2>/dev/null \
) || true
fi
if [[ -n "${sts_info}" ]]; then
# Only the first output line is consumed by read.
read -r workload_name original_replicas <<< "${sts_info}"
workload_type="statefulset"
else
# Check Deployments
local deploy_info
deploy_info=$(kubectl get deployment -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
--arg pvc "${pvc_name}" \
'.items[] | select(
.spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc
) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
) || true
if [[ -n "${deploy_info}" ]]; then
read -r workload_name original_replicas <<< "${deploy_info}"
workload_type="deployment"
fi
fi
# No owning workload found: let the operator decide whether to swap the LV
# anyway ("manual" skips every scale step below).
if [[ -z "${workload_type}" ]]; then
warn "Could not auto-discover workload for PVC '${pvc_name}' in namespace '${pvc_ns}'."
warn "You may need to scale down the pod manually."
echo ""
read -rp "Continue with LV swap anyway? (yes/no): " confirm
[[ "${confirm}" == "yes" ]] || die "Aborted by user"
workload_type="manual"
fi
# Dry-run output
local backup_name="${pvc_lv}_pre_restore_$(date +"${SNAP_SUFFIX_FORMAT}")"
echo ""
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ RESTORE DRY-RUN ║"
echo "╠══════════════════════════════════════════════════════════════╣"
echo "║ PVC: ${pvc_ns}/${pvc_name}"
echo "║ PV: ${pv_name}"
if [[ "${workload_type}" != "manual" ]]; then
echo "║ Workload: ${workload_type}/${workload_name} (replicas: ${original_replicas}→0→${original_replicas})"
fi
echo "║"
echo "║ Actions:"
if [[ "${workload_type}" != "manual" ]]; then
echo "║ 1. Scale ${workload_type}/${workload_name} to 0 replicas"
echo "║ 2. Wait for pod termination"
fi
echo "║ 3. Rename ${pvc_lv}${backup_name}"
echo "║ 4. Rename ${snapshot_lv}${pvc_lv}"
if [[ "${workload_type}" != "manual" ]]; then
echo "║ 5. Scale ${workload_type}/${workload_name} back to ${original_replicas} replicas"
fi
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""
# Interactive confirmation
read -rp "Type 'yes' to proceed with restore: " confirm
if [[ "${confirm}" != "yes" ]]; then
die "Aborted by user"
fi
# Scale down
if [[ "${workload_type}" != "manual" ]]; then
log "Scaling ${workload_type}/${workload_name} to 0 replicas..."
kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas=0
log "Waiting for pod termination (timeout: 120s)..."
# Pods are located by label; two common label conventions are tried in turn.
# NOTE(review): presumes the pods carry one of these labels with the
# workload name as value — verify for workloads labelled differently.
kubectl wait --for=delete pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
kubectl wait --for=delete pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
warn "Timeout waiting for pods — continuing anyway (LV may still be in use)"
sleep 5 # extra grace period for device detach
fi
# Verify LV is not active
local lv_active
lv_active=$(lvs --noheadings -o lv_active "${VG}/${pvc_lv}" 2>/dev/null | tr -d ' ')
if [[ "${lv_active}" == "active" ]]; then
warn "LV ${pvc_lv} is still active. Attempting to deactivate..."
# Close any LUKS mapper on the LV before deactivation
if dmsetup ls 2>/dev/null | grep -q "${pvc_lv}"; then
log "Closing LUKS mapper for ${pvc_lv}..."
cryptsetup luksClose "${pvc_lv}" 2>/dev/null || true
fi
lvchange -an "${VG}/${pvc_lv}" 2>/dev/null || warn "Could not deactivate — proceeding with caution"
fi
# LV swap: keep the current data under the backup name, then move the
# snapshot into the original LV name.
log "Renaming ${pvc_lv}${backup_name}"
lvrename "${VG}" "${pvc_lv}" "${backup_name}" || die "Failed to rename original LV"
log "Renaming ${snapshot_lv}${pvc_lv}"
lvrename "${VG}" "${snapshot_lv}" "${pvc_lv}" || die "Failed to rename snapshot LV"
# Scale back up
if [[ "${workload_type}" != "manual" ]]; then
log "Scaling ${workload_type}/${workload_name} back to ${original_replicas} replicas..."
kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas="${original_replicas}"
log "Waiting for pod to become Ready (timeout: 300s)..."
kubectl wait --for=condition=Ready pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
kubectl wait --for=condition=Ready pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
warn "Timeout waiting for pod Ready — check manually"
fi
echo ""
log "Restore complete!"
log "Old data preserved as: ${backup_name}"
log "To delete old data after verification: lvremove -f ${VG}/${backup_name}"
}
# --- Main ---
# Print the command overview and the environment variables that affect the
# tool to stdout.
usage() {
  local prog
  prog=$(basename "$0")
  printf '%s\n' \
    "Usage: ${prog} <command> [args]" \
    "Commands:" \
    "snapshot Create thin snapshots of all PVC LVs" \
    "list List existing snapshots with age and data%" \
    "prune Remove snapshots older than ${RETENTION_DAYS} days" \
    "restore <lv> <snap> Restore a PVC from a snapshot (interactive)" \
    "Environment:" \
    "LVM_SNAP_PUSHGATEWAY Pushgateway URL (default: ${PUSHGATEWAY})" \
    "KUBECONFIG Kubeconfig path (default: /root/.kube/config)"
}
# Entry point: take the exclusive lock for mutating commands, then dispatch
# the subcommand given as $1 (remaining arguments go to `restore`).
main() {
  local cmd="${1:-}"
  shift || true

  # `list` is read-only and the help variants do nothing, so only the other
  # commands (including unknown ones) contend for the lock.
  case "${cmd}" in
    list | "" | help | --help | -h) ;;
    *)
      exec 200>"${LOCKFILE}"
      if ! flock -n 200; then
        die "Another instance is already running (lockfile: ${LOCKFILE})"
      fi
      ;;
  esac

  case "${cmd}" in
    help | --help | -h | "")
      usage
      ;;
    list)
      cmd_list
      ;;
    snapshot)
      cmd_snapshot
      ;;
    prune)
      cmd_prune
      ;;
    restore)
      cmd_restore "$@"
      ;;
    *)
      die "Unknown command: ${cmd}. Run '$0 help' for usage."
      ;;
  esac
}
main "$@"

View file

@ -0,0 +1,10 @@
[Unit]
Description=Daily LVM thin snapshots of Proxmox CSI PVCs
[Timer]
OnCalendar=*-*-* 03:00:00
Persistent=true
RandomizedDelaySec=300
[Install]
WantedBy=timers.target

117
scripts/migrate-state-to-pg Executable file
View file

@ -0,0 +1,117 @@
#!/usr/bin/env bash
# scripts/migrate-state-to-pg — One-shot migration from local SOPS state to PG backend.
# Prerequisites: vault login -method=oidc, PG terraform_state DB exists, Vault static role created.
# Usage: scripts/migrate-state-to-pg [--dry-run]
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SYNC="$REPO_ROOT/scripts/state-sync"
STACKS_DIR="$REPO_ROOT/stacks"
STATE_DIR="$REPO_ROOT/state/stacks"
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"
# Return 0 when $1 is exactly one of the space-separated names in
# TIER0_STACKS, 1 otherwise.
# The comparison is a literal string match (no regex), and no pipeline is
# involved, so the result cannot be disturbed by `set -o pipefail`.
is_tier0() {
  local -a tier0_names
  local candidate
  read -ra tier0_names <<< "$TIER0_STACKS"
  for candidate in "${tier0_names[@]}"; do
    if [[ "$candidate" == "$1" ]]; then
      return 0
    fi
  done
  return 1
}
DRY_RUN=false
[ "${1:-}" = "--dry-run" ] && DRY_RUN=true
# Fetch PG credentials from Vault
echo "==> Fetching PG credentials from Vault..."
PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state) || {
echo "ERROR: Cannot read PG credentials. Run: vault login -method=oidc" >&2
exit 1
}
PG_USER=$(echo "$PG_CREDS" | jq -r .data.username)
PG_PASS=$(echo "$PG_CREDS" | jq -r .data.password)
export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
echo " PG_CONN_STR set (user: $PG_USER)"
# Enable provider cache
export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
mkdir -p "$TF_PLUGIN_CACHE_DIR"
migrated=0
failed=0
skipped=0
failed_stacks=""
# Increment helpers (avoid arithmetic exit code 1 when value is 0)
# Counter helpers. Each adds 1 to its global counter inside an arithmetic
# *expansion* passed to `:`, which always returns 0 — a bare (( counter++ ))
# returns 1 when the old value is 0 and would trip `set -e`.
inc_migrated() {
  : "$((migrated += 1))"
}

inc_failed() {
  : "$((failed += 1))"
}

inc_skipped() {
  : "$((skipped += 1))"
}
# Iterate over all stack directories that have state
# Iterate over all stack directories that have state and migrate each one.
# A stack is skipped when it is Tier 0, has no state file, has no matching
# directory under $STACKS_DIR, or when running with --dry-run.
for state_dir in "$STATE_DIR"/*/; do
  stack="$(basename "$state_dir")"
  enc_state="$state_dir/terraform.tfstate.enc"
  plain_state="$state_dir/terraform.tfstate"
  log_file="/tmp/tg-migrate-$stack.log"

  # Skip Tier 0
  if is_tier0 "$stack"; then
    echo "--- SKIP (Tier 0): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no state file
  if [ ! -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    echo "--- SKIP (no state): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no corresponding stack directory
  if [ ! -d "$STACKS_DIR/$stack" ]; then
    echo "--- SKIP (no stack dir): $stack"
    inc_skipped
    continue
  fi

  echo "==> Migrating: $stack"

  if $DRY_RUN; then
    echo " [dry-run] Would migrate $stack"
    inc_skipped
    continue
  fi

  # Decrypt state if needed (call sops directly — state-sync skips Tier 1).
  # Decrypt into a side file and move it into place only on success: writing
  # straight to terraform.tfstate would leave an empty/truncated file behind
  # when sops fails, and the next run would then treat that file as the
  # already-decrypted state and migrate it.
  if [ -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    if sops -d --input-type json --output-type json "$enc_state" > "$plain_state.partial"; then
      mv -- "$plain_state.partial" "$plain_state"
    else
      rm -f -- "$plain_state.partial"
      echo " WARNING: decrypt failed, skipping"
      inc_skipped
      continue
    fi
  fi

  # Migrate state
  cd "$STACKS_DIR/$stack"
  if terragrunt init -upgrade -migrate-state -force-copy -input=false 2>&1 | tee "$log_file"; then
    echo " init OK"
    # Verify — plan should show no changes. A plan that reports changes is
    # still counted as migrated; only an init failure counts as failed.
    if terragrunt plan -detailed-exitcode -input=false 2>&1 | tail -5 | grep -q "No changes"; then
      echo " plan OK — no drift"
      inc_migrated
    else
      echo " WARNING: plan shows changes (may be normal drift, not migration issue)"
      inc_migrated
    fi
  else
    echo " FAILED: init error (see $log_file)"
    inc_failed
    failed_stacks="$failed_stacks $stack"
  fi
done
echo ""
echo "========================================"
echo "Migration complete"
echo " Migrated: $migrated"
echo " Failed: $failed"
echo " Skipped: $skipped"
if [ -n "$failed_stacks" ]; then
echo " Failed stacks:$failed_stacks"
fi
echo "========================================"

View file

@ -0,0 +1,112 @@
#!/bin/bash
# Phase 3: Migrate all service module state from root to individual stacks
# Each module in root state is at: module.kubernetes_cluster.module.<name>["<name>"]
# Target: state/stacks/<name>/terraform.tfstate as module.<name>
set -euo pipefail
ROOT_STATE="$(pwd)/terraform.tfstate"
STATE_DIR="$(pwd)/state/stacks"
# Service modules currently in root state (the exact count is printed at runtime as $TOTAL)
MODULES=(
actualbudget
affine
blog
changedetection
city-guesser
coturn
cyberchef
dashy
dawarich
descheduler
diun
ebook2audiobook
echo
excalidraw
f1-stream
forgejo
freedify
freshrss
frigate
hackmd
health
homepage
immich
isponsorblocktv
jsoncrack
kms
linkwarden
matrix
meshcentral
n8n
navidrome
netbox
networking-toolbox
nextcloud
ntfy
ollama
onlyoffice
openclaw
osm_routing
owntracks
paperless-ngx
plotting-book
privatebin
real-estate-crawler
reloader
resume
rybbit
send
servarr
shadowsocks
speedtest
stirling-pdf
tandoor
tor-proxy
travel_blog
tuya-bridge
url
wealthfolio
webhook_handler
whisper
ytdlp
)
# Move every module listed in MODULES out of the root state file into its own
# per-stack state file, counting successes and failures, then print a summary.
TOTAL=${#MODULES[@]}
SUCCESS=0
FAIL=0

printf '%s\n' \
  "=== Phase 3: Service State Migration ===" \
  "Migrating $TOTAL modules from root state to individual stacks" \
  ""

for mod in "${MODULES[@]}"; do
  # 1-based position of this module in the run.
  idx=$((SUCCESS + FAIL + 1))
  echo "[$idx/$TOTAL] Migrating: $mod"

  # Destination state lives in its own directory per module.
  DST_STATE="$STATE_DIR/$mod/terraform.tfstate"
  mkdir -p "$STATE_DIR/$mod"

  # In the root state the module address carries a for_each key equal to the
  # module name; in the target state it is a plain module.
  SRC="module.kubernetes_cluster.module.${mod}[\"${mod}\"]"
  DST="module.${mod}"

  if ! terraform state mv \
    -state="$ROOT_STATE" \
    -state-out="$DST_STATE" \
    "$SRC" "$DST" 2>&1; then
    echo "$mod FAILED"
    FAIL=$((FAIL + 1))
  else
    echo "$mod migrated successfully"
    SUCCESS=$((SUCCESS + 1))
  fi
  echo ""
done

printf '%s\n' \
  "=== Migration Summary ===" \
  "Total: $TOTAL" \
  "Success: $SUCCESS" \
  "Failed: $FAIL"

View file

@ -0,0 +1,19 @@
[Unit]
Description=Track NFS filesystem changes for incremental offsite backup
After=local-fs.target
[Service]
Type=simple
ExecStart=/usr/bin/inotifywait -m -r \
--format '%%w%%f' \
-e create -e modify -e moved_to -e delete \
--exclude '(/\..*swp$|/\.nfs|/\.Trash|\.db-shm$|\.db-wal$|\.db-journal$|/stats/.*\.stat$|^/srv/nfs/anca-elements/)' \
/srv/nfs \
/srv/nfs-ssd
StandardOutput=append:/mnt/backup/.nfs-changes.log
StandardError=journal
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,15 @@
[Unit]
Description=Mirror /srv/nfs (selective) to /mnt/backup (local 2nd copy of critical NFS)
After=network-online.target local-fs.target
Wants=network-online.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/nfs-mirror
StandardOutput=journal
StandardError=journal
SyslogIdentifier=nfs-mirror
# Heavy sustained IO — don't compete with foreground services.
Nice=10
IOSchedulingClass=idle
TimeoutStartSec=18000

179
scripts/nfs-mirror.sh Normal file
View file

@ -0,0 +1,179 @@
#!/usr/bin/env bash
# nfs-mirror — local 2nd copy of /srv/nfs (selective) → /mnt/backup
#
# Deploy to PVE host at /usr/local/bin/nfs-mirror.
# Schedule: daily 02:00 via nfs-mirror.timer (was weekly Mon 04:00 until 2026-05-26).
#
# ROLE in the 3-2-1 strategy:
# Copy 1 (sdc): /srv/nfs/* (live PVE NFS)
# Copy 2 (sda, this): /mnt/backup/<svc>/ ← this script
# Copy 3 (Synology): /Backup/Viki/nfs/ (via offsite-sync-backup + inotify)
#
# Replaces the dedicated anca-elements-mirror script; same disk, same
# destination layout (anca-elements lives at /mnt/backup/anca-elements/),
# but now covers every other critical NFS subtree in one pass.
#
# SKIP-LIST rationale (2026-05-26 simplification; REGENERABLE-SERVICE
# CARVE-OUT added 2026-06-01 — see below):
# immich — 1.5T, doesn't fit on sda; offsite-sync ships it direct to Synology
# frigate — camera ring buffer; intentionally NOT backed up anywhere
# temp — scratch; intentionally NOT backed up
#
# 2026-06-01 carve-out: the offsite Synology (5.3T) hit 97% and the
# `Backup` share had grown +670G in a week — traced to the 2026-05-26
# change that started mirroring large *regenerable* services to sda and
# thence to Synology pve-backup/. These are now re-excluded because they
# cost offsite capacity for data we can rebuild on demand:
# ollama (20G) — LLM model blobs, re-pullable
# prometheus-backup (64G) — metrics TSDB snapshots; was offsite-excluded
# pre-2026-05-26 by original intent
# audiblez (24G) — generated audiobooks, re-derivable from ebooks
# ebook2audiobook (11G) — same, generation output
# Their live copy stays on sdc (/srv/nfs); only the sda + Synology copies
# are dropped. `*-backup` DB dumps (sqlite-backup et al.) are intentionally
# KEPT — they are real database safety copies, not regenerable.
#
# Note: /srv/nfs-ssd is intentionally NOT mirrored — its dirs (immich,
# ollama, llamacpp) go direct to Synology nfs-ssd/ via offsite-sync
# Step 2, which (also 2026-06-01) was narrowed to immich-only so ollama
# + llamacpp on the SSD stop reaching Synology too.
set -euo pipefail
SRC=/srv/nfs/
DST=/mnt/backup/
LOG=/var/log/nfs-mirror.log
LOCKFILE=/run/nfs-mirror.lock
# Manifest of files changed under /mnt/backup since the last offsite-sync.
# offsite-sync-backup Step 1 reads this and rsyncs the listed files to Synology
# pve-backup/ on its next daily run. Without populating it, nfs-mirror's writes
# would only reach Synology via the monthly full sync (1st-7th of month), and
# the monthly --delete pass would also wipe any pre-positioned data.
MANIFEST=/mnt/backup/.changed-files
PUSHGATEWAY="${NFS_MIRROR_PUSHGATEWAY:-http://10.0.20.100:30091}"
PUSHGATEWAY_JOB=nfs-mirror
# rsync filter list for the /srv/nfs/ → /mnt/backup/ mirror.
# Patterns with a leading "/" are anchored at the transfer root, so they name
# exactly one top-level subtree/file. Patterns WITHOUT a leading "/" match at
# any depth — which is what the cruft section at the bottom needs.
EXCLUDES=(
  # ---- /mnt/backup subtrees owned by daily-backup — leave alone ----
  --exclude='/pvc-data/'
  --exclude='/sqlite-backup/'
  --exclude='/pfsense/'
  --exclude='/pve-config/'
  --exclude='/lost+found/'
  # ---- state files used by other backup jobs ----
  --exclude='/.changed-files'
  --exclude='/.last-offsite-sync'
  --exclude='/.lv-pvc-mapping.json'
  --exclude='/.nfs-changes.log'
  # ---- anca-elements: now in Immich (canonical), /mnt/backup copy deleted
  # 2026-05-26. Kept in excludes so nfs-mirror doesn't re-populate from sdc
  # if /srv/nfs/anca-elements is ever re-attached.
  --exclude='/anca-elements/'
  # ---- NFS paths intentionally NOT backed up ----
  --exclude='/immich/' # 1.5T — ships sdc → Synology direct (Step 2)
  --exclude='/frigate/' # ring buffer — no backup anywhere
  --exclude='/temp/' # scratch — no backup anywhere
  # ---- regenerable services: live-only on sdc, no offsite (2026-06-01) ----
  # See header carve-out. --delete reaps any existing copies from sda on
  # the next run; a one-off direct delete already cleared them from Synology.
  --exclude='/ollama/' # LLM models — re-pullable
  --exclude='/prometheus-backup/' # metrics TSDB snapshots
  --exclude='/audiblez/' # generated audiobooks
  --exclude='/ebook2audiobook/' # generated audiobooks
  # ---- Synology / Windows / macOS cruft ----
  # Deliberately unanchored: these files appear inside arbitrary
  # subdirectories, and a leading "/" would only match them at the top of
  # the transfer.
  --exclude='@eaDir/'
  --exclude='*@synoeastream'
  --exclude='.DS_Store'
  --exclude='Thumbs.db'
)
# log MESSAGE...
# Print a UTC-timestamped line to stdout and append the same line to $LOG.
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" | tee -a "$LOG"
}

# warn MESSAGE...
# Same as log, with a "WARN: " prefix on the message.
warn() {
  log "WARN: $*"
}
# Locked manifest append (shared with daily-backup) — see daily-backup.sh
# for the rationale. The exclusive flock keeps appends from interleaving if
# a long nfs-mirror run (timer: daily 02:00 plus up to 15 min random delay)
# is still writing when daily-backup (05:00) starts.
MANIFEST_LOCK="${MANIFEST}.lock"

# manifest_append
# Append stdin to $MANIFEST while holding an exclusive lock on $MANIFEST_LOCK.
# The body runs in a subshell, so the lock fd is closed (and the lock
# released) as soon as the function returns.
manifest_append() (
  exec 200>"${MANIFEST_LOCK}"
  flock -x 200
  cat >> "${MANIFEST}"
)
# push_metrics [STATUS] [BYTES]
# Best-effort push of the run timestamp, status and byte count to the
# Pushgateway job ${PUSHGATEWAY_JOB}. Both arguments default to 0. A failed
# or unreachable push is swallowed on purpose so monitoring problems never
# fail the mirror itself.
push_metrics() {
  local status="${1:-0}" bytes="${2:-0}"
  printf 'nfs_mirror_last_run_timestamp %s\nnfs_mirror_last_status %s\nnfs_mirror_bytes %s\n' \
    "$(date +%s)" "${status}" "${bytes}" \
    | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
      "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
}
KILLED=""
STAMP=""
# Set to 1 only after THIS process created $LOCKFILE. cleanup() must not
# remove a lockfile owned by another instance: the EXIT trap is armed before
# the lock attempt, so the "another instance running" exit path below also
# runs cleanup().
LOCK_HELD=""

# EXIT handler: release our lock, drop the rsync marker file, and report an
# aborted run (status=2) when we were terminated by TERM/INT.
cleanup() {
  if [ -n "$LOCK_HELD" ]; then
    rm -f "$LOCKFILE"
  fi
  if [ -n "$STAMP" ]; then
    rm -f "$STAMP"
  fi
  if [ -n "$KILLED" ]; then
    push_metrics 2 0 # status=2 = aborted
  fi
}
trap cleanup EXIT
trap 'KILLED=1; exit 143' TERM INT

# noclobber makes the redirect fail if the lockfile already exists, giving an
# atomic create-or-fail without needing flock.
if ! ( set -o noclobber; echo $$ > "$LOCKFILE" ) 2>/dev/null; then
  log "FATAL: another instance running (pid $(cat "$LOCKFILE" 2>/dev/null || echo unknown))"
  exit 1
fi
LOCK_HELD=1
# ---- Pre-flight: refuse to run without the backup disk or the source ----
mountpoint -q /mnt/backup || { log "FATAL: /mnt/backup not mounted"; push_metrics 1 0; exit 1; }
[ -d "$SRC" ] || { log "FATAL: source $SRC missing"; push_metrics 1 0; exit 1; }
log "=== mirror starting: $SRC$DST ==="
log "skip: immich (Synology direct), frigate/temp (no backup), anca-elements, ollama/prometheus-backup/audiblez/ebook2audiobook (regenerable, live-only)"

# Marker file used to identify files written by this rsync run, so we can append
# their paths to the offsite-sync manifest. Touch BEFORE rsync; `find -cnewer` AFTER.
STAMP=$(mktemp)
RSYNC_RC=0
# pipefail makes the pipeline fail when rsync fails; PIPESTATUS[0] then
# recovers rsync's own exit code from behind the tee.
rsync \
  -rlt --delete -H \
  --no-perms --no-owner --no-group \
  --info=stats2 \
  "${EXCLUDES[@]}" \
  "$SRC" "$DST" 2>&1 | tee -a "$LOG" || RSYNC_RC=${PIPESTATUS[0]}
DST_BYTES=$(df -B1 --output=used /mnt/backup | tail -1)

# Capture files that rsync created/modified and feed them to the offsite-sync
# manifest so daily Step 1 incremental picks them up tomorrow morning.
# Use -cnewer (ctime), not -newer (mtime): rsync -t preserves SOURCE mtime
# on the dest, so freshly-written files with old source mtime look "older"
# than $STAMP and -newer misses them. ctime is set when the inode is written,
# regardless of -t, so it correctly identifies what this run created.
# (Bug hit 2026-05-26 full bypass-list mirror: 800k files copied, manifest
# captured only 2 entries → forced a .force-full-sync to recover.)
#
# This runs even when rsync exited non-zero: files transferred before the
# failure will not be copied again by the next run, so their ctime would be
# older than the next $STAMP and they would never enter the manifest —
# reaching Synology only via the monthly full sync.
#
# `|| true`: find's errors are already discarded (2>/dev/null), but a
# non-zero find would otherwise propagate through pipefail and `set -e`
# would end the script before the status log and push_metrics below.
NEW_COUNT=$(find /mnt/backup -cnewer "$STAMP" -type f \
  ! -path '/mnt/backup/.changed-files' \
  ! -path '/mnt/backup/.changed-files.lock' \
  ! -path '/mnt/backup/.lv-pvc-mapping.json' \
  ! -path '/mnt/backup/.nfs-changes.log' \
  ! -path '/mnt/backup/.last-offsite-sync' \
  ! -path '/mnt/backup/.force-full-sync' \
  -printf '%P\n' 2>/dev/null | tee >(manifest_append) | wc -l) || true
NEW_COUNT=${NEW_COUNT:-0}

if [ "$RSYNC_RC" -eq 0 ]; then
  log "=== mirror complete; ${NEW_COUNT} files added to offsite manifest ==="
  log "/mnt/backup used: $(df -h --output=used /mnt/backup | tail -1 | tr -d ' ')"
  push_metrics 0 "$DST_BYTES"
else
  log "=== mirror failed: rsync exited $RSYNC_RC ==="
  log "${NEW_COUNT} files written before the failure were still added to offsite manifest"
  push_metrics 1 "$DST_BYTES"
  exit "$RSYNC_RC"
fi

16
scripts/nfs-mirror.timer Normal file
View file

@ -0,0 +1,16 @@
[Unit]
Description=Daily local NFS mirror to /mnt/backup
[Timer]
# Daily 02:00 — runs 3h before daily-backup (05:00) so the .changed-files
# manifest is populated and offsite-sync (06:00) ships both legs' deltas.
# Switched from weekly Mon 04:00 → daily 2026-05-26: steady-state delta is
# 10-20 min of mostly-metadata rsync, so the IO cost is negligible and it
# cuts non-CronJob app-data RPO from 7d to ~24h (matters for nextcloud
# shared files, audiobookshelf library, mailserver Maildir, etc.).
OnCalendar=*-*-* 02:00:00
Persistent=true
RandomizedDelaySec=15min
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,97 @@
#!/bin/bash
# Simple and reliable containerd registry mirror manager
# Usage: ./registry-mirror.sh [--add|--remove] [mirror_url]
# Docs - https://github.com/containerd/containerd/blob/main/docs/cri/registry.md
# To apply on all nodes (tail +3 skips master node):
# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --add http://10.0.20.10:5000"; done
# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --remove http://10.0.20.10:5000"; done
set -euo pipefail
CONFIG_FILE="/etc/containerd/config.toml"
BACKUP_FILE="/etc/containerd/config.toml.bak"
# Validate environment
[ -f "$CONFIG_FILE" ] || { echo "Error: $CONFIG_FILE not found" >&2; exit 1; }
[ "$(id -u)" -eq 0 ] || { echo "Error: Requires root privileges" >&2; exit 1; }
# add_mirror MIRROR_URL
# Register MIRROR_URL as a docker.io mirror endpoint in $CONFIG_FILE.
#   - No-op (and no backup rewrite) when an endpoint line already lists the URL.
#   - Otherwise $CONFIG_FILE is copied to $BACKUP_FILE, then the endpoint is
#     added under the existing docker.io mirrors section, or a new docker.io
#     section is created after the registry.mirrors header, or — if neither
#     header exists — appended at the end of the file.
# Returns 0 on success; under `set -e` any failing grep/sed/cp aborts the script.
# NOTE(review): if the docker.io section already carries an `endpoint = [...]`
# line for a different mirror, this adds a second `endpoint` key rather than
# extending the existing array — confirm containerd accepts that before
# relying on multiple mirrors.
add_mirror() {
  local mirror_url="$1"
  # Header patterns are single-quoted so the literal double quotes containerd
  # writes around "io.containerd.grpc.v1.cri" / "docker.io" stay part of the
  # pattern. (Inside a double-quoted shell string an unescaped inner " ends
  # the string, silently dropping the quote characters from the pattern.)
  local mirrors_re='^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors]'
  local docker_re='^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker\.io"]'
  local docker_header='[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]'
  local endpoint_line="  endpoint = [\"${mirror_url}\"]"
  local url_re

  # Escape BRE metacharacters so the URL is matched literally by grep.
  url_re=$(printf '%s' "$mirror_url" | sed 's|[[\.*^$]|\\&|g')

  # Check first, so a no-op run does not overwrite the last real backup.
  if grep -q "endpoint = \[.*\"${url_re}\".*]" "$CONFIG_FILE"; then
    echo "Mirror already exists: $mirror_url"
    return 0
  fi

  # Create backup
  cp -p "$CONFIG_FILE" "$BACKUP_FILE"

  if grep -q "$docker_re" "$CONFIG_FILE"; then
    # Append to existing section (sed `a\` + newline form keeps the indent).
    sed -i "/${docker_re}/a\\
${endpoint_line}" "$CONFIG_FILE"
  elif grep -q "$mirrors_re" "$CONFIG_FILE"; then
    # Add new section after registry.mirrors
    sed -i "/${mirrors_re}/a\\
${docker_header}\\
${endpoint_line}" "$CONFIG_FILE"
  else
    # Add complete new section
    printf '\n%s\n%s\n' "$docker_header" "$endpoint_line" >> "$CONFIG_FILE"
  fi
  echo "Added mirror: $mirror_url"
}
# remove_mirror MIRROR_URL
# Back up $CONFIG_FILE to $BACKUP_FILE, delete every `endpoint = [...]` line
# that lists MIRROR_URL, then drop the docker.io mirrors section and squeeze
# runs of blank lines.
# NOTE(review): the section clean-up deletes everything between the docker.io
# header and the next line starting with "[" — i.e. the whole section, not
# only an empty one. Confirm that is intended if the section can hold other
# settings.
remove_mirror() {
  local mirror_url="$1"
  local url_re

  # Escape BRE metacharacters AND "/" so the URL can sit inside a /.../ sed
  # address. Unescaped, the slashes of "http://..." end the address early
  # and sed aborts with "unknown command".
  url_re=$(printf '%s' "$mirror_url" | sed 's|[[\.*^$/]|\\&|g')

  # Create backup
  cp -p "$CONFIG_FILE" "$BACKUP_FILE"
  # Remove the specific mirror URL
  sed -i "/endpoint = \[.*\"${url_re}\".*]/d" "$CONFIG_FILE"
  # Clean up the docker.io section: first its body, then its header
  sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker.io"\]$/,/^\[/{//!d}' "$CONFIG_FILE"
  sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker.io"\]$/d' "$CONFIG_FILE"
  # Clean up multiple empty lines
  sed -i '/^$/N;/^\n$/D' "$CONFIG_FILE"
  echo "Removed mirror: $mirror_url"
}
# restart_containerd
# Restart the containerd unit via systemctl and report the outcome.
# Returns 0 when systemctl succeeds, 1 (with a message on stderr) otherwise.
restart_containerd() {
  echo "Restarting containerd..."
  if ! systemctl restart containerd; then
    echo "Error: Failed to restart containerd" >&2
    return 1
  fi
  echo "Successfully restarted containerd"
  return 0
}
# Command-line dispatch.
# "${1:-}" / "${2:-}" instead of "$1" / "$2": the script runs under `set -u`,
# so a missing argument would otherwise abort with "unbound variable" before
# the usage text or the "Mirror URL required" message could be printed.
case "${1:-}" in
  --add)
    [ -n "${2:-}" ] || { echo "Error: Mirror URL required" >&2; exit 1; }
    add_mirror "$2"
    restart_containerd || exit 1
    ;;
  --remove)
    [ -n "${2:-}" ] || { echo "Error: Mirror URL required" >&2; exit 1; }
    remove_mirror "$2"
    restart_containerd || exit 1
    ;;
  *)
    echo "Usage: $0 [--add|--remove] [mirror_url]" >&2
    echo "Examples:" >&2
    echo " Add mirror: $0 --add https://registry.example.com" >&2
    echo " Remove mirror: $0 --remove https://registry.example.com" >&2
    exit 1
    ;;
esac
exit 0

View file

@ -0,0 +1,11 @@
[Unit]
Description=Daily offsite sync: sda + NFS changes to Synology
After=network-online.target daily-backup.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/offsite-sync-backup
StandardOutput=journal
StandardError=journal
SyslogIdentifier=offsite-sync-backup
TimeoutStartSec=7200

View file

@ -0,0 +1,187 @@
#!/usr/bin/env bash
# offsite-sync-backup — Sync backups to Synology NAS
# Deploy to PVE host at /usr/local/bin/offsite-sync-backup
# Schedule: Daily 06:00 via systemd timer (After=daily-backup.service)
#
# Two sync paths:
# Step 1: sda (/mnt/backup) → Synology pve-backup/ (PVC snapshots, pfsense, pve-config, sqlite)
# Step 2: NFS (/srv/nfs, /srv/nfs-ssd) → Synology nfs/, nfs-ssd/ (inotify change-tracked)
set -euo pipefail
# --- Configuration ---
BACKUP_ROOT="/mnt/backup"
SYNOLOGY="Administrator@192.168.1.13"
PVE_BACKUP_DEST="${SYNOLOGY}:/volume1/Backup/Viki/pve-backup"
NFS_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs"
NFS_SSD_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs-ssd"
MANIFEST="${BACKUP_ROOT}/.changed-files"
NFS_CHANGE_LOG="${BACKUP_ROOT}/.nfs-changes.log"
PUSHGATEWAY="${OFFSITE_SYNC_PUSHGATEWAY:-http://10.0.20.100:30091}"
PUSHGATEWAY_JOB="offsite-backup-sync"
LOCKFILE="/run/offsite-sync-backup.lock"
# --- Logging ---
# log MESSAGE...
# Print a local-time-stamped line on stdout (captured by the journal).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# warn MESSAGE...
# Same line format with a "WARN: " prefix, written to stderr.
warn() {
  log "WARN: $*" >&2
}
# --- Locking ---
# LOCK_HELD is set only after this process created the lockfile. The EXIT
# trap is armed before the lock attempt, so without this guard the
# "Another instance running" exit below would delete the running instance's
# lockfile and let a third run start concurrently.
LOCK_HELD=""
cleanup() {
  if [ -n "${LOCK_HELD}" ]; then
    rm -f "${LOCKFILE}"
  fi
}
trap cleanup EXIT
# noclobber turns the redirect into an atomic create-or-fail.
if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
  log "FATAL: Another instance running"; exit 1
fi
LOCK_HELD=1
# --- Main ---
log "=== Offsite sync starting ==="
STATUS=0
if ! mountpoint -q "${BACKUP_ROOT}"; then
log "FATAL: ${BACKUP_ROOT} is not mounted"; exit 1
fi
if ! timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 "${SYNOLOGY}" true 2>/dev/null; then
log "FATAL: Cannot SSH to Synology"
echo "backup_last_success_timestamp 0" | \
curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
"${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
exit 1
fi
DAY_OF_MONTH=$(date +%d)
# ============================================================
# STEP 1: sda → Synology pve-backup/ (PVC snapshots, pfsense, pve-config)
# ============================================================
log "--- Step 1: sda → Synology pve-backup/ ---"
# Trigger: monthly cleanup window OR daily-backup signalled the manifest grew
# past its cap (Synology was unreachable too long for incremental to keep up).
FORCE_FULL_FLAG="${BACKUP_ROOT}/.force-full-sync"
FORCE_FULL=""
if [ -f "${FORCE_FULL_FLAG}" ]; then
  FORCE_FULL=1
fi
if [ "${DAY_OF_MONTH}" -le 7 ] || [ -n "${FORCE_FULL}" ]; then
  if [ -n "${FORCE_FULL}" ]; then
    log "Forced full sync (manifest size cap tripped)..."
  else
    log "Monthly full sync (1st Sunday)..."
  fi
  # No -z on LAN: gigabit hop to 192.168.1.13 doesn't benefit from compression
  # and burns CPU on the PVE host that's already busy with cluster IO.
  if rsync -rlt --delete --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r \
    --exclude='.changed-files' \
    --exclude='.changed-files.lock' \
    --exclude='.last-offsite-sync' \
    --exclude='.lv-pvc-mapping.json' \
    --exclude='.nfs-changes.log' \
    --exclude='.force-full-sync' \
    --exclude='/anca-elements/' \
    "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1; then
    # Only a completed full sync satisfies the force-full request.
    rm -f "${FORCE_FULL_FLAG}"
  else
    STATUS=1
    if [ -n "${FORCE_FULL}" ]; then
      # Keep the flag: outside the monthly window nothing else would
      # trigger the full sync that was asked for.
      warn "Full sync failed — keeping ${FORCE_FULL_FLAG} so the next run retries"
    fi
  fi
elif [ -s "${MANIFEST}" ]; then
  MANIFEST_LINES=$(wc -l < "${MANIFEST}")
  log "Incremental sync (${MANIFEST_LINES} files from manifest)..."
  # anca-elements: now in Immich (canonical); /mnt/backup copy deleted
  # 2026-05-26. Exclude retained as a safety belt in case it re-appears.
  rsync -rlt --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r --files-from="${MANIFEST}" \
    --exclude='anca-elements/' \
    "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1 || STATUS=1
else
  log "No changed files in manifest, nothing to sync"
fi
# ============================================================
# STEP 2: NFS → Synology nfs/ + nfs-ssd/ (inotify change-tracked, FILTERED)
# ============================================================
#
# DESIGN: Step 2 only carries paths that BYPASS the sda mirror. As of
# 2026-05-26 that's just /srv/nfs/immich/ (1.5T, doesn't fit on sda).
# Everything else under /srv/nfs/ now flows through sda via nfs-mirror,
# reaching Synology via Step 1 (sda → pve-backup/). frigate and temp are
# excluded from both legs — intentionally NOT backed up.
#
# nfs-ssd: as of 2026-06-01 this leg is ALSO immich-only. ollama (59G) and
# llamacpp (26G) on the SSD were filling the offsite Synology (5.3T hit 97%)
# for re-pullable model blobs, so they're dropped — live copy stays on the
# SSD, no offsite. The monthly --delete pass below reaps them from Synology
# nfs-ssd/; a one-off direct delete cleared the bulk on 2026-06-01.
#
# Keep this aligned with /usr/local/bin/nfs-mirror's EXCLUDES. Both legs now
# carry immich only; everything else is either curated through sda (Step 1)
# or intentionally live-only (frigate, temp, ollama, llamacpp, audiblez,
# ebook2audiobook, prometheus-backup).
log "--- Step 2: NFS → Synology (immich-only on both nfs/ and nfs-ssd/) ---"
# Regex matching paths NOT on sda (must reach Synology directly).
NFS_SDA_BYPASS_RE='^/srv/nfs/immich/'
# rsync include/exclude args for the monthly full sync (HDD).
NFS_FULL_INCLUDES=(
  --include='/immich/' --include='/immich/***'
  --exclude='*'
)
# Set to 1 when any Step 2 rsync fails. The inotify change log is only
# truncated after a fully successful pass; otherwise the recorded changes
# would be dropped without ever reaching Synology (until the next monthly
# full sync) — the same retry contract Step 1 gives the manifest.
NFS_SYNC_FAILED=0
if [ "${DAY_OF_MONTH}" -le 7 ]; then
  # Monthly: full sync with --delete for cleanup, restricted to bypass-list.
  # --delete here will reap legacy dirs on Synology (frigate, ollama,
  # audiblez, ebook2audiobook, *-backup, prometheus, loki, temp,
  # alertmanager) since they're no longer in NFS_FULL_INCLUDES.
  log "Monthly full NFS sync (immich-only — reaps legacy bypass dirs)..."
  if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
    log " OK: nfs/ full sync (immich-only)"
  else
    warn "nfs/ full sync failed"
    STATUS=1
    NFS_SYNC_FAILED=1
  fi
  # nfs-ssd: immich-only (2026-06-01) — --delete reaps legacy ollama/llamacpp.
  if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
    log " OK: nfs-ssd/ full sync (immich-only)"
  else
    warn "nfs-ssd/ full sync failed"
    STATUS=1
    NFS_SYNC_FAILED=1
  fi
  if [ "${NFS_SYNC_FAILED}" -eq 0 ]; then
    > "${NFS_CHANGE_LOG}"
  else
    warn "NFS change log preserved for retry"
  fi
elif [ -s "${NFS_CHANGE_LOG}" ]; then
  # Incremental: only sync changed files matching the bypass leg (immich).
  # Private scratch dir instead of fixed names under /tmp (this runs as a
  # privileged job; predictable /tmp paths can be pre-created by others).
  NFS_TMP=$(mktemp -d)
  sort -u "${NFS_CHANGE_LOG}" > "${NFS_TMP}/changes"
  # HDD NFS — include only /srv/nfs/immich/ paths.
  # `|| true` is REQUIRED: if the last iteration's `[ -f "$f" ]` is false
  # (file was deleted between inotify capture and now — e.g., immich
  # encoded-video temp file that got cleaned up), the while loop returns
  # 1, pipefail propagates, and `set -e` kills the script silently before
  # reaching the rsync. Matches the SSD section's pattern below.
  grep -E "${NFS_SDA_BYPASS_RE}" "${NFS_TMP}/changes" | \
    while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs/}"; done \
    > "${NFS_TMP}/sync-nfs.list" 2>/dev/null || true
  NFS_COUNT=$(wc -l < "${NFS_TMP}/sync-nfs.list" 2>/dev/null || echo 0)
  if [ "${NFS_COUNT:-0}" -gt 0 ]; then
    if rsync -rlt --files-from="${NFS_TMP}/sync-nfs.list" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
      log " OK: nfs/ (${NFS_COUNT} immich files)"
    else
      warn "nfs/ incremental failed"
      STATUS=1
      NFS_SYNC_FAILED=1
    fi
  fi
  # SSD NFS — immich-only (2026-06-01); ollama/llamacpp are live-only, no offsite.
  grep '^/srv/nfs-ssd/immich/' "${NFS_TMP}/changes" | \
    while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs-ssd/}"; done \
    > "${NFS_TMP}/sync-nfs-ssd.list" 2>/dev/null || true
  SSD_COUNT=$(wc -l < "${NFS_TMP}/sync-nfs-ssd.list" 2>/dev/null || echo 0)
  if [ "${SSD_COUNT:-0}" -gt 0 ]; then
    if rsync -rlt --files-from="${NFS_TMP}/sync-nfs-ssd.list" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
      log " OK: nfs-ssd/ (${SSD_COUNT} files)"
    else
      warn "nfs-ssd/ incremental failed"
      STATUS=1
      NFS_SYNC_FAILED=1
    fi
  fi
  TOTAL=$(wc -l < "${NFS_TMP}/changes")
  log " Processed ${TOTAL} change events (${NFS_COUNT} nfs/immich + ${SSD_COUNT} nfs-ssd files synced)"
  if [ "${NFS_SYNC_FAILED}" -eq 0 ]; then
    # NOTE(review): events appended between the `sort -u` snapshot above and
    # this truncate are dropped; they are only picked up by the monthly
    # full sync.
    > "${NFS_CHANGE_LOG}"
  else
    warn "NFS change log preserved for retry"
  fi
  rm -rf -- "${NFS_TMP}"
else
  log " No NFS changes to sync"
fi
# ============================================================
# Finish
# ============================================================
if [ "${STATUS}" -eq 0 ]; then
  touch "${BACKUP_ROOT}/.last-offsite-sync"
  > "${MANIFEST}"
  log "=== Offsite sync complete (success) ==="
else
  warn "Offsite sync had errors — manifest preserved for retry"
  log "=== Offsite sync complete (with errors) ==="
fi
# Push run metrics (best-effort; a failed push never changes the exit code).
# backup_last_success_timestamp is only sent after a clean run, so a failing
# sync no longer refreshes the "last success" time. curl --data-binary issues
# a POST, which is expected to leave metrics absent from the payload at their
# previously pushed value — NOTE(review): confirm against the Pushgateway
# version in use.
{
  if [ "${STATUS}" -eq 0 ]; then
    echo "backup_last_success_timestamp $(date +%s)"
  fi
  echo "offsite_sync_last_status ${STATUS}"
} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
exit "${STATUS}"

View file

@ -0,0 +1,10 @@
[Unit]
Description=Daily offsite sync: sda + NFS changes to Synology
[Timer]
OnCalendar=*-*-* 06:00:00
Persistent=true
RandomizedDelaySec=300
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,89 @@
#!/bin/sh
# parse-postmortem-todos.sh — Extract auto-implementable TODOs from a post-mortem markdown file
# Usage: bash scripts/parse-postmortem-todos.sh docs/post-mortems/2026-04-14-foo.md
# Output: JSON with file path and list of TODOs
#
# Supports two table formats:
# New: | Priority | Action | Type | Details | Status |
# Old: | Action | Status | Details | (infers type from action text)
set -eu
PM_FILE="${1:?Usage: $0 <post-mortem.md>}"
if [ ! -f "$PM_FILE" ]; then
echo '{"file": "", "todos": [], "error": "File not found"}' >&2
exit 1
fi
# parse_todos FILE
# Print a JSON summary of the TODO rows found in the post-mortem markdown FILE:
#   todos    — rows whose type is Alert/Config/Monitor (auto-implementable)
#   skipped  — every other TODO row
# plus the corresponding counts. The Python program is passed on stdin through
# a quoted heredoc so the shell never touches its backslashes or quotes.
parse_todos() {
  python3 - "$1" <<'PY'
import json
import re
import sys

pm_file = sys.argv[1]
with open(pm_file) as f:
    content = f.read()

safe_types = {'Alert', 'Config', 'Monitor'}
todos = []

# Format 1 (new template): | Priority | Action | Type | Details | Status |
pattern_new = r'\|\s*(P[0-3])\s*\|\s*(.+?)\s*\|\s*(\w+)\s*\|\s*(.+?)\s*\|\s*TODO\s*\|'
for priority, action, todo_type, details in re.findall(pattern_new, content):
    todos.append({
        'priority': priority.strip(),
        'action': action.strip(),
        'type': todo_type.strip(),
        'details': details.strip(),
        'safe': todo_type.strip() in safe_types
    })

# Ordered type-inference rules for the old table format; the first rule with
# a matching pattern wins. Patterns are regular expressions searched in the
# lower-cased action text, which is what lets 'add.*option' match
# "add ... option" (a plain substring test never could).
TYPE_RULES = [
    ('Alert', [r'prometheusrule', r'alert', r'alerting']),
    ('Monitor', [r'uptime kuma', r'monitor', r'ping', r'tcp check']),
    ('Config', [r'config', r'manage', r'add.*option', r'document', r'nfs\.conf']),
    ('Migration', [r'migrate', r'move']),
    ('Investigation', [r'review', r'investigate', r'verify']),
]


def infer_type(action_lower):
    """Return the TODO type inferred from a lower-cased action string."""
    for todo_type, patterns in TYPE_RULES:
        if any(re.search(p, action_lower) for p in patterns):
            return todo_type
    return 'Config'  # default to Config for ambiguous items


# Format 2 (old): | Action | TODO/Done | Details | or | Action | Owner | Status |
# Look for rows with TODO in any column
if not todos:
    pattern_old = r'\|\s*(.+?)\s*\|\s*TODO\s*\|\s*(.+?)\s*\|'
    for action, details in re.findall(pattern_old, content):
        action = action.strip()
        details = details.strip()
        # Skip header rows and clean up leading pipes
        if action.startswith('--') or action.lower() == 'action':
            continue
        action = action.lstrip('| ').strip()
        # Infer type from action text
        todo_type = infer_type(action.lower())
        # The old format carries no priority column; use a fixed default.
        priority = 'P2'  # default
        todos.append({
            'priority': priority,
            'action': action,
            'type': todo_type,
            'details': details,
            'safe': todo_type in safe_types
        })

safe_todos = [t for t in todos if t['safe']]
unsafe_todos = [t for t in todos if not t['safe']]
result = {
    'file': pm_file,
    'todos': safe_todos,
    'skipped': unsafe_todos,
    'total_todos_in_doc': len(todos),
    'safe_todos': len(safe_todos),
    'skipped_todos': len(unsafe_todos)
}
print(json.dumps(result, indent=2))
PY
}

parse_todos "$PM_FILE"

View file

@ -0,0 +1,236 @@
<?php
// pfSense HAProxy bootstrap — configures the mailserver PROXY-v2 path
// (bd code-yiu, Phases 2/3 + 5).
//
// WHY THIS EXISTS
// pfSense HAProxy config is stored XML-in-`/cf/conf/config.xml` under
// `<installedpackages><haproxy>`. That file IS picked up by the nightly
// `daily-backup` on the PVE host (see `scripts/daily-backup.sh` → `scp
// root@10.0.20.1:/cf/conf/config.xml`) and synced to Synology. This script
// is the canonical reproducer: run it to rebuild the pfSense HAProxy config
// from scratch (DR restore, fresh pfSense install, etc.).
//
// WHAT IT BUILDS
// 4 backend pools — one per mail port:
// mailserver_nodes_smtp → k8s-node1..4:30125 (container :2525 postscreen)
// mailserver_nodes_smtps → k8s-node1..4:30126 (container :4465 smtps)
// mailserver_nodes_sub → k8s-node1..4:30127 (container :5587 submission)
// mailserver_nodes_imaps → k8s-node1..4:30128 (container :10993 IMAPS)
// Each server uses `send-proxy-v2` and a TCP health-check every 5s (checkinter=5000).
// 4 frontends on pfSense 10.0.20.1:{25,465,587,993} TCP mode.
// + 1 legacy test frontend on :2525 (kept for validation; safe to remove later).
//
// USAGE (on pfSense host, via SSH as admin)
// scp infra/scripts/pfsense-haproxy-bootstrap.php admin@10.0.20.1:/tmp/
// ssh admin@10.0.20.1 'php /tmp/pfsense-haproxy-bootstrap.php'
//
// IDEMPOTENCY
// Removes any existing entries named mailserver_* before re-adding, so
// repeat runs are safe and behave as reset-to-declared.
require_once('/etc/inc/config.inc');
require_once('/usr/local/pkg/haproxy/haproxy.inc');
require_once('/usr/local/pkg/haproxy/haproxy_utils.inc');
global $config;
parse_config(true);
if (!is_array($config['installedpackages']['haproxy'])) {
$config['installedpackages']['haproxy'] = [];
}
$h = &$config['installedpackages']['haproxy'];
$h['enable'] = 'yes';
$h['maxconn'] = '1000';
// Our declared object names (anything starting with mailserver_ is ours)
$POOL_NAMES = [
'mailserver_nodes', // legacy (Phase 2/3 test)
'mailserver_nodes_smtp',
'mailserver_nodes_smtps',
'mailserver_nodes_sub',
'mailserver_nodes_imaps',
];
$FRONTEND_NAMES = [
'mailserver_proxy_test', // legacy (Phase 2/3 test, :2525)
'mailserver_proxy_25',
'mailserver_proxy_465',
'mailserver_proxy_587',
'mailserver_proxy_993',
];
// k8s workers. Not in the cluster: master (control-plane) and node5
// (doesn't exist in this topology).
$NODES = [
['k8s-node1', '10.0.20.101'],
['k8s-node2', '10.0.20.102'],
['k8s-node3', '10.0.20.103'],
['k8s-node4', '10.0.20.104'],
];
// Build a pool with optional split healthcheck path.
//
// $check_port: if non-null, HAProxy sends health probes to that NodePort
// (which Service `mailserver-proxy` maps to the pod's stock no-PROXY
// listener — see infra/stacks/mailserver/.../mailserver_proxy ports
// 30145/30146/30147). Real client traffic still goes to $nodeport with
// PROXY v2 framing.
// $check_type: 'TCP' for plain accept-on-port checks, 'ESMTP' for
// `option smtpchk EHLO <monitor_domain>` (real SMTP banner+EHLO+250).
//
// Why split: smtpd-proxy587/4465 fatal on every PROXY-v2-aware health
// probe with `smtpd_peer_hostaddr_to_sockaddr: ... Servname not supported`
// — the daemon respawns get throttled by Postfix master and real clients
// land mid-respawn → 6s TCP timeout. Routing health probes to the stock
// no-PROXY port sidesteps the bug entirely while data path still gets
// PROXY v2 for CrowdSec/Postfix client-IP visibility. The HAProxy package
// has no `checkport` field, so `port N` is appended via the server's
// `advanced` string (HAProxy parses server keywords in any order).
/**
 * Build one HAProxy backend-pool config array (an entry for
 * $h['ha_pools']['item']).
 *
 * @param string      $name           Pool name.
 * @param string      $nodeport       Port assigned to every server entry.
 * @param array       $nodes          List of [name, address] pairs; one server entry is built per pair.
 * @param string      $check_type     Stored as the pool's 'check_type' (default 'TCP').
 * @param string|null $check_port     When non-null, " port {$check_port}" is appended after
 *                                    "send-proxy-v2" in each server's 'advanced' string.
 * @param string      $monitor_domain Stored as the pool's 'monitor_domain'.
 *
 * @return array Pool definition with the server list under ['ha_servers']['item'].
 */
function build_pool(
string $name,
string $nodeport,
array $nodes,
string $check_type = 'TCP',
?string $check_port = null,
string $monitor_domain = ''
): array {
// Per-server keyword string: always send-proxy-v2, plus the optional check port.
$advanced_check = $check_port !== null
? "send-proxy-v2 port {$check_port}"
: 'send-proxy-v2';
$servers = [];
foreach ($nodes as $n) {
// $n[0] is the server name, $n[1] its address.
$servers[] = [
'name' => $n[0],
'address' => $n[1],
'port' => $nodeport,
'weight' => '10',
'ssl' => '',
// 5s = sub-block-window failover when a NodePort goes sour.
// Safe to be aggressive once health probes don't fatal smtpd.
'checkinter' => '5000',
'advanced' => $advanced_check,
'status' => 'active',
];
}
return [
'name' => $name,
'balance' => 'roundrobin',
'check_type' => $check_type,
'monitor_domain' => $monitor_domain,
'checkinter' => '5000',
'retries' => '3',
'ha_servers' => ['item' => $servers],
'advanced_bind' => '',
'persist_cookie_enabled' => '',
'transparent_clientip' => '',
'advanced' => '',
];
}
/**
 * Build one HAProxy frontend config array (an entry for
 * $h['ha_backends']['item']).
 *
 * @param string $name    Frontend name.
 * @param string $descr   Free-text description.
 * @param string $extaddr Listen address, stored as 'extaddr'.
 * @param string $port    Listen port, stored as 'extaddr_port'.
 * @param string $pool    Name of the backend pool, stored as 'backend_serverpool'.
 *
 * @return array Frontend definition: status 'active', type 'tcp', a single
 *               listen address entry, all remaining options left empty.
 */
function build_frontend(string $name, string $descr, string $extaddr, string $port, string $pool): array {
return [
'name' => $name,
'descr' => $descr,
'status' => 'active',
'secondary' => '',
'type' => 'tcp',
// Exactly one listen address/port entry per frontend.
'a_extaddr' => ['item' => [[
'extaddr' => $extaddr,
'extaddr_port' => $port,
'extaddr_ssl' => '',
'extaddr_advanced' => '',
]]],
'backend_serverpool' => $pool,
'ha_acls' => '',
'dontlognull'=> '',
'httpclose' => '',
'forwardfor' => '',
'advanced' => '',
];
}
// ── Backend pools ───────────────────────────────────────────────────────
if (!is_array($h['ha_pools'])) $h['ha_pools'] = ['item' => []];
if (!is_array($h['ha_pools']['item'])) $h['ha_pools']['item'] = [];
$h['ha_pools']['item'] = array_values(array_filter(
$h['ha_pools']['item'],
fn($p) => !in_array($p['name'] ?? '', $POOL_NAMES, true)
));
// Legacy test pool (still used by the :2525 test frontend for manual SMTP roundtrip).
$h['ha_pools']['item'][] = build_pool('mailserver_nodes', '30125', $NODES);
// Production pools — one per mail port.
//
// All SMTP/SMTPS/Submission backends use plain TCP checks against
// dedicated non-PROXY healthcheck NodePorts (30145/30146/30147 → pod
// stock 25/465/587) so probes hit the no-PROXY listeners and avoid
// the smtpd_peer_hostaddr_to_sockaddr fatal that fires on PROXY-v2
// LOCAL frames. Real client traffic still goes to 30125-30128 with
// PROXY v2 for client-IP visibility.
//
// We tried `option smtpchk EHLO` initially — it works on the plain
// `submission` daemon (587) but flaps the `postscreen` listener on
// port 25 (multi-line greet + DNSBL silence + anti-pre-greet
// detection makes HAProxy's simple smtpchk parser hit L7RSP). A
// plain TCP accept-on-port check is enough for both: HAProxy still
// gets fast failover when the listener actually goes away, and we
// stop triggering the Postfix fatal entirely.
//
// IMAPS stays on its existing TCP-check-with-PROXY-frame for now —
// Dovecot's PROXY parser doesn't show the same fatal pattern; adding
// a separate IMAP healthcheck path would require another svc port.
$h['ha_pools']['item'][] = build_pool('mailserver_nodes_smtp', '30125', $NODES, 'TCP', '30145');
$h['ha_pools']['item'][] = build_pool('mailserver_nodes_smtps', '30126', $NODES, 'TCP', '30146');
$h['ha_pools']['item'][] = build_pool('mailserver_nodes_sub', '30127', $NODES, 'TCP', '30147');
$h['ha_pools']['item'][] = build_pool('mailserver_nodes_imaps', '30128', $NODES);
// ── Frontends ───────────────────────────────────────────────────────────
if (!is_array($h['ha_backends'])) $h['ha_backends'] = ['item' => []];
if (!is_array($h['ha_backends']['item'])) $h['ha_backends']['item'] = [];
$h['ha_backends']['item'] = array_values(array_filter(
$h['ha_backends']['item'],
fn($f) => !in_array($f['name'] ?? '', $FRONTEND_NAMES, true)
));
// Legacy test frontend — :2525 — retained so SMTP roundtrip tests keep working
// without touching the real :25. Safe to remove once fully validated.
$h['ha_backends']['item'][] = build_frontend(
'mailserver_proxy_test',
'code-yiu Phase 2/3 test — PROXY v2 to k8s mailserver NodePort 30125 (alt port :2525)',
'10.0.20.1', '2525',
'mailserver_nodes'
);
// Production frontends — 4 ports listening on pfSense VLAN20 IP 10.0.20.1.
$h['ha_backends']['item'][] = build_frontend(
'mailserver_proxy_25',
'code-yiu Phase 4/5 — external SMTP (:25) via PROXY v2 → pod :2525 postscreen',
'10.0.20.1', '25',
'mailserver_nodes_smtp'
);
$h['ha_backends']['item'][] = build_frontend(
'mailserver_proxy_465',
'code-yiu Phase 4/5 — external SMTPS (:465) via PROXY v2 → pod :4465 smtpd',
'10.0.20.1', '465',
'mailserver_nodes_smtps'
);
$h['ha_backends']['item'][] = build_frontend(
'mailserver_proxy_587',
'code-yiu Phase 4/5 — external submission (:587) via PROXY v2 → pod :5587 smtpd',
'10.0.20.1', '587',
'mailserver_nodes_sub'
);
$h['ha_backends']['item'][] = build_frontend(
'mailserver_proxy_993',
'code-yiu Phase 4/5 — external IMAPS (:993) via PROXY v2 → pod :10993 Dovecot',
'10.0.20.1', '993',
'mailserver_nodes_imaps'
);
write_config('code-yiu: mailserver HAProxy — 4 production frontends + legacy :2525 test');
$messages = '';
$rc = haproxy_check_and_run($messages, true);
echo 'haproxy_check_and_run rc=' . ($rc ? 'OK' : 'FAIL') . "\n";
echo "messages: $messages\n";

View file

@ -0,0 +1,68 @@
<?php
// pfSense NAT redirect flip — mail ports 25/465/587/993 from
// <mailserver> alias (10.0.20.202 MetalLB LB) to pfSense's own HAProxy
// listener (10.0.20.1). bd code-yiu.
//
// THIS IS THE CUTOVER. After this script:
// Internet → pfSense WAN:{25,465,587,993} → rdr → 10.0.20.1:{...}
// (pfSense HAProxy) → send-proxy-v2 → k8s-node:{30125..30128} NodePort
// → kube-proxy → mailserver pod alt listeners (2525/4465/5587/10993)
// → Postfix/Dovecot parse PROXY v2 → real client IP recovered.
//
// Internal clients (Roundcube, email-roundtrip-monitor CronJob) continue
// using the existing mailserver ClusterIP Service on the stock ports
// (25/465/587/993) which hit container stock listeners WITHOUT PROXY.
// No change to internal traffic paths.
//
// USAGE
// scp infra/scripts/pfsense-nat-mailserver-haproxy-flip.php admin@10.0.20.1:/tmp/
// ssh admin@10.0.20.1 'php /tmp/pfsense-nat-mailserver-haproxy-flip.php'
//
// REVERT — run pfsense-nat-mailserver-haproxy-unflip.php (companion script).
//
// IDEMPOTENT — re-runs converge. Flips nothing if already pointed at 10.0.20.1.
require_once('/etc/inc/config.inc');
require_once('/etc/inc/filter.inc');
global $config;
// NOTE(review): presumably forces a fresh parse of the on-disk config before editing — confirm against pfSense's parse_config().
parse_config(true);
// Only WAN rules whose 'local-port' is one of these strings are considered.
$PORTS_TO_FLIP = ['25', '465', '587', '993'];
// A rule is rewritten only when its current target is exactly $OLD_TARGET.
$OLD_TARGET = 'mailserver';
$NEW_TARGET = '10.0.20.1';
// Number of rules rewritten in this run; when 0 nothing is saved or reloaded.
$changed = 0;
// Iterate by reference so that assigning to $r edits $config in place.
foreach ($config['nat']['rule'] as $i => &$r) {
$iface = $r['interface'] ?? '';
$lport = $r['local-port'] ?? '';
$tgt = $r['target'] ?? '';
if ($iface !== 'wan') continue;
if (!in_array($lport, $PORTS_TO_FLIP, true)) continue;
// Matching port but a different target: report it and leave the rule untouched.
if ($tgt !== $OLD_TARGET) {
printf("rule %d (dport=%s) target=%s — not flipping (already %s or unexpected)\n",
$i, $lport, $tgt, $NEW_TARGET);
continue;
}
$r['target'] = $NEW_TARGET;
// The rule's 'associated-rule-id' is deliberately left as-is.
// NOTE(review): this relies on pfSense regenerating the linked filter rule
// from the NAT rule on apply — confirm.
$changed++;
printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
}
// Drop the reference left behind by the by-reference foreach.
unset($r);
if ($changed === 0) {
echo "No changes. (Already flipped? Run unflip script to revert.)\n";
exit(0);
}
// Persist the edited config; the string is passed to write_config() as its description.
write_config("code-yiu: NAT rdr — mail ports {$changed} flipped to HAProxy (10.0.20.1)");
// Rebuild pf rules & reload.
$rc = filter_configure();
printf("filter_configure rc=%s\n", var_export($rc, true));
echo "done.\n";

View file

@ -0,0 +1,48 @@
<?php
// REVERT of pfsense-nat-mailserver-haproxy-flip.php.
// Moves mail-port NAT rdr target from 10.0.20.1 (pfSense HAProxy) back to
// <mailserver> alias (10.0.20.202 MetalLB LB IP). bd code-yiu rollback.
//
// USE THIS IF: external mail breaks after the flip, any postscreen
// PROXY timeouts show up in logs, or you need to back out before Phase 6.
require_once('/etc/inc/config.inc');
require_once('/etc/inc/filter.inc');
global $config;
// NOTE(review): presumably forces a fresh parse of the on-disk config before editing — confirm against pfSense's parse_config().
parse_config(true);
// Only WAN rules whose 'local-port' is one of these strings are considered.
$PORTS_TO_REVERT = ['25', '465', '587', '993'];
// A rule is reverted only when its current target is exactly $OLD_TARGET.
$OLD_TARGET = '10.0.20.1';
$NEW_TARGET = 'mailserver';
// Number of rules rewritten in this run; when 0 nothing is saved or reloaded.
$changed = 0;
// Iterate by reference so that assigning to $r edits $config in place.
foreach ($config['nat']['rule'] as $i => &$r) {
$iface = $r['interface'] ?? '';
$lport = $r['local-port'] ?? '';
$tgt = $r['target'] ?? '';
if ($iface !== 'wan') continue;
if (!in_array($lport, $PORTS_TO_REVERT, true)) continue;
// Matching port but a different target: report it and leave the rule untouched.
if ($tgt !== $OLD_TARGET) {
printf("rule %d (dport=%s) target=%s — not reverting (already %s or unexpected)\n",
$i, $lport, $tgt, $NEW_TARGET);
continue;
}
$r['target'] = $NEW_TARGET;
$changed++;
printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
}
// Drop the reference left behind by the by-reference foreach.
unset($r);
if ($changed === 0) {
echo "No changes. (Already reverted.)\n";
exit(0);
}
// Persist the edited config, then rebuild and reload the filter rules.
write_config("code-yiu: NAT rdr — mail ports {$changed} reverted to <mailserver> alias");
$rc = filter_configure();
printf("filter_configure rc=%s\n", var_export($rc, true));
echo "done.\n";

81
scripts/postmortem-pipeline.sh Executable file
View file

@ -0,0 +1,81 @@
#!/bin/sh
# postmortem-pipeline.sh — Woodpecker pipeline step for post-mortem TODO automation
# Called from .woodpecker/postmortem-todos.yml
set -e

# Flow: pick the first post-mortem that still has a "| TODO |" row, parse its
# TODOs, log in to Vault with the pod's service-account JWT, fetch the
# claude-agent-service bearer token, submit one job and poll it.
# Exit status: 0 when there is nothing to do or the job completed, 1 otherwise.

vault_url="http://vault-active.vault.svc.cluster.local:8200"
agent_url="http://claude-agent-service.claude-agent.svc.cluster.local:8080"
max_polls=60      # 60 polls x 15 s = 15 minutes
poll_interval=15

# 1. Find post-mortem(s) with TODO items
#    Scan all post-mortems — don't rely on git diff (Woodpecker shallow clone breaks HEAD~1)
PM_FILE=""
for f in docs/post-mortems/*.md; do
  if grep -q '| TODO |' "$f" 2>/dev/null; then
    PM_FILE="$f"
    break
  fi
done

if [ -z "$PM_FILE" ]; then
  echo "No post-mortem with pending TODOs found"
  exit 0
fi
echo "Post-mortem with TODOs: $PM_FILE"

# 2. Parse TODOs
sh scripts/parse-postmortem-todos.sh "$PM_FILE" > /tmp/todos.json
cat /tmp/todos.json

# A missing field becomes 0 instead of the string "null", and anything that is
# not a plain non-negative integer is rejected up front: a failing `[ -eq ]`
# inside `if` would otherwise be treated as "there are TODOs".
TODO_COUNT=$(jq -r '.safe_todos // 0' /tmp/todos.json)
case "$TODO_COUNT" in
  '' | *[!0-9]*)
    echo "ERROR: unexpected safe_todos value: $TODO_COUNT" >&2
    exit 1
    ;;
esac
echo "$TODO_COUNT safe TODO(s) found"

if [ "$TODO_COUNT" -eq 0 ]; then
  echo "No auto-implementable TODOs — skipping"
  exit 0
fi

# 3. Authenticate to Vault via K8s SA JWT
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
VAULT_RESP=$(curl -sf -X POST "$vault_url/v1/auth/kubernetes/login" \
  -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}") || {
  echo "ERROR: Vault login request failed" >&2
  exit 1
}
VAULT_TOKEN=$(echo "$VAULT_RESP" | jq -r .auth.client_token)

if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
  echo "ERROR: Vault authentication failed" >&2
  exit 1
fi
echo "Vault authenticated"

# 4. Fetch API token for claude-agent-service
AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \
  "$vault_url/v1/secret/data/claude-agent-service" |
  jq -r '.data.data.api_bearer_token')

if [ -z "$AGENT_TOKEN" ] || [ "$AGENT_TOKEN" = "null" ]; then
  echo "ERROR: Failed to fetch agent API token" >&2
  exit 1
fi
echo "Agent token fetched"

# 5. Submit to claude-agent-service
TODOS=$(cat /tmp/todos.json)
PAYLOAD=$(jq -n \
  --arg prompt "Implement the auto-implementable TODOs from $PM_FILE. Parsed TODO list: $TODOS" \
  --arg agent ".claude/agents/postmortem-todo-resolver" \
  '{prompt: $prompt, agent: $agent, max_budget_usd: 5, timeout_seconds: 900}')

RESP=$(curl -sf -X POST \
  -H "Authorization: Bearer $AGENT_TOKEN" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" \
  "$agent_url/execute") || {
  echo "ERROR: Job submission request failed" >&2
  exit 1
}

JOB_ID=$(echo "$RESP" | jq -r '.job_id')
# Without this check a rejected submission would be polled as /jobs/null.
if [ -z "$JOB_ID" ] || [ "$JOB_ID" = "null" ]; then
  echo "ERROR: Job submission returned no job_id: $RESP" >&2
  exit 1
fi
echo "Job submitted: $JOB_ID"

# 6. Poll for completion (15min max)
i=0
while [ "$i" -lt "$max_polls" ]; do
  i=$((i + 1))
  sleep "$poll_interval"

  # One failed status request must not abort the whole step under `set -e`;
  # log it and try again on the next tick.
  if ! RESULT=$(curl -sf \
    -H "Authorization: Bearer $AGENT_TOKEN" \
    "$agent_url/jobs/$JOB_ID"); then
    echo "[$i/$max_polls] Status request failed — retrying"
    continue
  fi

  STATUS=$(echo "$RESULT" | jq -r '.status')
  echo "[$i/$max_polls] Status: $STATUS"

  if [ "$STATUS" != "running" ]; then
    echo "$RESULT" | jq .
    if [ "$STATUS" = "completed" ]; then exit 0; else exit 1; fi
  fi
done

echo "ERROR: Job timed out after 15 minutes" >&2
exit 1

109
scripts/provision-k8s-worker Executable file
View file

@ -0,0 +1,109 @@
#!/usr/bin/env bash
# provision-k8s-worker NAME VMID IP[/CIDR]
#
# Clone PVE template 2000 (ubuntu-2404-cloudinit-k8s-template) into a new
# VM, configure resources to match k8s-node3/4 (32G RAM, 8 vCPU, host CPU,
# 256G disk, VLAN 20 on vmbr1), attach the shared cicustom snippet
# (/var/lib/vz/snippets/k8s_cloud_init.yaml), and start it. Cloud-init
# inside the VM installs containerd + kubelet, applies the bundled
# setup script, and runs the kubeadm join. No manual steps after this.
#
# Hostname is derived from `qm set --name $NAME` and read by cloud-init
# from Proxmox metadata — DO NOT hard-code in the snippet.
#
# Idempotent: aborts if VMID already exists or IP is already in use.
#
# Usage:
# ssh root@192.168.1.127 bash -s -- k8s-node6 206 10.0.20.106 < provision-k8s-worker
# or, if the script lives on the PVE host:
# provision-k8s-worker k8s-node6 206 10.0.20.106
#
# Run on the PVE host (needs qm + /var/lib/vz/snippets access).
set -euo pipefail

if [ $# -ne 3 ]; then
  echo "usage: $0 NAME VMID IP[/CIDR]" >&2
  echo "  e.g. $0 k8s-node6 206 10.0.20.106" >&2
  exit 2
fi

NAME=$1
VMID=$2

# The third argument may be a bare address or ADDRESS/PREFIX. IP is always the
# bare address (it is what gets pinged); CIDR_IP carries exactly one prefix
# length, defaulting to /22 when none was given.
IP=${3%%/*}
PREFIX=22
if [[ "$3" == */* ]]; then
  PREFIX=${3#*/}
fi

if [[ ! "$VMID" =~ ^[0-9]+$ ]]; then
  echo "ERROR: VMID must be numeric, got '$VMID'." >&2
  exit 2
fi
if [[ -z "$IP" || ! "$PREFIX" =~ ^[0-9]+$ ]]; then
  echo "ERROR: IP must look like ADDRESS or ADDRESS/PREFIX, got '$3'." >&2
  exit 2
fi

CIDR_IP="${IP}/${PREFIX}"
GW="10.0.20.1"
DNS="10.0.20.201"
SEARCH="viktorbarzin.lan"

TEMPLATE_ID=2000
STORAGE="local-lvm"
USER_SNIPPET="local:snippets/k8s_cloud_init.yaml"
# Per-node meta-data snippet — written below — supplies local-hostname.
# Proxmox's auto-generated metadata DOESN'T include hostname when
# cicustom user=… is set, so the shared user-data snippet alone leaves
# nodes joining as "ubuntu" (image default). Per-node meta-data is the
# clean fix.
META_SNIPPET_FILE="/var/lib/vz/snippets/${NAME}-meta.yaml"
META_SNIPPET="local:snippets/${NAME}-meta.yaml"
BRIDGE="vmbr1"
VLAN=20

# Sanity: VMID must be free
if qm status "$VMID" >/dev/null 2>&1; then
  echo "ERROR: VM $VMID already exists. Refusing to clobber." >&2
  qm status "$VMID" >&2
  exit 1
fi

# Sanity: IP must not be pingable
if ping -c 1 -W 1 "$IP" >/dev/null 2>&1; then
  echo "ERROR: $IP is already responding to ping. Refusing to assign." >&2
  exit 1
fi

# Sanity: snippet must exist
if [ ! -f "/var/lib/vz/snippets/k8s_cloud_init.yaml" ]; then
  echo "ERROR: /var/lib/vz/snippets/k8s_cloud_init.yaml missing." >&2
  # Plain quotes on purpose: backticks inside a double-quoted string are a
  # command substitution and would try to execute `tg apply`.
  echo "       Run 'tg apply' in infra/stacks/infra/ to regenerate it." >&2
  exit 1
fi

# Sanity: template must be a template
if ! qm config "$TEMPLATE_ID" | grep -q '^template: 1'; then
  echo "ERROR: VMID $TEMPLATE_ID is not a template." >&2
  exit 1
fi

echo "[1/6] write per-node meta-data snippet ($META_SNIPPET_FILE)"
cat > "$META_SNIPPET_FILE" <<META
local-hostname: $NAME
instance-id: $NAME-$(date +%s)
META

echo "[2/6] qm clone $TEMPLATE_ID -> $VMID ($NAME)"
qm clone "$TEMPLATE_ID" "$VMID" --name "$NAME" --full true --storage "$STORAGE"

echo "[3/6] qm set $VMID — VM resources + network + cicustom"
qm set "$VMID" \
  --agent 1 \
  --balloon 32768 \
  --cores 8 \
  --cpu host \
  --memory 32768 \
  --net0 "virtio,bridge=$BRIDGE,tag=$VLAN" \
  --ipconfig0 "ip=$CIDR_IP,gw=$GW" \
  --nameserver "$DNS" \
  --searchdomain "$SEARCH" \
  --onboot 1 \
  --startup 'order=5,up=45,down=420' \
  --cicustom "user=$USER_SNIPPET,meta=$META_SNIPPET"

echo "[4/6] qm resize $VMID scsi0 256G"
qm resize "$VMID" scsi0 256G

echo "[5/6] qm start $VMID"
qm start "$VMID"

echo "[6/6] Done. Cloud-init runs now; node should appear in 'kubectl get nodes' within ~6-10 min."
echo " Tail cloud-init: socat -u UNIX-CONNECT:/var/run/qemu-server/$VMID.serial0 STDOUT | strings"
echo " Final config:"
qm config "$VMID" | grep -E '^(name|cores|memory|net0|ipconfig0|cicustom|scsi0|onboot):'

26
scripts/pve-nfs-exports Normal file
View file

@ -0,0 +1,26 @@
# /etc/exports — NFS export configuration for Proxmox VE host
# Managed in git: infra/scripts/pve-nfs-exports
# Deploy: scp scripts/pve-nfs-exports root@192.168.1.127:/etc/exports && ssh root@192.168.1.127 exportfs -ra
#
# CRITICAL NOTES (learned from 2026-04-14 outage [PM-2026-04-14]):
# - NEVER add fsid=0 to /srv/nfs or /srv/nfs-ssd exports. fsid=0 designates the
# NFSv4 pseudo-root which changes path resolution for ALL subdirectory mounts.
# When CSI mounts use paths like /srv/nfs/technitium, fsid=0 makes them resolve
# as the root itself, causing ENOENT on all subdirectory mounts.
# - fsid=1 is acceptable on /srv/nfs-ssd (unique ID, not root).
# - The NFS CSI driver mounts subdirectories — never use fsid=0 on any export
# that serves dynamic path mounts.
# - NFSv3 is disabled on this host (vers3=n in /etc/nfs.conf) — all k8s mounts
# must use nfsvers=4 mount option.
#
# Mount options explanation:
# rw — read/write access (required for PVCs)
# async — async writes safe: UPS protects host + Vault Raft replication +
# databases on block storage. Only NFS metadata at risk.
# no_subtree_check — disable subtree checking for performance and reliability
# no_root_squash — k8s CSI driver runs as root; squashing breaks PVC writes
# insecure — allow non-privileged source ports (>=1024) (required: pfSense VLAN NAT uses
# unprivileged ports for VLAN 10 → 192.168.1.x traffic)
#
/srv/nfs *(rw,async,no_subtree_check,no_root_squash,insecure)
/srv/nfs-ssd *(rw,sync,no_subtree_check,no_root_squash,insecure,fsid=1)

9
scripts/renew_worker_certs.sh Executable file
View file

@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Overwrite the kubelet's kubeadm flags file with flags that turn on client and
# serving certificate rotation, then reload systemd and restart the kubelet.
#
# Strict mode matters here: without it a failed write of the flags file would
# still be followed by a kubelet restart.
set -euo pipefail

echo 'KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=k8s.gcr.io/pause:3.7 --rotate-certificates=true --rotate-server-certificates=true"' | sudo tee /var/lib/kubelet/kubeadm-flags.env
sudo systemctl daemon-reload
sudo systemctl restart kubelet
# Approve all CSRs:
# for csr in $(kb get csr | grep Pending | awk '{print $1}'); do echo $csr; kb certificate approve $csr; done

View file

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Cross-compile the power-check binary for linux/arm64, copy it to the NAS and
# remove the local build artefact; each step runs only if the previous one
# succeeded. The wrapper script is copied afterwards regardless of the outcome.
built_binary=/tmp/powercheck-armv8

if CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o "$built_binary" .; then
  if rsync "$built_binary" Administrator@nas:~/server-power-cycle/; then
    rm "$built_binary"
  fi
fi

rsync synology_main.sh Administrator@nas:~/server-power-cycle/

View file

@ -0,0 +1,12 @@
module viktorbarzin/server-lifecycle
go 1.22.0
toolchain go1.23.6
require (
github.com/gosnmp/gosnmp v1.39.0
github.com/nightlyone/lockfile v1.0.0
)
require github.com/golang/glog v1.2.4 // indirect

View file

@ -0,0 +1,14 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/golang/glog v1.2.4 h1:CNNw5U8lSiiBk7druxtSHHTsRWcxKoac6kZKm2peBBc=
github.com/golang/glog v1.2.4/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w=
github.com/gosnmp/gosnmp v1.39.0 h1:mPJtSWFLkEemo2bz4fdNztZIFHYG86MC6c6veocq0ZE=
github.com/gosnmp/gosnmp v1.39.0/go.mod h1:CxVS6bXqmWZlafUj9pZUnQX5e4fAltqPcijxWpCitDo=
github.com/nightlyone/lockfile v1.0.0 h1:RHep2cFKK4PonZJDdEl4GmkabuhbsRMgk/k3uAmxBiA=
github.com/nightlyone/lockfile v1.0.0/go.mod h1:rywoIealpdNse2r832aiD9jRk8ErCatROs6LzC841CI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View file

@ -0,0 +1,125 @@
package main
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"github.com/golang/glog"
)
// PowerStateResponse is the part of the Redfish system resource that
// checkPowerState decodes; only the "PowerState" JSON field is read.
type PowerStateResponse struct {
PowerState string `json:"PowerState"`
}
// ResetType is the string sent as the "ResetType" value of the reset payload
// built by performResetType.
type ResetType string
const (
// On is the reset type requested by performPowerOn.
On ResetType = "On"
// GracefulShutdown is the reset type requested by performGracefulShutdown.
GracefulShutdown ResetType = "GracefulShutdown"
)
// checkPowerState performs a basic-auth GET on the iDRAC's
// /redfish/v1/Systems/System.Embedded.1 resource and returns the value of its
// "PowerState" field. TLS certificate verification is disabled for the request.
// Any transport failure, non-200 status or undecodable body is returned as an
// error together with an empty string.
func checkPowerState(idractCredentials idracCredentials) (string, error) {
	endpoint := fmt.Sprintf("%s/redfish/v1/Systems/System.Embedded.1", idractCredentials.url)

	// The client skips certificate verification.
	insecureTransport := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	httpClient := &http.Client{Transport: insecureTransport}

	request, err := http.NewRequest("GET", endpoint, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %v", err)
	}
	request.SetBasicAuth(idractCredentials.username, idractCredentials.password)
	request.Header.Set("Accept", "application/json")

	response, err := httpClient.Do(request)
	if err != nil {
		return "", fmt.Errorf("failed to send request: %v", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		// Best effort: include whatever body could be read in the error text.
		failureBody, _ := io.ReadAll(response.Body)
		return "", fmt.Errorf("unexpected status code: %d, response: %s", response.StatusCode, string(failureBody))
	}

	rawBody, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response body: %v", err)
	}

	var decoded PowerStateResponse
	if err = json.Unmarshal(rawBody, &decoded); err != nil {
		return "", fmt.Errorf("failed to parse JSON response: %v", err)
	}
	return decoded.PowerState, nil
}
// performGracefulShutdown requests the GracefulShutdown reset type through
// performResetType and returns that call's error unchanged.
func performGracefulShutdown(idracCredentials idracCredentials) error {
return performResetType(idracCredentials, GracefulShutdown)
}
// performPowerOn requests the On reset type through performResetType and
// returns that call's error unchanged.
func performPowerOn(idracCredentials idracCredentials) error {
return performResetType(idracCredentials, On)
}
// performResetType asks the iDRAC to apply resetType to the server by POSTing
// {"ResetType": "<resetType>"} to the Redfish ComputerSystem.Reset action of
// System.Embedded.1, using basic auth from idracCredentials.
//
// It returns nil when the iDRAC answers 200, 202 or 204, and an error
// describing the failure otherwise.
func performResetType(idracCredentials idracCredentials, resetType ResetType) error {
	glog.Warningf("Starting graceful reset type %s!\n", resetType)

	payload := map[string]string{
		"ResetType": string(resetType), // Only ResetType is needed
	}
	payloadBytes, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("failed to marshal payload: %v", err)
	}

	// idracCredentials.url is only the scheme and host (checkPowerState appends
	// its own resource path to it), so the reset action path has to be appended
	// here as well; POSTing to the bare base URL never reaches the action.
	resetURL := fmt.Sprintf("%s/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset", idracCredentials.url)
	req, err := http.NewRequest("POST", resetURL, bytes.NewBuffer(payloadBytes))
	if err != nil {
		return fmt.Errorf("failed to create request: %v", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth(idracCredentials.username, idracCredentials.password)

	// Same TLS settings as checkPowerState: with the default client this call
	// would be rejected by a certificate that the status check accepts.
	client := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("failed to send request: %v", err)
	}
	defer resp.Body.Close()

	// NOTE(review): 204 is accepted because Redfish actions may answer with
	// "No Content" on success — confirm against the target iDRAC firmware.
	switch resp.StatusCode {
	case http.StatusOK, http.StatusAccepted, http.StatusNoContent:
		// success
	default:
		body, _ := ioutil.ReadAll(resp.Body)
		return fmt.Errorf("unexpected status code: %d, response: %s", resp.StatusCode, string(body))
	}

	glog.Infof("Reset type %s initiated successfully.\n", resetType)
	return nil
}

View file

@ -0,0 +1,107 @@
package main
import (
"flag"
"log"
"github.com/golang/glog"
"github.com/nightlyone/lockfile"
)
// upsMinutesRemainingThreshold is compared against UPSPowerState.minutesRemaining:
// below it a running server is shut down (handleWhenServerOn) and a stopped
// server is not yet powered on (handleWhenServerOff).
const upsMinutesRemainingThreshold = 20
// idracCredentials bundles what the iDRAC requests need: url is the base URL
// ("https://" + host, as built in main) to which request paths are appended,
// username and password are used for HTTP basic auth.
type idracCredentials = struct {
url string
username string
password string
}
// main reads the server's power state from the iDRAC and the UPS state over
// SNMP, then hands both to handleWhenServerOn or handleWhenServerOff. Every
// failure on the way is fatal.
//
// NOTE(review): the -idracPassword default is a hard-coded credential; consider
// requiring it to be passed explicitly.
func main() {
	idracUsername := flag.String("idracUsername", "root", "iDRAC username")
	idracPassword := flag.String("idracPassword", "calvin", "iDRAC password")
	idracHost := flag.String("idracHost", "192.168.1.4", "iDRAC host")
	flag.Parse()
	defer glog.Flush()

	// lock, err := tryGetLock()
	// if err != nil {
	// 	glog.Fatalf("Failed to acquire lock: %v", err)
	// }
	// defer lock.Unlock()

	glog.Info("Checking server power state")
	idracCredentials := idracCredentials{
		url:      "https://" + *idracHost,
		username: *idracUsername,
		password: *idracPassword,
	}
	powerState, err := checkPowerState(idracCredentials)
	if err != nil {
		glog.Fatalf("Failed to check power state: %v", err)
	}
	glog.Infof("Server power state: %s", powerState)

	glog.Info("Checking UPS state")
	snmp := getSNMPClient()

	// Connect to the SNMP agent. Reported through glog like every other fatal
	// error in main, so it ends up in the same log output as the rest.
	err = snmp.Connect()
	if err != nil {
		glog.Fatalf("Failed to connect to UPS SNMP agent: %v", err)
	}
	defer snmp.Conn.Close()

	upsState, err := getPowerState(snmp)
	if err != nil {
		glog.Fatalf("Failed to get UPS power state: %v", err)
	}

	if powerState == "On" {
		handleWhenServerOn(upsState, idracCredentials)
	} else if powerState == "Off" {
		handleWhenServerOff(upsState, idracCredentials)
	} else {
		glog.Fatalf("Unknown server state %s", powerState)
	}
}
// handleWhenServerOn decides what to do while the server is powered on.
// With a positive UPS input voltage nothing happens. Otherwise the UPS is
// treated as running on battery and the server is gracefully shut down once
// minutesRemaining drops below upsMinutesRemainingThreshold. A failed shutdown
// request is logged as an error instead of being dropped.
func handleWhenServerOn(upsState UPSPowerState, idracCredentials idracCredentials) {
	if upsState.inputVoltage > 0 {
		glog.Infof("UPS is on AC power: %d. Nothing to do.\n", upsState.inputVoltage)
		return
	}

	glog.Warningln("UPS is on Battery power")
	if upsState.minutesRemaining >= upsMinutesRemainingThreshold {
		glog.Warningf("Minutes remaining is %d. Server will not be shutdown yet.", upsState.minutesRemaining)
		return
	}

	glog.Warningf("Minutes remaining is too low - %d Turning off server.", upsState.minutesRemaining)
	// Perform a graceful shutdown of the server
	if err := performGracefulShutdown(idracCredentials); err != nil {
		glog.Errorf("Failed to gracefully shut down server: %v", err)
	}
}
// handleWhenServerOff decides what to do while the server is powered off.
// Without a positive UPS input voltage nothing happens. Otherwise the server
// is powered on once minutesRemaining has reached
// upsMinutesRemainingThreshold. A failed power-on request is logged as an
// error instead of being dropped.
func handleWhenServerOff(upsState UPSPowerState, idracCredentials idracCredentials) {
	if upsState.inputVoltage <= 0 {
		glog.Warningln("UPS is still on battery power")
		return
	}

	glog.Infof("UPS is on AC power: %d\n", upsState.inputVoltage)
	if upsState.minutesRemaining < upsMinutesRemainingThreshold {
		glog.Infof("UPS battery is still too low - %d minutes remaining. Not turning on server yet.\n", upsState.minutesRemaining)
		return
	}

	glog.Infof("UPS is on AC power and battery has charged - %d minutes remaining. Turning on server...\n", upsState.minutesRemaining)
	// Perform startup of the server
	if err := performPowerOn(idracCredentials); err != nil {
		glog.Errorf("Failed to power on server: %v", err)
	}
}
// tryGetLock creates the lock file handle for /tmp/server_safe_poweroff.pid and
// tries to take the lock once. Failing to create the handle is fatal; failing
// to take the lock is returned to the caller.
func tryGetLock() (*lockfile.Lockfile, error) {
	pidLock, createErr := lockfile.New("/tmp/server_safe_poweroff.pid")
	if createErr != nil {
		log.Fatalf("Failed to create lock file: %v", createErr)
	}

	if lockErr := pidLock.TryLock(); lockErr != nil {
		return nil, lockErr
	}
	return &pidLock, nil
}

View file

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# This is used to run the main program on synology nas and log all messages to synology's log system
#
# Runs ./powercheck-armv8 with -log_dir=./logs, then forwards every line of
# ./logs/powercheck-armv8.INFO that starts with a severity letter, a four-digit
# date and a timestamp to the Synology log via synologset1, and finally deletes
# log files older than 7 days.

# Everything below uses relative paths, so do not continue from the wrong directory.
cd /var/services/homes/Administrator/server-power-cycle || exit 1

log_file="./logs/powercheck-armv8.INFO"
log_line_re='^[IWEF][0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}'

echo "Starting powercheck"
rc=0
./powercheck-armv8 -log_dir=./logs || rc=$?
if (( rc == 0 )); then
  echo "script completed successfully, logging to synlogy's logs"
else
  # The log is still forwarded below: it is where the failure reason is.
  echo "powercheck exited with status $rc, logging to synlogy's logs" >&2
fi

if [[ -r "$log_file" ]]; then
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip anything that does not start with the expected line prefix.
    [[ "$line" =~ $log_line_re ]] || continue

    # Blank the first four fields and trim the padding that leaves behind.
    # "$line" is quoted so that characters such as '*' in a log message are
    # not expanded into file names.
    msg=$(printf '%s\n' "$line" | awk '{$1=$2=$3=$4=""; print $0}' | sed 's/^ *//')
    printf '%s\n' "$msg"
    if [[ -n "$msg" ]]; then
      synologset1 sys info 0x11800000 "$msg"
    fi
  done < "$log_file"
else
  echo "No log file at $log_file — nothing to forward" >&2
fi

# Cleanup logs
find ./logs -type f -mtime +7 -exec rm -f -- {} +

View file

@ -0,0 +1,46 @@
package main
import (
"time"
"github.com/golang/glog"
"github.com/gosnmp/gosnmp"
)
// UPSPowerState holds the two values getPowerState reads from the UPS.
type UPSPowerState = struct {
// inputVoltage is the value of OID 1.3.6.1.2.1.33.1.3.3.1.3.1; callers treat
// a value greater than 0 as "on AC power".
inputVoltage int
// minutesRemaining is the value of OID 1.3.6.1.2.1.33.1.2.3.0.
minutesRemaining uint
}
// getSNMPClient returns an unconnected SNMP v2c client for the UPS at
// 192.168.1.5:161 with community "Public0" and a 5 second timeout. The caller
// is responsible for calling Connect.
func getSNMPClient() *gosnmp.GoSNMP {
	const (
		upsAddress   = "192.168.1.5"
		upsCommunity = "Public0"
	)

	return &gosnmp.GoSNMP{
		Target:    upsAddress,
		Port:      161, // Default SNMP port
		Community: upsCommunity,
		Version:   gosnmp.Version2c, // Use SNMP v2c
		Timeout:   5 * time.Second,
	}
}
// getPowerState reads the UPS input voltage and the estimated minutes
// remaining with a single SNMP GET and returns them as a UPSPowerState.
//
// Every failure — the GET itself, a reply with too few variables, or a value
// that is not an integer — terminates the process through glog.Fatalf with a
// message naming the problem, instead of an unchecked type-assertion or index
// panic. The returned error is therefore always nil.
func getPowerState(snmp *gosnmp.GoSNMP) (UPSPowerState, error) {
	oids := []string{
		// "1.3.6.1.2.1.33.1.2.2.0", // seconds on battery
		"1.3.6.1.2.1.33.1.3.3.1.3.1", // input voltage
		"1.3.6.1.2.1.33.1.2.3.0",     // minutes remaining
	}

	// Perform an SNMP GET request to retrieve the values for the specified OIDs
	result, err := snmp.Get(oids)
	if err != nil {
		glog.Fatalf("Failed to perform SNMP GET request: %v", err)
	}
	if result == nil || len(result.Variables) < len(oids) {
		glog.Fatalf("SNMP GET reply is missing variables (expected %d)", len(oids))
	}

	// Accept both signed and unsigned integers for either value; anything else
	// (including a nil value) is rejected explicitly.
	var inputVoltage int
	switch v := result.Variables[0].Value.(type) {
	case int:
		inputVoltage = v
	case uint:
		inputVoltage = int(v)
	default:
		glog.Fatalf("Unexpected SNMP value type %T for input voltage", v)
	}

	var minutesRemaining uint
	switch v := result.Variables[1].Value.(type) {
	case uint:
		minutesRemaining = v
	case int:
		if v < 0 {
			glog.Fatalf("Negative SNMP value %d for minutes remaining", v)
		}
		minutesRemaining = uint(v)
	default:
		glog.Fatalf("Unexpected SNMP value type %T for minutes remaining", v)
	}

	return UPSPowerState{inputVoltage, minutesRemaining}, nil
}

View file

@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -euo pipefail
############################################
# CONFIGURATION
############################################
# Internal pull-through registry endpoint
# Examples:
# http://registry.internal:5000
# https://registry.internal
# Written verbatim as the [host."…"] key of every generated hosts.toml.
INTERNAL_REGISTRY="http://10.0.20.10:5002"
# Path where containerd reads registry configs
# One sub-directory per entry of REGISTRIES is created below it.
CERTS_DIR="/etc/containerd/certs.d"
# Optional: path to CA file if INTERNAL_REGISTRY uses HTTPS with custom CA
# Leave empty if not needed
# When non-empty, a `ca = "…"` line is appended to each hosts.toml.
INTERNAL_CA_PATH=""
# Restart containerd at the end
# Compared against the literal string "true"; any other value skips the restart.
RESTART_CONTAINERD=true
############################################
# REGISTRIES TO MIRROR
############################################
# Each entry becomes $CERTS_DIR/<entry>/hosts.toml with server = "https://<entry>".
REGISTRIES=(
"docker.io"
"registry-1.docker.io"
"registry.k8s.io"
"quay.io"
"ghcr.io"
"gcr.io"
"us-docker.pkg.dev"
"public.ecr.aws"
"mcr.microsoft.com"
)
############################################
# FUNCTIONS
############################################
#######################################
# Abort the script unless it runs with UID 0.
# Outputs: an error message on stderr when not root
# Returns: 0 when root; exits with status 1 otherwise
#######################################
require_root() {
  [[ "$(id -u)" -eq 0 ]] && return 0

  echo "ERROR: must be run as root" >&2
  exit 1
}
#######################################
# Make sure the containerd config points its CRI registry config_path at the
# certs.d directory.
#
# Handles, in this order:
#   - config_path already set to the certs.d directory   -> nothing to do
#   - an empty `config_path = ""` line                   -> value replaced
#   - registry section present without a usable key      -> key appended
#   - no registry section                                -> section appended
# Replacing the empty value matters: appending a second config_path key to a
# section that already has one makes the TOML table define the key twice.
#
# Globals:   CERTS_DIR (read; defaults to /etc/containerd/certs.d)
# Arguments: $1 - config file to edit (default /etc/containerd/config.toml)
# Outputs:   progress messages on stdout
# Returns:   0 on success, non-zero if a command fails
#######################################
ensure_containerd_config_path() {
  local cfg="${1:-/etc/containerd/config.toml}"
  local certs_dir="${CERTS_DIR:-/etc/containerd/certs.d}"
  local section='\[plugins\."io.containerd.grpc.v1.cri".registry\]'

  if [[ ! -f "$cfg" ]]; then
    echo "Generating default containerd config"
    mkdir -p -- "$(dirname -- "$cfg")"
    containerd config default > "$cfg"
  fi

  if grep -q "config_path *= *\"${certs_dir}\"" "$cfg"; then
    return 0
  fi

  echo "Enabling config_path in containerd config"
  if grep -Eq '^[[:space:]]*config_path[[:space:]]*=[[:space:]]*""' "$cfg"; then
    # Fill in the existing empty key rather than adding a duplicate.
    # NOTE(review): this rewrites every empty config_path line in the file,
    # not only the one in the registry section — confirm that is acceptable.
    sed -i -E "s|^([[:space:]]*config_path[[:space:]]*=[[:space:]]*)\"\"|\1\"${certs_dir}\"|" "$cfg"
  elif grep -q "$section" "$cfg"; then
    # Minimal and safe append if section exists
    sed -i "/${section}/a \  config_path = \"${certs_dir}\"" "$cfg"
  else
    # The leading blank line keeps the new header on its own line even when
    # the file does not end with a newline.
    cat >> "$cfg" <<EOF

[plugins."io.containerd.grpc.v1.cri".registry]
  config_path = "${certs_dir}"
EOF
  fi
}
#######################################
# Write $CERTS_DIR/<registry>/hosts.toml that routes pulls for <registry>
# through the internal registry.
# Globals:   CERTS_DIR, INTERNAL_REGISTRY, INTERNAL_CA_PATH (all read)
# Arguments: $1 - registry host name, e.g. quay.io
# Outputs:   none (creates the directory and overwrites hosts.toml)
#######################################
write_hosts_toml() {
  local registry="$1"
  local target_dir="$CERTS_DIR/$registry"
  local target_file="$target_dir/hosts.toml"

  mkdir -p "$target_dir"

  {
    printf 'server = "https://%s"\n' "$registry"
    printf '[host."%s"]\n' "$INTERNAL_REGISTRY"
    printf 'capabilities = ["pull", "resolve"]\n'
    # The ca line is only emitted when a CA path is configured.
    if [[ -n "$INTERNAL_CA_PATH" ]]; then
      printf 'ca = "%s"\n' "$INTERNAL_CA_PATH"
    fi
  } > "$target_file"
}
############################################
# MAIN
############################################
# Entry point: check privileges, make containerd read certs.d, write one
# hosts.toml per configured registry and optionally restart containerd.
require_root
ensure_containerd_config_path

echo "Creating registry mirror configurations..."
for registry_host in "${REGISTRIES[@]}"; do
  echo " - $registry_host"
  write_hosts_toml "$registry_host"
done

case "$RESTART_CONTAINERD" in
  true)
    echo "Restarting containerd"
    systemctl restart containerd
    ;;
esac

echo "Done."

View file

@ -0,0 +1,60 @@
#!/usr/bin/env bash
# One-shot deployment of the forgejo.viktorbarzin.me containerd hosts.toml
# entry across every k8s node. Cloud-init only fires on VM provision, so
# existing nodes need this manual rollout.
#
# What it does, per node:
# 1. drain (ignore-daemonsets, delete-emptydir-data)
# 2. ssh in: mkdir + write /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml
# 3. systemctl restart containerd
# 4. uncordon
#
# hosts.toml is documented as hot-reloaded but the post-2026-04-19
# containerd corruption playbook calls for an explicit restart so the
# config is unambiguously in effect. Running drain/uncordon around it
# avoids pulling against an in-flight containerd restart.
#
# Re-run is safe: writes are idempotent.
set -euo pipefail

CERTS_DIR=/etc/containerd/certs.d/forgejo.viktorbarzin.me
HOSTS_TOML='server = "https://forgejo.viktorbarzin.me"
[host."https://10.0.20.203"]
capabilities = ["pull", "resolve"]
skip_verify = true
'

NODES=$(kubectl get nodes -o name | sed 's|^node/||')

if [[ -z "$NODES" ]]; then
  echo "ERROR: no nodes returned from kubectl get nodes" >&2
  exit 1
fi

# Word-splitting of $NODES is intentional: one node name per line.
for n in $NODES; do
  echo "=== $n ==="
  kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data --force --grace-period=60

  # The node is cordoned from here on. If the remote step fails, restore
  # schedulability (best effort) before aborting instead of silently leaving
  # the node drained. The outer heredoc is unquoted on purpose so that
  # $CERTS_DIR and $HOSTS_TOML are expanded locally.
  if ! ssh -o StrictHostKeyChecking=accept-new "wizard@$n" sudo bash <<EOF
set -euo pipefail
mkdir -p "$CERTS_DIR"
cat > "$CERTS_DIR/hosts.toml" <<'TOML'
$HOSTS_TOML
TOML
systemctl restart containerd
EOF
  then
    echo "ERROR: updating containerd on $n failed; uncordoning and aborting" >&2
    kubectl uncordon "$n" || echo "WARNING: could not uncordon $n — it is still cordoned" >&2
    exit 1
  fi

  kubectl uncordon "$n"

  # Wait for the node to report Ready before moving to the next one.
  node_ready=false
  for i in {1..30}; do
    if kubectl get node "$n" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' | grep -q True; then
      echo " node Ready"
      node_ready=true
      break
    fi
    sleep 2
  done

  # Draining the next node while this one is still not Ready would take two
  # nodes out at once, so stop here.
  if [[ "$node_ready" != "true" ]]; then
    echo "ERROR: $n did not become Ready within 60s; not continuing with the remaining nodes" >&2
    exit 1
  fi
done

echo "All nodes updated."

231
scripts/setup-task-pipeline.sh Executable file
View file

@ -0,0 +1,231 @@
#!/usr/bin/env bash
#
# Setup script for the Forgejo task ingestion pipeline.
# Creates Authentik OAuth2 provider/application, configures Forgejo OAuth2 auth source,
# creates "tasks" repo, and sets up webhook to n8n.
#
# Prerequisites:
# - Authentik admin API token
# - Forgejo admin API token (create at https://forgejo.viktorbarzin.me/user/settings/applications)
#
# Usage:
# AUTHENTIK_TOKEN="..." FORGEJO_TOKEN="..." bash scripts/setup-task-pipeline.sh
set -euo pipefail
# Endpoints and the repo owner can be overridden from the environment; the
# values after ":-" are the defaults.
AUTHENTIK_URL="${AUTHENTIK_URL:-https://authentik.viktorbarzin.me}"
FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
N8N_WEBHOOK_URL="${N8N_WEBHOOK_URL:-https://n8n.viktorbarzin.me/webhook/forgejo-tasks}"
FORGEJO_ADMIN_USER="${FORGEJO_ADMIN_USER:-viktor}"
# Both API tokens are mandatory: ":?" aborts with the given message when the
# variable is unset or empty.
: "${AUTHENTIK_TOKEN:?Set AUTHENTIK_TOKEN (Authentik admin API token)}"
: "${FORGEJO_TOKEN:?Set FORGEJO_TOKEN (Forgejo admin API token)}"
# ak_api ARGS... — run `curl -sf` against the Authentik API with the bearer
# token from $AUTHENTIK_TOKEN and a JSON content type; all arguments are
# passed on to curl unchanged, after the headers.
ak_api() {
  local -a request_headers=(
    -H "Authorization: Bearer $AUTHENTIK_TOKEN"
    -H "Content-Type: application/json"
  )
  curl -sf "${request_headers[@]}" "$@"
}
# fg_api ARGS... — run `curl -sf` against the Forgejo API with the token from
# $FORGEJO_TOKEN and a JSON content type; all arguments are passed on to curl
# unchanged, after the headers.
fg_api() {
  local -a request_headers=(
    -H "Authorization: token $FORGEJO_TOKEN"
    -H "Content-Type: application/json"
  )
  curl -sf "${request_headers[@]}" "$@"
}
# Step 1: create the group; if the request fails, assume the group exists and
# look it up by name instead. Either way GROUP_RESP ends up holding one group
# object, from which GROUP_PK is extracted for the later steps.
echo "=== Step 1: Create Authentik group 'Task Submitters' ==="
GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/" -d '{
"name": "Task Submitters",
"is_superuser": false,
"parent": null
}' 2>/dev/null) || {
echo " Group may already exist, checking..."
GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/?name=Task+Submitters" | python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
# The lookup prints an empty string when no group matched.
if [ -z "$GROUP_RESP" ]; then echo "ERROR: Failed to create or find group"; exit 1; fi
}
GROUP_PK=$(echo "$GROUP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
echo " Group PK: $GROUP_PK"
echo ""
# Step 2: pick an authorization flow, then create the OAuth2 provider (or look
# up the existing one by name). Sets PROVIDER_PK, CLIENT_ID and CLIENT_SECRET
# for the later steps.
# NOTE(review): the client secret is printed to stdout below — keep this
# output out of shared logs.
echo "=== Step 2: Create Authentik OAuth2 Provider for Forgejo ==="
# Find the explicit consent authorization flow
AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization&search=explicit" | \
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
# Fall back to the first authorization flow returned, whatever it is.
if [ -z "$AUTH_FLOW" ]; then
echo " WARNING: Could not find explicit consent flow, using implicit"
AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization" | \
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
fi
echo " Authorization flow: $AUTH_FLOW"
PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/" -d "{
\"name\": \"Forgejo\",
\"authorization_flow\": \"$AUTH_FLOW\",
\"client_type\": \"confidential\",
\"redirect_uris\": \"$FORGEJO_URL/user/oauth2/Authentik/callback\",
\"property_mappings\": [],
\"sub_mode\": \"hashed_user_id\",
\"include_claims_in_id_token\": true,
\"access_code_validity\": \"minutes=1\",
\"access_token_validity\": \"minutes=5\",
\"refresh_token_validity\": \"days=30\"
}" 2>/dev/null) || {
echo " Provider may already exist, checking..."
PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/?name=Forgejo" | \
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
if [ -z "$PROVIDER_RESP" ]; then echo "ERROR: Failed to create or find provider"; exit 1; fi
}
PROVIDER_PK=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
CLIENT_ID=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['client_id'])")
# When the response carries no client_secret the placeholder string is used instead.
CLIENT_SECRET=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('client_secret','<already-created>'))")
echo " Provider PK: $PROVIDER_PK"
echo " Client ID: $CLIENT_ID"
echo " Client Secret: $CLIENT_SECRET"
echo ""
# Step 3: create the Forgejo application bound to the provider from step 2, or
# look up the existing one by slug. APP_RESP holds one application object
# afterwards; step 4 reads its pk.
echo "=== Step 3: Create Authentik Application for Forgejo ==="
APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/" -d "{
\"name\": \"Forgejo\",
\"slug\": \"forgejo\",
\"provider\": $PROVIDER_PK,
\"meta_launch_url\": \"$FORGEJO_URL\",
\"policy_engine_mode\": \"any\"
}" 2>/dev/null) || {
  echo " Application may already exist, checking..."
  APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/?slug=forgejo" | \
    python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
  # Same guard as steps 1 and 2: without it an empty lookup result only
  # surfaces as a Python traceback from the JSON parsing below.
  if [ -z "$APP_RESP" ]; then echo "ERROR: Failed to create or find application"; exit 1; fi
}
APP_SLUG=$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['slug'])")
echo " Application slug: $APP_SLUG"
echo ""
# Step 4: bind the group from step 1 to the application from step 3.
echo "=== Step 4: Bind 'Task Submitters' group to Forgejo application ==="
# Extract the application pk up front so that a parsing failure stops the
# script instead of sending a binding with an empty target.
APP_PK=$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
# Create a policy binding that restricts access to the Task Submitters group
if ak_api "$AUTHENTIK_URL/api/v3/policies/bindings/" -d "{
\"target\": \"$APP_PK\",
\"group\": \"$GROUP_PK\",
\"enabled\": true,
\"order\": 0,
\"negate\": false,
\"timeout\": 30
}" > /dev/null 2>&1; then
  # Only claim success when the request actually succeeded.
  echo " Group binding created"
else
  echo " Binding may already exist (OK)"
fi
echo ""
# Step 5: add the initial user to the group from step 1. Failures here are
# reported but do not stop the script. The empty-body PATCH that used to
# precede add_user changed nothing and has been dropped.
echo "=== Step 5: Add users to 'Task Submitters' group ==="
echo " Adding Viktor Barzin..."
VIKTOR_PK=$(ak_api "$AUTHENTIK_URL/api/v3/core/users/?search=vbarzin" | \
  python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
if [ -n "$VIKTOR_PK" ]; then
  if ak_api -X POST "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/add_user/" -d "{\"pk\": $VIKTOR_PK}" > /dev/null 2>&1; then
    echo " Added Viktor (PK: $VIKTOR_PK)"
  else
    echo " WARNING: could not add Viktor (PK: $VIKTOR_PK) to the group" >&2
  fi
else
  echo " WARNING: no user matching 'vbarzin' found; nobody was added" >&2
fi
echo ""
# Step 6: register Authentik as an OAuth2 login source in Forgejo.
# Three attempts, each tried only when the previous one failed:
#   1. POST to /api/v1/admin/identity-sources
#   2. POST to /api/v1/admin/auths
#   3. print manual instructions (the script continues either way)
echo "=== Step 6: Configure Forgejo OAuth2 authentication source ==="
fg_api "$FORGEJO_URL/api/v1/admin/identity-sources" -d "{
\"authentication_source\": {
\"name\": \"Authentik\",
\"type\": \"oauth2\",
\"is_active\": true,
\"is_sync_enabled\": false,
\"oauth2\": {
\"provider\": \"openidConnect\",
\"client_id\": \"$CLIENT_ID\",
\"client_secret\": \"$CLIENT_SECRET\",
\"open_id_connect_auto_discovery_url\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
\"scopes\": [\"openid\", \"profile\", \"email\"],
\"required_claim_name\": \"\",
\"required_claim_value\": \"\",
\"group_claim_name\": \"\",
\"admin_group\": \"\",
\"restricted_group\": \"\",
\"icon_url\": \"\",
\"skip_local_2fa\": true,
\"attribute_ssn\": \"\"
}
}
}" > /dev/null 2>&1 && echo " OAuth2 source created" || {
# Second attempt: same settings sent to the /admin/auths endpoint.
echo " Forgejo identity-sources API may not be available."
echo " Falling back to legacy authentication-source API..."
fg_api "$FORGEJO_URL/api/v1/admin/auths" -d "{
\"name\": \"Authentik\",
\"type\": 6,
\"is_active\": true,
\"is_sync_enabled\": false,
\"cfg\": {
\"Provider\": \"openidConnect\",
\"ClientID\": \"$CLIENT_ID\",
\"ClientSecret\": \"$CLIENT_SECRET\",
\"OpenIDConnectAutoDiscoveryURL\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
\"Scopes\": [\"openid\", \"profile\", \"email\"],
\"SkipLocalTwoFA\": true
}
}" > /dev/null 2>&1 && echo " OAuth2 source created (legacy API)" || {
# Last resort: tell the operator how to create the source by hand.
echo " ERROR: Could not create OAuth2 source via API."
echo " Please create it manually in Forgejo admin panel:"
echo " 1. Go to $FORGEJO_URL/-/admin/auths/new"
echo " 2. Auth Type: OAuth2"
echo " 3. Name: Authentik"
echo " 4. OAuth2 Provider: OpenID Connect"
echo " 5. Client ID: $CLIENT_ID"
echo " 6. Client Secret: $CLIENT_SECRET"
echo " 7. Discovery URL: $AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration"
echo " 8. Scopes: openid profile email"
}
}
echo ""
# Step 7: create the "tasks" repository for the token's user; when creation
# fails the repository is assumed to exist and is fetched instead (that fetch
# must succeed, otherwise `set -e` stops the script).
# REPO_RESP is not read again within this step.
echo "=== Step 7: Create 'tasks' repository in Forgejo ==="
REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/user/repos" -d '{
"name": "tasks",
"description": "Task queue for OpenClaw AI agent. Create an issue to submit a task.",
"private": false,
"auto_init": true,
"default_branch": "main"
}' 2>/dev/null) && echo " Repository created" || {
echo " Repository may already exist (OK)"
REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks")
}
echo " Repo: $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks"
echo ""
# Step 8: switch off everything except issues on the tasks repository.
# A failed PATCH is reported but tolerated.
echo "=== Step 8: Disable non-issue features on tasks repo ==="
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks" -X PATCH -d '{
"has_pull_requests": false,
"has_wiki": false,
"has_projects": false,
"has_releases": false,
"has_packages": false,
"has_actions": false
}' > /dev/null 2>&1 && echo " Disabled PRs, wiki, projects, releases, packages, actions" || echo " Some features may not be disableable (OK)"
echo ""
# Step 9: create the four task-state labels. Each request's failure is ignored
# (`|| true`), so the summary line below is printed whether or not the labels
# were actually created.
echo "=== Step 9: Create issue labels ==="
for label_data in \
'{"name":"pending","color":"#0075ca","description":"Task waiting to be processed"}' \
'{"name":"processing","color":"#e4e669","description":"Task currently being processed by OpenClaw"}' \
'{"name":"completed","color":"#0e8a16","description":"Task completed successfully"}' \
'{"name":"failed","color":"#d73a4a","description":"Task failed during processing"}'; do
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/labels" -d "$label_data" > /dev/null 2>&1 || true
done
echo " Labels created: pending, processing, completed, failed"
echo ""
# Step 10: register a webhook on the tasks repository that posts "issues"
# events as JSON to $N8N_WEBHOOK_URL. A failed request is reported as
# "may already exist" and tolerated.
# NOTE(review): the webhook is created with an empty secret — confirm that the
# receiving endpoint does not need one.
echo "=== Step 10: Create webhook on tasks repo → n8n ==="
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/hooks" -d "{
\"type\": \"gitea\",
\"config\": {
\"url\": \"$N8N_WEBHOOK_URL\",
\"content_type\": \"json\",
\"secret\": \"\"
},
\"events\": [\"issues\"],
\"active\": true
}" > /dev/null 2>&1 && echo " Webhook created → $N8N_WEBHOOK_URL" || echo " Webhook may already exist (OK)"
echo ""
# Final summary: lists the manual follow-up steps.
# NOTE(review): this prints the OAuth2 client secret to stdout — keep this
# output out of shared logs.
echo "=========================================="
echo "Setup complete!"
echo ""
echo "Next steps:"
echo " 1. Add SOPS secrets:"
echo " forgejo_authentik_client_id = \"$CLIENT_ID\""
echo " forgejo_authentik_client_secret = \"$CLIENT_SECRET\""
echo " 2. Run: scripts/tg apply -target=module.forgejo"
echo " 3. Create n8n workflow (webhook trigger → OpenClaw exec → Forgejo comment)"
echo " 4. Add more users to 'Task Submitters' group in Authentik"
echo " 5. Test: Create an issue at $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks/issues/new"
echo "=========================================="

View file

@ -0,0 +1,54 @@
#!/bin/bash
# setup_containerd_mirrors.sh
# Replaces deprecated wildcard registry mirror with per-registry hosts.toml config.
# Run on each K8s WORKER node: ssh wizard@<node-ip> 'sudo bash -s' < scripts/setup_containerd_mirrors.sh
# NOTE: Do NOT run on k8s-master (containerd 1.6.x has conflicts with config_path + mirrors coexisting)
set -euo pipefail
TIMESTAMP=$(date +%s)
CONFIG="/etc/containerd/config.toml"
CERTS_DIR="/etc/containerd/certs.d"
BACKUP="${CONFIG}.bak.${TIMESTAMP}"

echo "=== Backing up containerd config ==="
cp "$CONFIG" "$BACKUP"

echo "=== Removing deprecated mirror entries ==="
# Remove wildcard mirror and its endpoint
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."\*"\]/d' "$CONFIG"
sed -i '/endpoint = \["http:\/\/10\.0\.20\.10:5000"\]/d' "$CONFIG"
# Remove any other per-registry mirror sections (e.g. docker.io) to avoid config_path conflict
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker\.io"\]/d' "$CONFIG"
sed -i '/endpoint = \["https:\/\/registry-1\.docker\.io"\]/d' "$CONFIG"
# Remove the mirrors parent section header if it's now empty
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\]$/d' "$CONFIG"

echo "=== Setting config_path ==="
# Replace empty config_path with certs.d path
if grep -q 'config_path = ""' "$CONFIG"; then
  sed -i 's|config_path = ""|config_path = "/etc/containerd/certs.d"|' "$CONFIG"
elif grep -q 'config_path = "/etc/containerd/certs.d"' "$CONFIG"; then
  echo "config_path already set, skipping"
else
  # If config_path line doesn't exist at all, add it under [plugins."io.containerd.grpc.v1.cri".registry]
  sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\]/a\ config_path = "/etc/containerd/certs.d"' "$CONFIG"
fi
# The append above is a silent no-op when the registry section is missing;
# without config_path the hosts.toml files written below are never consulted.
if ! grep -q 'config_path = "/etc/containerd/certs.d"' "$CONFIG"; then
  echo "WARNING: config_path was not set in $CONFIG — hosts.toml files will be ignored" >&2
fi

echo "=== Creating hosts.toml files ==="
# docker.io (Docker Hub) — proxy first, upstream fallback
mkdir -p "$CERTS_DIR/docker.io"
printf 'server = "https://registry-1.docker.io"\n\n[host."http://10.0.20.10:5000"]\n capabilities = ["pull", "resolve"]\n\n[host."https://registry-1.docker.io"]\n capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/docker.io/hosts.toml"
# ghcr.io — proxy first, upstream fallback
mkdir -p "$CERTS_DIR/ghcr.io"
printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n capabilities = ["pull", "resolve"]\n\n[host."https://ghcr.io"]\n capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/ghcr.io/hosts.toml"
# Low-traffic registries (quay.io, registry.k8s.io, reg.kyverno.io) pull directly — no proxy.
# Remove stale hosts.toml from previous config if present.
rm -f "$CERTS_DIR/quay.io/hosts.toml" "$CERTS_DIR/registry.k8s.io/hosts.toml" "$CERTS_DIR/reg.kyverno.io/hosts.toml"
rmdir "$CERTS_DIR/quay.io" "$CERTS_DIR/registry.k8s.io" "$CERTS_DIR/reg.kyverno.io" 2>/dev/null || true

# hosts.toml files are re-read on each pull, but config.toml itself (mirror
# removal, config_path) is only read when containerd starts. Tell the operator
# when a restart is required instead of claiming none is ever needed.
if ! cmp -s "$CONFIG" "$BACKUP"; then
  echo "NOTE: $CONFIG changed — restart containerd (systemctl restart containerd) for config_path to take effect"
fi
echo "=== Done ==="

129
scripts/state-sync Executable file
View file

@ -0,0 +1,129 @@
#!/usr/bin/env bash
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
STATE_DIR="$REPO_ROOT/state/stacks"
VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
cmd="${1:-help}"
stack="${2:-}" # optional: operate on single stack
# Succeeds only when `vault token lookup` works against $VAULT_ADDR, i.e. the
# caller holds a usable Vault token. All output is discarded.
vault_available() {
  VAULT_ADDR="$VAULT_ADDR" vault token lookup >/dev/null 2>&1
}
# Print the Vault Transit key URI used to encrypt the given stack's state.
#   $1 - stack name
transit_uri() {
  local stack_name="$1"
  printf '%s/v1/transit/keys/sops-state-%s\n' "$VAULT_ADDR" "$stack_name"
}
# Print the stack name for a state directory: the last path component
# (a trailing slash, as produced by the "*/" glob, is ignored by basename).
stack_name_from_dir() {
  local stack_dir="$1"
  basename "$stack_dir"
}
# Tier 0 stacks keep SOPS-encrypted local state; Tier 1 uses PG backend
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# Return 0 when $1 is exactly one of the Tier 0 stack names, 1 otherwise.
# Uses literal string comparison: the name is never interpreted as a regex
# or as a grep option, so inputs like "infr." or ".*" do not match.
is_tier0() {
  local candidate="${1:-}" tier0_name
  # Intentionally unquoted: split the space-separated list into words.
  for tier0_name in $TIER0_STACKS; do
    if [[ "$candidate" == "$tier0_name" ]]; then
      return 0
    fi
  done
  return 1
}
# Read age recipients from .sops.yaml: the `age` value of the first creation
# rule that has one, with embedded newlines removed. The path is passed as an
# argument (not spliced into the Python source) so quotes in the repo path
# cannot break the script. Any failure (missing python3/PyYAML/file) yields "".
AGE_RECIPIENTS="$(python3 - "$REPO_ROOT/.sops.yaml" <<'PY' 2>/dev/null || echo ""
import sys
import yaml

with open(sys.argv[1]) as f:
    config = yaml.safe_load(f)
for rule in config.get('creation_rules', []):
    age = rule.get('age', '')
    if age:
        print(age.replace('\n', '').strip())
        break
PY
)"
# Encrypt <dir>/terraform.tfstate to <dir>/terraform.tfstate.enc with sops,
# using the stack's Vault Transit key plus the age recipients.
#   $1 - stack state directory
# No-op when there is no plaintext state or the encrypted copy is up to date.
# Returns non-zero if sops fails; in that case any existing .enc file is left
# untouched (output goes to a temp file that is renamed only on success, so a
# failed run can never leave a truncated .enc that looks newer than the state).
encrypt_state() {
  local dir="$1"
  local src="$dir/terraform.tfstate"
  local dst="$dir/terraform.tfstate.enc"
  local name tmp
  name="$(stack_name_from_dir "$dir")"
  [ -f "$src" ] || return 0
  # Only re-encrypt if state is newer than encrypted version
  if [ ! -f "$dst" ] || [ "$src" -nt "$dst" ]; then
    tmp="$(mktemp "$dst.XXXXXX")" || return 1
    if sops -e --input-type json --output-type json \
      --hc-vault-transit "$(transit_uri "$name")" \
      --age "$AGE_RECIPIENTS" \
      "$src" > "$tmp"; then
      mv -f -- "$tmp" "$dst"
    else
      rm -f -- "$tmp"
      echo "state-sync: ERROR — failed to encrypt $src" >&2
      return 1
    fi
  fi
}
# Decrypt <dir>/terraform.tfstate.enc to <dir>/terraform.tfstate.
#   $1 - stack state directory
# Uses Vault Transit when a Vault token is available, otherwise the age key
# file ($SOPS_AGE_KEY_FILE, default ~/.config/sops/age/keys.txt).
# No-op when there is no encrypted state. Returns 1 when no key source exists
# or sops fails; the existing plaintext state is only replaced after a fully
# successful decrypt (temp file + rename), never truncated by a failed one.
decrypt_state() {
  local dir="$1"
  local src="$dir/terraform.tfstate.enc"
  local dst="$dir/terraform.tfstate"
  local age_key="${SOPS_AGE_KEY_FILE:-$HOME/.config/sops/age/keys.txt}"
  local tmp rc=0
  [ -f "$src" ] || return 0
  if vault_available; then
    # Vault Transit — per-stack key, no local key needed
    tmp="$(mktemp "$dst.XXXXXX")" || return 1
    sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
  elif [ -f "$age_key" ]; then
    # Fallback: age key on disk (bootstrap / Vault down)
    echo "state-sync: Vault unavailable, falling back to age key" >&2
    tmp="$(mktemp "$dst.XXXXXX")" || return 1
    SOPS_AGE_KEY_FILE="$age_key" \
      sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
  else
    echo "state-sync: ERROR — no Vault token and no age key at $age_key" >&2
    return 1
  fi
  if [ "$rc" -ne 0 ]; then
    rm -f -- "$tmp"
    echo "state-sync: ERROR — failed to decrypt $src" >&2
    return 1
  fi
  mv -f -- "$tmp" "$dst"
}
# Print the help text to stdout.
usage() {
  echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]"
  echo "Operates on Tier 0 stacks only (infra, platform, cnpg, vault, dbaas, external-secrets)."
  echo "Tier 1 stacks use the PG backend and don't need local state sync."
  echo "Encrypt uses per-stack Vault Transit key (transit/keys/sops-state-<stack>)."
  echo "Decrypt uses Vault Transit if logged in, falls back to age key."
}

# Run an action (encrypt_state or decrypt_state) on one stack, or on every
# Tier 0 stack directory when no stack name is given.
#   $1 - function to call with the stack directory
#   $2 - optional stack name ("" = all Tier 0 stacks)
run_on_tier0() {
  local action="$1" only="$2" dir
  if [ -n "$only" ]; then
    if is_tier0 "$only"; then
      "$action" "$STATE_DIR/$only"
    else
      echo "state-sync: skipping Tier 1 stack '$only' (PG backend)" >&2
    fi
    return
  fi
  for dir in "$STATE_DIR"/*/; do
    if is_tier0 "$(stack_name_from_dir "$dir")"; then
      "$action" "$dir"
    fi
  done
}

case "$cmd" in
  encrypt)
    run_on_tier0 encrypt_state "$stack"
    ;;
  decrypt)
    run_on_tier0 decrypt_state "$stack"
    ;;
  commit)
    # Only Tier 0 stacks have encrypted state in git
    "$0" encrypt
    cd "$REPO_ROOT"
    git add state/stacks/*/terraform.tfstate.enc
    if ! git diff --cached --quiet; then
      git commit -m "state: update encrypted terraform state"
    fi
    ;;
  help)
    usage
    ;;
  *)
    # Previously an unknown sub-command fell through silently with exit 0,
    # so a typo like "encrpyt" looked like a successful run.
    echo "state-sync: unknown command '$cmd'" >&2
    usage >&2
    exit 1
    ;;
esac

View file

@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Stop services that may become in a corrupted state if storage is suddenly disconnected
set -euxo pipefail
# Scale a Deployment.
#   $1 - namespace
#   $2 - deployment name
#   $3 - desired replica count
# Arguments are quoted so they are passed to kubectl verbatim (no word
# splitting or globbing). Returns kubectl's exit status.
scale() {
  local namespace="$1" deployment="$2" replicas="$3"
  kubectl scale deployment --replicas="$replicas" --namespace "$namespace" "$deployment"
}
### ============================
### MAIN
### ============================
cmd="${1:-stop}"

# "<namespace> <deployment>" pairs, in the order they are scaled down.
shutdown_order=(
  "redis redis"
  "uptime-kuma uptime-kuma"
  "paperless-ngx paperless-ngx"
  "vaultwarden vaultwarden"
  "immich immich-postgresql"
  "nextcloud nextcloud"
  "monitoring prometheus-server"
  "technitium technitium"
  "dbaas mysql"
  "dbaas postgresql"
)

# Same workloads, in the order they are brought back (databases first).
startup_order=(
  "dbaas mysql"
  "dbaas postgresql"
  "technitium technitium"
  "immich immich-postgresql"
  "nextcloud nextcloud"
  "paperless-ngx paperless-ngx"
  "monitoring prometheus-server"
  "redis redis"
  "uptime-kuma uptime-kuma"
  "vaultwarden vaultwarden"
)

case "$cmd" in
  stop)
    for target in "${shutdown_order[@]}"; do
      scale "${target% *}" "${target#* }" 0
    done
    ;;
  start)
    for target in "${startup_order[@]}"; do
      scale "${target% *}" "${target#* }" 1
    done
    ;;
  *)
    echo "Usage: $0 [stop|start]"
    exit 1
    ;;
esac

View file

@ -0,0 +1,6 @@
# The t3-dispatch service (unprivileged user t3-dispatch) may run ONLY the
# t3-mint wrapper, as root. t3-mint validates the target user against
# /etc/ttyd-user-map and mints a one-time t3 pairing token as that user.
# A compromise of the network-facing dispatch service can therefore mint
# pairing tokens for already-mapped users at most — never arbitrary root.
t3-dispatch ALL=(root) NOPASSWD: /usr/local/bin/t3-mint

View file

@ -0,0 +1,8 @@
# One-shot unit that runs the t3 nightly updater (scripts/t3-autoupdate.sh,
# installed as /usr/local/bin/t3-autoupdate). No User= is set; the script's
# header states it runs as root via this unit.
[Unit]
Description=Track latest t3 nightly (health-checked, idle-only restart)
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
# The script exits 1 when the post-install health check fails (after
# attempting a rollback) and 0 in every other path it handles.
ExecStart=/usr/local/bin/t3-autoupdate

49
scripts/t3-autoupdate.sh Normal file
View file

@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Track the latest t3 nightly — with a health-check + auto-rollback (lesson from
# the Keel auto-update incidents: never blindly trust a new build) and idle-only
# restarts (never kill an in-flight coding session). Runs as root via the unit.
set -uo pipefail
# Log a message to syslog (tag t3-autoupdate) and echo it to stdout.
LOG() {
  local message="$*"
  logger -t t3-autoupdate "$message"
  echo "t3-autoupdate: $message"
}
# Print the installed t3 version: last whitespace-separated field of
# `t3 --version`, with one leading "v" removed. Prints nothing if t3 fails
# without output.
ver() {
  t3 --version 2>/dev/null | awk '{ field = $NF; sub(/^v/, "", field); print field }'
}
before=$(ver)
LOG "current: ${before:-unknown}"

if ! npm i -g t3@nightly >/dev/null 2>&1; then
  LOG "npm install failed; staying on ${before:-current}"
  exit 0
fi

# Reinstall the version that was running before this update and log the outcome.
rollback() {
  if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
    LOG "rolled back to $before"
  else
    LOG "ROLLBACK FAILED — manual fix needed (t3 may be broken)"
  fi
}

after=$(ver)
# A build that cannot even report its version, when the previous one could, is
# a broken install — not "already latest". Roll back instead of leaving it.
if [[ -z "$after" && -n "$before" ]]; then
  LOG "new build reports no version (was $before) — rolling back to $before"
  rollback
  exit 1
fi
if [[ -z "$after" || "$after" == "$before" ]]; then
  LOG "already latest (${before:-?}); nothing to do"; exit 0
fi
LOG "installed $after (was $before); health-checking…"

# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
smoke=$!; ok=0
# Up to 15 probes, 2s apart; the first HTTP 200 from / counts as healthy.
for _ in {1..15}; do
  [[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
  sleep 2
done
kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"

if [[ "$ok" != "1" ]]; then
  LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
  rollback
  exit 1
fi
LOG "health OK; restarting idle instances"

# Restart only IDLE per-user instances; defer any with an active agent child.
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do
  pid=$(systemctl show -p MainPID --value "$unit")
  # "Active" = a direct child of the unit's main PID whose command line
  # mentions claude, codex or opencode.
  if [[ -n "$pid" && "$pid" != 0 ]] && pgrep -aP "$pid" 2>/dev/null | grep -qiE 'claude|codex|opencode'; then
    LOG "deferring $unit (active agent) — updates next cycle when idle"
  else
    systemctl restart "$unit" && LOG "restarted $unit -> $after"
  fi
done
LOG "update complete: $after"

View file

@ -0,0 +1,10 @@
[Unit]
Description=Daily t3 nightly auto-update
[Timer]
OnCalendar=*-*-* 04:00:00
RandomizedDelaySec=1h
Persistent=true
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,15 @@
[Unit]
Description=t3 per-user dispatch + auto-pair (X-authentik-username -> user instance)
After=network.target
[Service]
Type=simple
# Unprivileged dedicated user; the only privileged action is `sudo t3-mint`
# (scoped in /etc/sudoers.d/t3-autopair). Compromise => mint tokens at most.
User=t3-dispatch
ExecStart=/usr/local/bin/t3-dispatch
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,3 @@
module t3-dispatch
go 1.22

139
scripts/t3-dispatch/main.go Normal file
View file

@ -0,0 +1,139 @@
// t3-dispatch: per-user dispatch + auto-pair for t3code.
// Sits behind Traefik+Authentik (which injects X-authentik-username) and routes
// each authenticated user to their own `t3 serve` instance. On a user's first
// visit (no t3 session cookie) it mints a pairing token for that user's instance
// and exchanges it for the session cookie, which it injects into the browser —
// so an Authentik login lands straight in the user's workspace.
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"net/http"
"net/http/httputil"
"net/url"
"os"
"os/exec"
"strings"
"sync"
"time"
)
// entry is one value of the dispatch table decoded from dispatchFile: the OS
// user that owns an instance and the local port that instance is reached on.
type entry struct {
OsUser string `json:"os_user"`
Port int `json:"port"`
}
const (
cookieName = "t3_session" // discovered: apps/server/src/auth/utils.ts (web mode)
listenAddr = ":3780" // address passed to http.ListenAndServe (all interfaces)
dispatchFile = "/etc/t3-serve/dispatch.json" // JSON object: lookup key -> entry
)
var (
// mu guards table; loadTable swaps the whole map under the write lock and
// lookup reads it under the read lock.
mu sync.RWMutex
table map[string]entry
)
// loadTable reads dispatchFile, decodes it into a fresh map and, only if both
// steps succeed, replaces the package-level table under the write lock. On
// error the previously loaded table is left in place.
func loadTable() error {
	raw, readErr := os.ReadFile(dispatchFile)
	if readErr != nil {
		return readErr
	}

	fresh := make(map[string]entry)
	if decodeErr := json.Unmarshal(raw, &fresh); decodeErr != nil {
		return decodeErr
	}

	mu.Lock()
	defer mu.Unlock()
	table = fresh
	return nil
}
// lookup returns the dispatch entry registered under ak and whether one
// exists, reading the table under the read lock.
func lookup(ak string) (entry, bool) {
	mu.RLock()
	found, present := table[ak]
	mu.RUnlock()
	return found, present
}
// autoPair mints a one-time pairing token for the user's instance (as that OS
// user, via the scoped sudoers entry) and exchanges it at the instance's
// /api/auth/bootstrap, relaying the returned t3_session Set-Cookie to the browser.
//
// On success the browser is redirected to "/" with the relayed cookies set.
// Failures are logged and answered with 500 (mint / token parsing) or 502
// (bootstrap call). The bootstrap request is bound to the incoming request's
// context and a fixed timeout so a wedged instance cannot hang the handler.
func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
	// t3-mint (root, via scoped sudoers) validates the OS user is in
	// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
	// runs unprivileged and can invoke nothing else.
	out, err := exec.Command("sudo", "-n", "/usr/local/bin/t3-mint", e.OsUser).Output()
	if err != nil {
		log.Printf("mint for %s failed: %v", e.OsUser, err)
		http.Error(w, "pairing mint failed", http.StatusInternalServerError)
		return
	}

	var pc struct {
		Credential string `json:"credential"` // CLI returns the token under "credential"
	}
	if err := json.Unmarshal(out, &pc); err != nil || pc.Credential == "" {
		// Never log the output itself: it may contain the pairing credential.
		log.Printf("mint for %s produced unparseable output (err=%v)", e.OsUser, err)
		http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
		return
	}

	body, _ := json.Marshal(map[string]string{"credential": pc.Credential})
	endpoint := fmt.Sprintf("http://127.0.0.1:%d/api/auth/bootstrap", e.Port)
	req, err := http.NewRequestWithContext(r.Context(), http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		log.Printf("bootstrap request for %s could not be built: %v", e.OsUser, err)
		http.Error(w, "bootstrap request failed", http.StatusBadGateway)
		return
	}
	req.Header.Set("Content-Type", "application/json")

	// http.Post would use http.DefaultClient, which has no timeout.
	client := &http.Client{Timeout: 15 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		log.Printf("bootstrap for %s failed: %v", e.OsUser, err)
		http.Error(w, "bootstrap request failed", http.StatusBadGateway)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Printf("bootstrap for %s returned %d", e.OsUser, resp.StatusCode)
		http.Error(w, "bootstrap rejected", http.StatusBadGateway)
		return
	}
	for _, c := range resp.Cookies() {
		http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
	}
	http.Redirect(w, r, "/", http.StatusFound)
}
// handler routes one request to the caller's own t3 instance.
//
// The user is taken from the X-authentik-username header, reduced to the part
// before the first '@' (Authentik injects the full email, e.g.
// vbarzin@gmail.com, while dispatch.json keys on the local part — matching the
// terminal stack's tmux-attach.sh `${auth_user%%@*}`). Unknown users get 403.
// A request without the t3 session cookie goes through autoPair; otherwise it
// is reverse-proxied (incl. WebSocket upgrade) to 127.0.0.1:<port>.
func handler(w http.ResponseWriter, r *http.Request) {
	user, _, _ := strings.Cut(r.Header.Get("X-authentik-username"), "@")

	inst, known := lookup(user)
	if !known {
		http.Error(w, "no t3 instance provisioned for this user", http.StatusForbidden)
		return
	}

	_, cookieErr := r.Cookie(cookieName)
	if cookieErr != nil {
		autoPair(inst, w, r)
		return
	}

	// Steady state: hand the request to the user's instance.
	backend, _ := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", inst.Port))
	proxy := httputil.NewSingleHostReverseProxy(backend)
	proxy.ServeHTTP(w, r)
}
// main loads the dispatch table (fatal if that fails), reloads it every 60
// seconds in the background (reload errors are only logged), and serves
// /healthz plus the dispatch handler on listenAddr.
func main() {
	if err := loadTable(); err != nil {
		log.Fatalf("load %s: %v", dispatchFile, err)
	}

	reload := time.NewTicker(60 * time.Second)
	go func() {
		for range reload.C {
			err := loadTable()
			if err != nil {
				log.Printf("reload %s: %v", dispatchFile, err)
			}
		}
	}()

	routes := http.NewServeMux()
	routes.HandleFunc("/", handler)
	routes.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write([]byte("ok\n"))
	})

	log.Printf("t3-dispatch listening on %s", listenAddr)
	log.Fatal(http.ListenAndServe(listenAddr, routes))
}

13
scripts/t3-mint Normal file
View file

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Mint a one-time t3 pairing token for a mapped OS user.
# Runs as root via the scoped sudoers entry for the t3-dispatch service user.
# Validates the requested user is an actual t3 OS user (a value on the RHS of
# /etc/ttyd-user-map) before minting as that user. Prints the t3 CLI JSON.
set -euo pipefail
# $1 is the requested OS user; a missing argument becomes "" and is rejected below.
os_user="${1:-}"
# Syntactic gate first: lowercase letter or "_" followed by at most 31 of
# [a-z0-9_-]. This also rules out a leading "-" and any path or shell
# metacharacters before the value is used anywhere else. Exit 2 on mismatch.
[[ "$os_user" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]] || { echo "invalid user" >&2; exit 2; }
# Must be a mapped t3 OS user (RHS of a non-comment "authentik=os" line).
# awk prints the whitespace-stripped second "="-separated field of every
# non-comment line that has exactly two fields; grep -qxF then requires an
# exact, literal, whole-line match. Exit 3 when the user is not mapped.
awk -F= '!/^[[:space:]]*#/ && NF==2 { gsub(/[[:space:]]/, "", $2); print $2 }' /etc/ttyd-user-map \
| grep -qxF "$os_user" || { echo "user not mapped" >&2; exit 3; }
# Replace this process with the t3 CLI running as the target user; its JSON
# output becomes this script's stdout. The base dir is built as
# /home/<user>/.t3, the same path t3-serve@.service passes to `t3 serve`.
exec runuser -u "$os_user" -- /usr/bin/t3 auth pairing create \
--base-dir "/home/${os_user}/.t3" --ttl 5m --json

View file

@ -0,0 +1,7 @@
# One-shot unit that runs the per-user t3 reconcile script
# (/usr/local/bin/t3-provision-users). The script derives its desired state
# from roster.yaml and regenerates /etc/ttyd-user-map as an output, so the
# description names the roster as the source.
[Unit]
Description=Reconcile per-user t3 instances from roster.yaml
After=network.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/t3-provision-users

View file

@ -0,0 +1,171 @@
#!/usr/bin/env bash
# Reconcile per-user t3 Workstation instances from roster.yaml (the single source
# of truth). roster_engine.py derives the desired state (accounts, per-tier groups,
# sticky ports, /etc/ttyd-user-map, dispatch.json); this script APPLIES it.
#
# ADDITIVE-ONLY for existing users: never removes a group, never replaces a home,
# never re-locks/re-chmods an existing account — so a routine (hourly) reconcile is
# always safe for live users. Destructive offboarding (userdel) is a SEPARATE, gated
# path, never here. Runs hourly as root via t3-provision-users.timer; root has no
# Vault token, so tier validation is best-effort (skipped when k8s_users is unreachable).
#
# DRY_RUN=1 prints actions without mutating. WORKSTATION_DIR overrides the roster/engine location.
set -euo pipefail
WORKSTATION_DIR="${WORKSTATION_DIR:-/home/wizard/code/infra/scripts/workstation}"
ENGINE="$WORKSTATION_DIR/roster_engine.py"
ROSTER="$WORKSTATION_DIR/roster.yaml"
ENVDIR=/etc/t3-serve
MAP=/etc/ttyd-user-map
DRY_RUN="${DRY_RUN:-0}"
# Public infra repo for the locked clone (no auth; the monorepo has no remote).
INFRA_REMOTE="${INFRA_REMOTE:-https://github.com/ViktorBarzin/infra.git}"
# Per-user OIDC kubeconfig (kubelogin/PKCE; cluster server+CA copied from the admin kubeconfig).
OIDC_ISSUER="${OIDC_ISSUER:-https://authentik.viktorbarzin.me/application/o/kubernetes/}"
ADMIN_KUBECONFIG="${ADMIN_KUBECONFIG:-/home/wizard/.kube/config}"
# Print a message to stdout, prefixed with the script tag.
log() {
  local message="$*"
  echo "[t3-provision] $message"
}
# Execute the given command, or — when DRY_RUN is 1 — only print it.
# Returns the command's exit status (or echo's, in dry-run mode).
run() {
  if [[ "$DRY_RUN" != 1 ]]; then
    "$@"
    return
  fi
  echo "[dry-run] $*"
}
# Per-non-admin writable, git-crypt-LOCKED infra clone at ~/code. Keyless +
# filter=cat ⇒ code/docs are plaintext, git-crypt'd secret files stay ciphertext.
# Writable + ungated (push != apply; applies are admin-only). NEVER touches an
# existing ~/code (so emo's symlink survives until the gated cutover).
#
#   $1 - OS user to provision
# Always returns 0: an unknown user, an existing ~/code, DRY_RUN=1 and a failed
# clone are all non-fatal. A failed clone/configure/checkout is logged, the
# partial ~/code this call just created is removed (as the user), and the next
# reconcile retries — so a network outage cannot abort the whole run or leave a
# half-configured clone that is then skipped forever.
install_locked_clone() {
  local user="$1" home
  # `|| true`: under `set -e -o pipefail` a failing getent (user not created
  # yet, e.g. in DRY_RUN) would otherwise abort the script before the guard.
  home="$(getent passwd "$user" | cut -d: -f6 || true)"
  [[ -z "$home" ]] && return 0
  [[ -e "$home/code" || -L "$home/code" ]] && return 0
  if [[ "$DRY_RUN" == 1 ]]; then
    echo "[dry-run] locked infra clone -> $user:$home/code"
    return 0
  fi
  log "clone locked infra -> $user:~/code"
  if runuser -u "$user" -- git clone --quiet --no-checkout "$INFRA_REMOTE" "$home/code" \
    && runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.smudge cat \
    && runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.clean cat \
    && runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.required false \
    && runuser -u "$user" -- git -C "$home/code" checkout --quiet master; then
    return 0
  fi
  log "WARN: locked clone failed for $user -> removing partial ~/code (retried next reconcile)"
  runuser -u "$user" -- rm -rf -- "$home/code" || true
  return 0
}
# Per-user OIDC kubeconfig (kubelogin/PKCE — the `kubernetes` Authentik client is
# public, no secret). Identical for all users: identity comes from each user's own
# interactive OIDC login, which the apiserver maps (email claim) to their RBAC.
# Cluster server + CA are copied from the admin kubeconfig. If-absent, never clobber.
#
#   $1 - OS user to provision
# Returns 0 without writing anything when: the user has no passwd entry, the
# kubeconfig already exists, $ADMIN_KUBECONFIG is unreadable, DRY_RUN=1, or the
# server/CA cannot be read from the admin kubeconfig.
install_user_kubeconfig() {
local user="$1" home kc server ca
# NOTE(review): the script runs with `set -euo pipefail`, so if getent fails
# for a missing user this assignment looks like it aborts the script before
# the empty-home guard on the next line is reached — confirm and consider
# appending `|| true` inside the substitution.
home="$(getent passwd "$user" | cut -d: -f6)"
[[ -z "$home" ]] && return 0
kc="$home/.kube/config"
[[ -f "$kc" ]] && return 0
[[ -r "$ADMIN_KUBECONFIG" ]] || { log "WARN: $ADMIN_KUBECONFIG unreadable -> skip kubeconfig for $user"; return 0; }
if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] OIDC kubeconfig -> $user:$kc"; return 0; fi
# Take the first cluster of the admin kubeconfig's current context.
server="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.server}')"
ca="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.certificate-authority-data}')"
[[ -n "$server" && -n "$ca" ]] || { log "WARN: could not read cluster server/CA -> skip kubeconfig for $user"; return 0; }
# ~/.kube is created owned by the user with mode 0700.
install -d -o "$user" -g "$user" -m 0700 "$home/.kube"
# Unquoted heredoc delimiter: $server, $ca and $OIDC_ISSUER are expanded here.
cat > "$kc" <<EOF
apiVersion: v1
kind: Config
clusters:
- name: homelab
cluster:
server: $server
certificate-authority-data: $ca
contexts:
- name: oidc@homelab
context:
cluster: homelab
user: oidc
current-context: oidc@homelab
users:
- name: oidc
user:
exec:
apiVersion: client.authentication.k8s.io/v1beta1
command: kubectl
args:
- oidc-login
- get-token
- --oidc-issuer-url=$OIDC_ISSUER
- --oidc-client-id=kubernetes
- --oidc-extra-scope=email
- --oidc-extra-scope=profile
- --oidc-extra-scope=groups
interactiveMode: IfAvailable
EOF
# The file was created by root above; hand it to the user and restrict it.
chown "$user:$user" "$kc"; chmod 0600 "$kc"
log "wrote OIDC kubeconfig -> $user:~/.kube/config"
}
# --- Preconditions: root, required tools, roster + engine present -------------
[[ $EUID -eq 0 ]] || { echo "t3-provision-users: must run as root" >&2; exit 1; }
for bin in python3 jq; do command -v "$bin" >/dev/null || { echo "missing $bin" >&2; exit 1; }; done
[[ -f "$ROSTER" && -f "$ENGINE" ]] || { echo "roster/engine not under $WORKSTATION_DIR" >&2; exit 1; }
install -d -m 0755 "$ENVDIR"

# 1) current sticky ports from existing .env files -> {os_user: port}
# All temp files are removed on any exit path.
ports_file="$(mktemp)"; trap 'rm -f "$ports_file" "${desired_file:-}" "${k8s_file:-}"' EXIT
{
  echo "{}"
  for f in "$ENVDIR"/*.env; do
    [[ -e "$f" ]] || continue
    u="$(basename "$f" .env)"
    # `|| true`: an .env without T3_PORT must not fail the pipeline (pipefail).
    p="$(grep -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2 || true)"
    if [[ -n "$p" ]]; then
      jq -n --arg u "$u" --argjson p "$p" '{($u): $p}'
    fi
  done
} | jq -s 'add' > "$ports_file"

# 2) tier validation vs live k8s_users (best-effort; aborts only on a real conflict)
if command -v vault >/dev/null; then
  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
  if k8s_raw="$(vault kv get -field=k8s_users secret/platform 2>/dev/null)"; then
    k8s_file="$(mktemp)"; echo "$k8s_raw" | jq -c 'map_values(.role)' > "$k8s_file"
    if ! python3 "$ENGINE" validate --roster "$ROSTER" --k8s-users-json "$k8s_file"; then
      rm -f "$k8s_file"; echo "[t3-provision] ABORT: roster tier conflicts with k8s_users" >&2; exit 1
    fi
    rm -f "$k8s_file"
  else
    log "WARN: k8s_users unreachable (no Vault token?) -> skipping tier validation"
  fi
fi

# 3) derive desired state
desired_file="$(mktemp)"
python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" > "$desired_file"
jq -e . "$desired_file" >/dev/null || { echo "[t3-provision] derive produced invalid JSON" >&2; exit 1; }

# 4) per-account: create-if-absent + ADDITIVE tier groups (never strip) + locked clone
while IFS=$'\t' read -r os_user tier shell groups_csv; do
  if ! id "$os_user" >/dev/null 2>&1; then
    log "create account: $os_user (shell $shell)"
    run useradd -m -s "$shell" "$os_user"
    run passwd -l "$os_user" # SSO/t3 only — no local password
    run chmod 700 "/home/$os_user"
  fi
  if [[ -n "$groups_csv" ]]; then
    # `|| true`: in DRY_RUN the account may not exist yet; a failing `id`
    # must not abort the run under errexit + pipefail.
    current="$(id -nG "$os_user" 2>/dev/null | tr ' ' '\n' || true)"
    IFS=',' read -ra want <<< "$groups_csv"
    for g in "${want[@]}"; do
      grep -qx "$g" <<< "$current" && continue # already a member -> skip
      getent group "$g" >/dev/null 2>&1 || continue # group must exist
      log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null
    done
  fi
  if [[ "$tier" != admin ]]; then # non-admins: locked ~/code clone + OIDC kubeconfig
    if id "$os_user" >/dev/null 2>&1; then
      install_locked_clone "$os_user"
      install_user_kubeconfig "$os_user"
    else
      # Only reachable in DRY_RUN (a real useradd failure aborts above).
      echo "[dry-run] locked clone + OIDC kubeconfig for new account $os_user"
    fi
  fi
done < <(jq -r '.accounts[] | [.os_user, .tier, .shell, (.groups|join(","))] | @tsv' "$desired_file")

# 5) per-user .env (sticky port) + enable t3-serve@
while IFS=$'\t' read -r os_user port; do
  envf="$ENVDIR/$os_user.env"
  if [[ ! -f "$envf" ]] || ! grep -qx "T3_PORT=$port" "$envf"; then
    # Written directly rather than through a `bash -c` string built from
    # roster-derived values.
    if [[ "$DRY_RUN" == 1 ]]; then
      echo "[dry-run] write T3_PORT=$port -> $envf"
    else
      printf 'T3_PORT=%s\n' "$port" > "$envf"
    fi
  fi
  if id "$os_user" >/dev/null 2>&1; then
    # Best-effort: a unit that fails to start must not stop the reconcile.
    run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 \
      || log "WARN: could not enable/start t3-serve@$os_user.service"
  fi
done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")

# 6) regenerate /etc/ttyd-user-map + dispatch.json from the desired state (SSoT:
# a roster entry removed here DISAPPEARS, which is what the offboarding cut relies on)
if [[ "$DRY_RUN" == 1 ]]; then
  log "[dry-run] would regenerate $MAP + $ENVDIR/dispatch.json"
else
  jq -r '.ttyd_user_map' "$desired_file" > "$MAP.tmp" && install -m 0644 "$MAP.tmp" "$MAP" && rm -f "$MAP.tmp"
  jq -c '.dispatch' "$desired_file" > "$ENVDIR/dispatch.json.tmp" && install -m 0644 "$ENVDIR/dispatch.json.tmp" "$ENVDIR/dispatch.json" && rm -f "$ENVDIR/dispatch.json.tmp"
fi
log "reconcile complete ($([[ "$DRY_RUN" == 1 ]] && echo DRY-RUN || echo applied))"

View file

@ -0,0 +1,10 @@
[Unit]
Description=Periodic t3 per-user reconcile
[Timer]
OnBootSec=2min
OnCalendar=hourly
Persistent=true
[Install]
WantedBy=timers.target

20
scripts/t3-serve@.service Normal file
View file

@ -0,0 +1,20 @@
# Template unit: one `t3 serve` process per OS user; the instance name (%i) is
# the user name and is used for User=/Group=, HOME, the working directory, the
# per-user env file and the base dir.
[Unit]
Description=T3 Code server for %i (t3 serve, per-user)
Documentation=https://github.com/pingdotgg/t3code
After=network.target
[Service]
Type=simple
User=%i
Group=%i
Environment=HOME=/home/%i
Environment=PATH=/usr/local/bin:/usr/bin:/bin:/home/%i/.local/bin
Environment=NODE_ENV=production
# Supplies T3_PORT; written per user by the t3-provision-users script.
EnvironmentFile=/etc/t3-serve/%i.env
WorkingDirectory=/home/%i
# NOTE(review): this binds every interface (0.0.0.0), while t3-dispatch reaches
# instances via 127.0.0.1:<port>. If nothing else needs direct access, binding
# 127.0.0.1 would keep instances reachable only through the dispatch — confirm.
ExecStart=/usr/bin/t3 serve --host 0.0.0.0 --port ${T3_PORT} --base-dir /home/%i/.t3
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

261
scripts/task-processor.sh Executable file
View file

@ -0,0 +1,261 @@
#!/usr/bin/env bash
#
# Task processor for the Forgejo → OpenClaw pipeline.
# Polls Forgejo for new issues in the tasks repo, sends them to OpenClaw
# for processing, and posts results back as comments.
#
# Runs inside the OpenClaw pod via kubectl exec from a CronJob.
#
# Environment:
# FORGEJO_TOKEN — Forgejo API token with repo access
# FORGEJO_URL — Forgejo base URL (default: https://forgejo.viktorbarzin.me)
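# FORGEJO_REPO — Repo in format "owner/repo" (default: viktor/tasks)
# OPENCLAW_URL — OpenAI-compatible API base URL (default: https://integrate.api.nvidia.com)
# FORGEJO_BOT_USER — Forgejo login whose comments count as the bot's own (default: viktor)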
# OPENCLAW_TOKEN — OpenClaw gateway token
# SLACK_WEBHOOK_URL — Optional Slack webhook for notifications
set -euo pipefail
FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
FORGEJO_REPO="${FORGEJO_REPO:-viktor/tasks}"
OPENCLAW_URL="${OPENCLAW_URL:-https://integrate.api.nvidia.com}"
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"
: "${FORGEJO_TOKEN:?FORGEJO_TOKEN is required}"
: "${OPENCLAW_TOKEN:?OPENCLAW_TOKEN is required}"
FORGEJO_BOT_USER="${FORGEJO_BOT_USER:-viktor}"
# Call the Forgejo API with curl (-s silent, -f fail on HTTP errors), adding
# the token and JSON content-type headers. All arguments are passed through
# to curl unchanged; returns curl's exit status.
fg_api() {
  local -a auth_headers=(
    -H "Authorization: token $FORGEJO_TOKEN"
    -H "Content-Type: application/json"
  )
  curl -sf "${auth_headers[@]}" "$@"
}
# Print the numeric id of the repo label named $1, or 0 when it cannot be
# determined (label missing, API call failed, response not valid JSON).
# Always prints exactly one value and returns 0, so callers never see an empty
# id. Only the first 50 labels of the repo are searched.
get_label_id() {
  local label_name="$1" labels_json
  if ! labels_json=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/labels?limit=50"); then
    echo 0
    return 0
  fi
  python3 -c '
import json, sys

wanted = sys.argv[1]
try:
    labels = json.load(sys.stdin)
    found = next((l["id"] for l in labels if l["name"] == wanted), 0)
except Exception:
    found = 0
print(found)
' "$label_name" <<< "$labels_json" || echo 0
}
# Attach the label named $2 to issue $1 (best-effort; always returns 0).
# The API is only called when the lookup produced a positive numeric id — an
# empty or malformed id (failed lookup) is treated the same as "not found"
# instead of being sent to the API.
add_label() {
  local issue_id="$1" label_name="$2"
  local label_id
  label_id=$(get_label_id "$label_name") || true
  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
    fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels" \
      -d "{\"labels\":[$label_id]}" > /dev/null 2>&1 || true
  else
    echo "  WARN: label '$label_name' not found; not added to issue #$issue_id" >&2
  fi
}
# Detach the label named $2 from issue $1 (best-effort; always returns 0).
# The DELETE is only issued for a positive numeric id: with an empty id the URL
# would end in ".../labels/" and no longer address a single label.
remove_label() {
  local issue_id="$1" label_name="$2"
  local label_id
  label_id=$(get_label_id "$label_name") || true
  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
    fg_api -X DELETE \
      "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels/$label_id" \
      > /dev/null 2>&1 || true
  fi
}
# Post a comment on issue $1. The comment text is read from stdin (so no shell
# quoting is involved), wrapped as {"body": ...} JSON and POSTed via fg_api.
# Output is discarded; returns the pipeline's exit status.
post_comment() {
  local issue_id="$1"
  local endpoint="$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments"
  python3 -c 'import json, sys; print(json.dumps({"body": sys.stdin.read()}))' \
    | fg_api "$endpoint" -d @- > /dev/null 2>&1
}
# Close issue $1 by PATCHing its state to "closed". Output is discarded;
# returns fg_api's exit status.
close_issue() {
  local issue_id="$1"
  local endpoint="$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id"
  fg_api "$endpoint" -X PATCH -d '{"state": "closed"}' > /dev/null 2>&1
}
# Print the comment history of issue $1 as plain text for use as prompt context.
# Fetches up to 20 comments and emits one paragraph per entry, separated by
# blank lines:
#   - comments by $FORGEJO_BOT_USER that contain "## OpenClaw Task Result" are
#     reduced to "[Previous AI response]: <first 500 chars of the content>"
#     (heading, "---" and "*Processed" lines dropped); other bot comments are
#     omitted entirely;
#   - every other comment becomes "[<login>]: <body>".
# stderr of both the API call and the parser is discarded, so a failure simply
# yields empty output.
get_comment_history() {
local issue_id="$1"
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments?limit=20" 2>/dev/null | \
python3 -c "
import sys, json
bot_user = sys.argv[1]
comments = json.load(sys.stdin)
history = []
for c in comments:
user = c.get('user', {}).get('login', 'unknown')
body = c.get('body', '')
# Skip bot's own comments to keep context clean
if user == bot_user:
# Include a short summary of previous responses
if '## OpenClaw Task Result' in body:
# Extract just the result content (skip header/footer)
lines = body.split('\n')
content = [l for l in lines if not l.startswith('## ') and not l.startswith('---') and not l.startswith('*Processed')]
summary = '\n'.join(content).strip()[:500]
if summary:
history.append(f'[Previous AI response]: {summary}')
else:
history.append(f'[{user}]: {body}')
print('\n\n'.join(history))
" "$FORGEJO_BOT_USER" 2>/dev/null
}
# Send $1 as a Slack message when SLACK_WEBHOOK_URL is set; otherwise do
# nothing. Best-effort: delivery failures are ignored and the function
# always returns 0.
notify_slack() {
  [ -n "$SLACK_WEBHOOK_URL" ] || return 0
  python3 -c 'import json, sys; print(json.dumps({"text": sys.argv[1]}))' "$1" \
    | curl -sf -X POST "$SLACK_WEBHOOK_URL" \
      -H "Content-Type: application/json" -d @- > /dev/null 2>&1 \
    || true
}
# Mark issue $1 as failed: swap the "processing" label for "failed" and send
# the Slack notification $2. Best-effort, like the label helpers it calls.
_fail_issue() {
  local issue_id="$1" slack_text="$2"
  add_label "$issue_id" "failed"
  remove_label "$issue_id" "processing"
  notify_slack "$slack_text"
}

# Process one task issue end to end.
#   $1 - issue number   $2 - title   $3 - body   $4 - author login
# Labels the issue "processing", sends title/body/comment history to the
# chat-completions endpoint, posts the answer as a comment, then labels the
# issue "completed" and closes it.
# Returns 1 (issue labelled "failed", left open) when the API call fails, the
# response cannot be parsed, or the result comment cannot be posted — an issue
# is never closed without its result having been delivered.
process_issue() {
  local issue_id="$1" title="$2" body="$3" author="$4"
  echo "Processing issue #$issue_id: $title (by $author)"

  # Mark as processing
  add_label "$issue_id" "processing"
  remove_label "$issue_id" "pending"
  remove_label "$issue_id" "completed"

  # Fetch comment history for context
  local comment_history
  comment_history=$(get_comment_history "$issue_id")

  # Call OpenClaw gateway API (OpenAI-compatible chat completions)
  # Use python to safely build the JSON payload
  local response
  response=$(python3 -c "
import json, sys
title = sys.argv[1]
body = sys.argv[2]
author = sys.argv[3]
comment_history = sys.argv[4]
prompt = f'''You are processing a task submitted by {author} via the Forgejo task queue.
Task title: {title}
Task description:
{body}'''
if comment_history.strip():
    prompt += f'''
Conversation history (follow-up comments):
{comment_history}
The latest comment is the most recent request. Address it in context of the original task and prior conversation.'''
prompt += '''
Please execute this task. When done, provide a clear summary of what was done and any results.
If the task requires infrastructure changes, describe what changes would be needed but do NOT apply them automatically — list the commands/changes for review.'''
payload = {
    'model': 'mistralai/mistral-large-3-675b-instruct-2512',
    'messages': [
        {'role': 'system', 'content': 'You are an infrastructure AI assistant. Process the task and provide actionable results. Be concise.'},
        {'role': 'user', 'content': prompt}
    ],
    'max_tokens': 8192,
    'temperature': 0.3
}
print(json.dumps(payload))
" "$title" "$body" "$author" "$comment_history" | \
    curl -sf --max-time 300 \
      -H "Authorization: Bearer $OPENCLAW_TOKEN" \
      -H "Content-Type: application/json" \
      "$OPENCLAW_URL/v1/chat/completions" \
      -d @- 2>&1) || {
    echo " ERROR: OpenClaw API call failed"
    echo "Failed to process this task. OpenClaw API returned an error. Please check the CronJob logs or process manually." | \
      post_comment "$issue_id"
    _fail_issue "$issue_id" ":x: Task #$issue_id failed: $title"
    return 1
  }

  # Extract the response content. The parser always prints a comment body and
  # exits non-zero when the response could not be parsed.
  local comment parsed=1
  comment=$(python3 -c "
import sys, json
status = 0
try:
    data = json.load(sys.stdin)
    msg = data['choices'][0]['message']
    # Some models put content in reasoning_content instead of content
    result = msg.get('content') or msg.get('reasoning_content') or msg.get('reasoning') or 'No response generated.'
except Exception as e:
    result = f'Error parsing OpenClaw response: {e}'
    status = 1
body = f'## OpenClaw Task Result\n\n{result}\n\n---\n*Processed automatically by the OpenClaw task pipeline.*'
print(body)
sys.exit(status)
" <<< "$response") || parsed=0

  # Post the comment; do not close the issue if the result never arrived.
  if ! printf '%s\n' "$comment" | post_comment "$issue_id"; then
    echo " ERROR: could not post result comment on issue #$issue_id"
    _fail_issue "$issue_id" ":x: Task #$issue_id failed (result comment not posted): $title"
    return 1
  fi
  if [ "$parsed" != "1" ]; then
    echo " ERROR: OpenClaw response could not be parsed"
    _fail_issue "$issue_id" ":x: Task #$issue_id failed (unparseable response): $title"
    return 1
  fi

  # Update labels and close
  add_label "$issue_id" "completed"
  remove_label "$issue_id" "processing"
  close_issue "$issue_id"
  echo " Issue #$issue_id processed and closed"
  notify_slack ":white_check_mark: Task #$issue_id completed: $title"
}
# --- Main ---
# One polling pass: list open issues, keep those not labelled "processing",
# and run process_issue on each in turn.
echo "=== Task Processor $(date -u +%Y-%m-%dT%H:%M:%SZ) ==="
# List open issues
# At most 10 per run, oldest first (limit=10, sort=created, direction=asc).
ISSUES=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues?state=open&type=issues&limit=10&sort=created&direction=asc" 2>/dev/null) || {
echo "ERROR: Could not fetch issues from Forgejo"
exit 1
}
# Parse pending issues into a temp file (avoids delimiter issues)
# One JSON object per line; issue bodies are truncated to 4000 characters.
# NOTE(review): the filter below only checks for the "processing" label — the
# "completed"/reopened logic described in its inline comment is not implemented.
# An open issue labelled "failed" is therefore picked up again on every run,
# and one stuck with "processing" is never retried; confirm this is intended.
PENDING_FILE=$(mktemp)
trap 'rm -f "$PENDING_FILE"' EXIT
python3 -c "
import sys, json
issues = json.load(sys.stdin)
for issue in issues:
labels = [l['name'] for l in issue.get('labels', [])]
# Process if: no processing label AND (no completed label OR issue was reopened)
if 'processing' not in labels:
# Write each issue as a JSON line
print(json.dumps({
'id': issue['number'],
'title': issue['title'],
'body': (issue.get('body') or '')[:4000],
'author': issue['user']['login']
}))
" <<< "$ISSUES" > "$PENDING_FILE"
ISSUE_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')
if [ "$ISSUE_COUNT" = "0" ]; then
echo "No pending issues to process"
exit 0
fi
echo "Found $ISSUE_COUNT pending issue(s)"
# Process each pending issue (one JSON object per line)
# Each field is extracted with its own python3 call; `|| true` keeps one
# failing issue from stopping the rest of the batch.
while IFS= read -r line; do
issue_id=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['id'])" "$line")
title=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['title'])" "$line")
body=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['body'])" "$line")
author=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['author'])" "$line")
process_issue "$issue_id" "$title" "$body" "$author" || true
done < "$PENDING_FILE"
echo "=== Task processing complete ==="

View file

@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Unit tests for the pure functions in fan-control.sh.
# Sources the script (main is guarded), exercises curve/decide/resolve/presence/parse.
# Run: bash infra/scripts/test-fan-control.sh
# No `-e` here: failed comparisons are tallied by eq() rather than aborting.
set -uo pipefail
# Resolve this script's own directory so the test can be run from any cwd.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=/dev/null
source "$DIR/fan-control.sh"
# Global tallies updated by eq(); they drive the summary and exit status.
pass=0 fail=0
# eq <description> <expected> <actual>
# Literal (non-glob) string comparison. Increments the global `pass` counter on
# a match; otherwise increments `fail` and prints a FAIL line to stdout.
eq() {
  local label=$1 want=$2 got=$3
  if [[ "$want" != "$got" ]]; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected [%s] got [%s]\n' "$label" "$want" "$got"
    return
  fi
  pass=$((pass + 1))
}
# Every expected value below is a pinned output of a function sourced from
# fan-control.sh; the arguments follow each function's positional signature.
# --- COOL curve (continuous linear: 30% @50C .. 100% @83C) ---
eq "cool <=T_LO clamps" 30 "$(fc_curve cool 40)"
eq "cool 50 -> 30" 30 "$(fc_curve cool 50)"
eq "cool 55 -> 41" 41 "$(fc_curve cool 55)"
eq "cool 60 -> 51" 51 "$(fc_curve cool 60)"
eq "cool 64 -> 60" 60 "$(fc_curve cool 64)"
eq "cool 70 -> 72" 72 "$(fc_curve cool 70)"
eq "cool 75 -> 83" 83 "$(fc_curve cool 75)"
eq "cool 83 -> 100" 100 "$(fc_curve cool 83)"
eq "cool >=T_HI clamps" 100 "$(fc_curve cool 90)"
# --- QUIET curve (continuous linear: 20% @68C .. 100% @83C) ---
eq "quiet <=T_LO clamps" 20 "$(fc_curve quiet 60)"
eq "quiet 68 -> 20" 20 "$(fc_curve quiet 68)"
eq "quiet 70 -> 31" 31 "$(fc_curve quiet 70)"
eq "quiet 75 -> 57" 57 "$(fc_curve quiet 75)"
eq "quiet 80 -> 84" 84 "$(fc_curve quiet 80)"
eq "quiet 83 -> 100" 100 "$(fc_curve quiet 83)"
# --- decide: asymmetric hysteresis (ramp up now, ease down only past the deadband) ---
# NOTE(review): argument order appears to be <curve> <temp> <current%> <deadband>,
# with -1 meaning "no current value yet" — confirm against fan-control.sh.
eq "decide uninit -> target" 68 "$(fc_decide cool 68 -1 3)"
eq "decide ramp up now" 68 "$(fc_decide cool 68 25 3)"
eq "decide equal holds" 62 "$(fc_decide cool 65 62 3)"
eq "decide down held" 72 "$(fc_decide cool 68 72 3)" # curve(68)=68<72 but curve(71)=75 !<72 -> hold
eq "decide down past" 60 "$(fc_decide cool 64 72 3)" # curve(64)=60, curve(67)=66<72 -> drop
# --- fc_clamp / fc_resolve: HA mode resolution ---
eq "clamp over 100" 100 "$(fc_clamp 150)"
eq "clamp under 0" 0 "$(fc_clamp -5)"
eq "clamp passthrough" 45 "$(fc_clamp 45)"
eq "resolve manual=slider" 42 "$(fc_resolve manual 64 42 cool -1 3)"
eq "resolve manual clamped" 100 "$(fc_resolve manual 64 150 cool -1 3)"
eq "resolve cool=cool curve" 51 "$(fc_resolve cool 60 0 cool -1 3)"
eq "resolve quiet=quiet curve" 73 "$(fc_resolve quiet 78 0 cool -1 3)"
eq "resolve auto+empty=cool" 51 "$(fc_resolve auto 60 0 cool -1 3)"
eq "resolve auto+present=quiet" 31 "$(fc_resolve auto 70 0 quiet -1 3)"
# --- fc_fan_watts: estimated fan power from RPM (cube-law, calibrated to the sweep) ---
eq "fan_watts 0" 0 "$(fc_fan_watts 0)"
eq "fan_watts 4800" 2 "$(fc_fan_watts 4800)"
eq "fan_watts 9360" 16 "$(fc_fan_watts 9360)"
eq "fan_watts 12720" 42 "$(fc_fan_watts 12720)"
eq "fan_watts 16920" 99 "$(fc_fan_watts 16920)"
# --- presence ---
# `now` is a fixed fake clock so the age arithmetic is deterministic. The
# "edge" case pins that an age of exactly 900s already counts as stale (cool).
now=1000000
eq "presence open -> quiet" quiet "$(fc_presence_mode Отворена 0 $now 900 Отворена)"
eq "presence closed recent -> quiet" quiet "$(fc_presence_mode Затворена $((now - 100)) $now 900 Отворена)"
eq "presence closed stale -> cool" cool "$(fc_presence_mode Затворена $((now - 1000)) $now 900 Отворена)"
eq "presence closed edge -> cool" cool "$(fc_presence_mode Затворена $((now - 900)) $now 900 Отворена)"
# --- temp parsing ---
# The second case is labelled "7C" but its fixture line carries 72 degrees.
eq "parse temp line" 74 "$(fc_parse_temp 'Temp | 0Eh | ok | 3.1 | 74 degrees C')"
eq "parse temp 7C" 72 "$(fc_parse_temp 'Temp | 0Eh | ok | 3.1 | 72 degrees C')"
# --- json field (jq-free) ---
# Fixture mirrors a Home Assistant state object with a non-ASCII state value.
J='{"entity_id":"sensor.garage_door_state_bg","state":"Отворена","attributes":{"friendly_name":"Garage Door State BG"},"last_changed":"2026-06-04T16:55:20.517745+00:00","last_updated":"2026-06-04T16:55:20.517745+00:00"}'
eq "json state" "Отворена" "$(fc_json_str_field "$J" state)"
eq "json last_changed" "2026-06-04T16:55:20.517745+00:00" "$(fc_json_str_field "$J" last_changed)"
# --- hex conversion ---
# Output is expected as lowercase hex, zero-padded to two digits.
eq "hex 20" 0x14 "$(fc_pct_to_hex 20)"
eq "hex 45" 0x2d "$(fc_pct_to_hex 45)"
eq "hex 100" 0x64 "$(fc_pct_to_hex 100)"
eq "hex 5" 0x05 "$(fc_pct_to_hex 5)"
# Summary line; the final arithmetic test makes the script exit 0 only when
# no comparison failed.
printf '\n%d passed, %d failed\n' "$pass" "$fail"
(( fail == 0 ))

View file

@ -0,0 +1,57 @@
#!/usr/bin/env bash
# Unit tests for the pure drift-guard functions in vault-token-renew.sh.
# Sources the script (vtr_main is guarded) and exercises the decision logic that
# decides whether ~/.vault-token is OUR periodic admin token (renew) or a foreign
# token that clobbered the file (refuse, fail loud). This is exactly the logic
# whose ABSENCE let the 2026-06-05 woodpecker-token clobber be silently renewed
# for two days. Run: bash infra/scripts/test-vault-token-renew.sh
# No `-e` here: the ok/no/eq helpers tally failures instead of aborting.
set -uo pipefail
# Resolve this script's own directory so the test can be run from any cwd.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=/dev/null
source "$DIR/vault-token-renew.sh"
# Global tallies updated by the helpers; they drive the summary and exit status.
pass=0 fail=0
# ok <description> <cmd...>
# Runs <cmd...> and expects it to succeed (renew-OK). Increments the global
# `pass` counter on success; otherwise increments `fail` and prints a FAIL line.
ok() {
  local _desc=$1
  shift
  if ! "$@"; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected OK, got refuse\n' "$_desc"
    return
  fi
  pass=$((pass + 1))
}
# no <description> <cmd...>
# Runs <cmd...> and expects it to fail (drift, refuse). Increments the global
# `pass` counter on failure; increments `fail` and prints a FAIL line if the
# command unexpectedly succeeds.
no() {
  local _desc=$1
  shift
  if ! "$@"; then
    pass=$((pass + 1))
    return
  fi
  fail=$((fail + 1))
  printf 'FAIL: %s — expected DRIFT, got OK\n' "$_desc"
}
# eq <description> <expected> <actual>
# Literal (non-glob) string comparison. Increments the global `pass` counter on
# a match; otherwise increments `fail` and prints a FAIL line to stdout.
eq() {
  local label=$1 want=$2 got=$3
  if [[ "$want" != "$got" ]]; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected [%s] got [%s]\n' "$label" "$want" "$got"
    return
  fi
  pass=$((pass + 1))
}
# --- vtr_drift_ok: ONLY our periodic admin token (right name AND vault-admin) renews ---
# Arguments are <display_name> <comma-separated policies>.
ok "our token renews" vtr_drift_ok token-devvm-wizard "default,sops-admin,vault-admin"
ok "vault-admin anywhere in list" vtr_drift_ok token-devvm-wizard "default,vault-admin"
ok "policy order irrelevant" vtr_drift_ok token-devvm-wizard "vault-admin,default"
no "woodpecker clobber refused" vtr_drift_ok kubernetes-woodpecker-default "ci,default,terraform-state"
no "oidc token (admin but wrong dn)" vtr_drift_ok oidc-vbarzin "default,sops-admin,vault-admin"
no "right name, no vault-admin" vtr_drift_ok token-devvm-wizard "default,sops-admin"
no "empty display_name" vtr_drift_ok "" "vault-admin"
no "empty policies" vtr_drift_ok token-devvm-wizard ""
# Guards against substring matching: "vault-admin-ro" must not count as "vault-admin".
no "no substring false-positive" vtr_drift_ok token-devvm-wizard "default,vault-admin-ro"
# --- vtr_display_name / vtr_policies_csv: parse real `vault token lookup -format=json` ---
# Three fixtures: our token, an OIDC token whose admin rights come from
# identity_policies, and the woodpecker CI token.
LOOKUP_OURS='{"data":{"display_name":"token-devvm-wizard","policies":["default","sops-admin","vault-admin"],"identity_policies":null}}'
LOOKUP_OIDC='{"data":{"display_name":"oidc-vbarzin","policies":["default"],"identity_policies":["sops-admin","vault-admin"]}}'
LOOKUP_WP='{"data":{"display_name":"kubernetes-woodpecker-default","policies":["ci","default","terraform-state"],"identity_policies":[]}}'
eq "dn ours" "token-devvm-wizard" "$(vtr_display_name "$LOOKUP_OURS")"
eq "dn oidc" "oidc-vbarzin" "$(vtr_display_name "$LOOKUP_OIDC")"
eq "pols ours" "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OURS")"
eq "pols oidc merges token+identity" "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OIDC")"
eq "pols woodpecker" "ci,default,terraform-state" "$(vtr_policies_csv "$LOOKUP_WP")"
# --- parse + decide end-to-end (the real lookup-JSON -> renew/refuse path) ---
ok "ours: parse+decide renews" vtr_drift_ok "$(vtr_display_name "$LOOKUP_OURS")" "$(vtr_policies_csv "$LOOKUP_OURS")"
no "woodpecker: parse+decide refused" vtr_drift_ok "$(vtr_display_name "$LOOKUP_WP")" "$(vtr_policies_csv "$LOOKUP_WP")"
no "oidc: parse+decide refused" vtr_drift_ok "$(vtr_display_name "$LOOKUP_OIDC")" "$(vtr_policies_csv "$LOOKUP_OIDC")"
# Summary line; the final arithmetic test makes the script exit 0 only when
# no check failed.
printf '\n%d passed, %d failed\n' "$pass" "$fail"
(( fail == 0 ))

169
scripts/tg Executable file
View file

@ -0,0 +1,169 @@
#!/usr/bin/env bash
# scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
# Usage: scripts/tg apply --non-interactive
# scripts/tg plan
# Auth: `vault login -method=oidc` (token at ~/.vault-token)
set -euo pipefail

# Repository root is the parent of the directory that holds this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SYNC="$REPO_ROOT/scripts/state-sync"

# Enable provider cache (shared across stacks)
export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
mkdir -p "$TF_PLUGIN_CACHE_DIR"

# Determine stack name from cwd (relative to stacks/). STACK_NAME stays empty
# when the wrapper is invoked from outside "$REPO_ROOT/stacks/".
STACK_NAME=""
cwd="$(pwd)"
stacks_dir="$REPO_ROOT/stacks"
if [[ "$cwd" == "$stacks_dir"/* ]]; then
  # The prefix is quoted inside the expansion so that glob characters in the
  # checkout path (e.g. "[", "*", "?") are stripped literally, not as a pattern.
  rel="${cwd#"$stacks_dir"/}"
  # First path component below stacks/ is the stack name.
  STACK_NAME="${rel%%/*}"
fi
# ── Tier detection ──
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# is_tier0 <stack>
# Returns 0 when <stack> is exactly one of the space-separated names in
# TIER0_STACKS, 1 otherwise (including for an empty or missing argument).
# The comparison is a literal string match: the argument is never interpreted
# as a regular expression or as a command-line option.
is_tier0() {
  local candidate=${1:-} name
  [[ -n "$candidate" ]] || return 1
  # Word-splitting TIER0_STACKS is intentional: it is a space-separated list.
  for name in $TIER0_STACKS; do
    if [[ "$name" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800 # 30 minutes — stale lock threshold

# acquire_lock <stack>
# Takes the advisory lock stored at secret/locks/<stack> in Vault KV.
#   - Returns 1 (after printing who holds it) when a lock younger than
#     LOCK_MAX_AGE seconds exists.
#   - Otherwise writes locked=true, a holder string (pid/host/user) and the
#     current epoch, breaking a stale lock with a warning if there was one.
# The check-then-put is not atomic, so this is advisory only.
acquire_lock() {
  local stack="$1"
  local holder
  holder="pid=$$,host=$(hostname -s 2>/dev/null || echo unknown),user=$(whoami 2>/dev/null || echo unknown)"

  # Check if lock exists and is not stale. A failed read (no lock, no access)
  # is treated as "no lock", and a failing jq falls back to the same defaults.
  local existing locked acquired existing_holder
  existing=$(vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')
  locked=$(jq -r '.data.data.locked // "false"' <<<"$existing") || locked="false"
  acquired=$(jq -r '.data.data.acquired // "0"' <<<"$existing") || acquired="0"
  existing_holder=$(jq -r '.data.data.holder // ""' <<<"$existing") || existing_holder=""

  # `acquired` comes from Vault and is used in arithmetic below. Bash evaluates
  # arithmetic operands recursively, so anything other than plain digits could
  # run commands or abort the script; treat such values as epoch 0 (stale).
  if ! [[ "$acquired" =~ ^[0-9]+$ ]]; then
    acquired=0
  fi

  if [ "$locked" = "true" ]; then
    local now age
    now=$(date +%s)
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    age=$((now - 10#$acquired))
    if [ "$age" -lt "$LOCK_MAX_AGE" ]; then
      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)"
      echo " Wait for it to finish or run: vault kv delete secret/locks/$stack"
      return 1
    fi
    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)"
  fi

  vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
}
# release_lock <stack>
# Best-effort removal of the advisory lock at secret/locks/<stack>. All output
# from vault is discarded and the function always returns 0.
release_lock() {
  local lock_key="secret/locks/$1"
  if ! vault kv delete "$lock_key" >/dev/null 2>&1; then
    : # failure to delete is deliberately ignored
  fi
}
# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
# Skipped entirely when the wrapper is not run from inside a stack directory.
if [ -n "$STACK_NAME" ]; then
  if is_tier0 "$STACK_NAME"; then
    # Tier 0: SOPS-encrypted local state
    if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
      "$SYNC" decrypt "$STACK_NAME"
    fi
  else
    # Tier 1: PG backend — fetch credentials from Vault, unless the caller
    # already exported PG_CONN_STR.
    if [ -z "${PG_CONN_STR:-}" ]; then
      # Pre-flight: vault CLI must be available. Previously CI failed with a
      # misleading "Cannot read PG credentials" message because the Alpine CI
      # image lacked the vault binary and the real "vault: not found" error
      # was swallowed. Fail fast with a clear message instead.
      if ! command -v vault >/dev/null 2>&1; then
        echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
        exit 1
      fi
      VAULT_OUT=$(vault read -format=json database/static-creds/pg-terraform-state 2>&1) || {
        echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
        echo "$VAULT_OUT" >&2
        echo "" >&2
        echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
        exit 1
      }
      # Percent-encode both parts (@uri) so characters that are special in a
      # connection URI (@ : / ? #) cannot corrupt it, and map a missing field
      # to an empty string instead of the literal text "null".
      PG_USER=$(jq -r '.data.username // empty | @uri' <<<"$VAULT_OUT") || PG_USER=""
      PG_PASS=$(jq -r '.data.password // empty | @uri' <<<"$VAULT_OUT") || PG_PASS=""
      if [ -z "$PG_USER" ] || [ -z "$PG_PASS" ]; then
        echo "ERROR: Vault response for database/static-creds/pg-terraform-state has no username/password." >&2
        exit 1
      fi
      export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
    fi
  fi
fi
# Classify the requested operation in a single pass over the arguments.
#   is_mutating — apply/destroy/import/state: verbs that can change state.
#   is_tf_op    — plan/apply/destroy/refresh: verbs that read or write infra
#                 state and therefore trigger the ingress auth-comment
#                 pre-flight check further down. Other tg verbs
#                 (init, fmt, validate) skip that check.
is_mutating=false
is_tf_op=false
for arg in "$@"; do
  case "$arg" in
    apply|destroy)
      is_mutating=true
      is_tf_op=true
      ;;
    import|state)
      is_mutating=true
      ;;
    plan|refresh)
      is_tf_op=true
      ;;
  esac
done
# Anti-exposure guard: every `auth = "app"` or `auth = "none"` in this stack
# must have a preceding `# auth = "<tier>":` comment documenting what gates
# the app or why the endpoint is intentionally public. See:
# - infra/modules/kubernetes/ingress_factory/main.tf (variable description)
# - infra/.claude/CLAUDE.md "Auth" section
# The check is scoped to the current stack only, so stacks nobody is working
# in are never blocked by it.
if [[ "$is_tf_op" == true && -n "$STACK_NAME" ]]; then
  "$REPO_ROOT/scripts/check-ingress-auth-comments.py" "$REPO_ROOT/stacks/$STACK_NAME" || exit 1
fi

# Acquire lock for mutating operations (Tier 0 only — Tier 1 uses pg_advisory_lock)
# The lock is taken only when the vault CLI exists AND VAULT_TOKEN is exported;
# it is released by the EXIT trap on every exit path.
# NOTE(review): a token that lives only in ~/.vault-token does not satisfy the
# VAULT_TOKEN test, so such runs proceed without a lock — confirm this is intended.
if [[ "$is_mutating" == true && -n "$STACK_NAME" ]] && is_tier0 "$STACK_NAME"; then
  if command -v vault >/dev/null 2>&1 && [[ -n "${VAULT_TOKEN:-}" ]]; then
    acquire_lock "$STACK_NAME"
    trap 'release_lock "$STACK_NAME"' EXIT
  fi
fi
# If running apply with --non-interactive, add -auto-approve for Terraform.
# The arguments are scanned via "$@" directly: expanding an empty array under
# `set -u` aborts on bash < 4.4, which made a bare `tg` invocation fail.
has_apply=false
has_non_interactive=false
for arg in "$@"; do
  case "$arg" in
    apply) has_apply=true ;;
    --non-interactive) has_non_interactive=true ;;
  esac
done

# Capture terragrunt's exit status instead of letting `set -e` abort here: a
# failed apply can still have changed the state, and for Tier 0 stacks that
# state must be re-encrypted and committed below before the script exits.
tg_rc=0
if $has_apply && $has_non_interactive; then
  new_args=()
  for arg in "$@"; do
    new_args+=("$arg")
    if [ "$arg" = "apply" ]; then
      new_args+=("-auto-approve")
    fi
  done
  terragrunt "${new_args[@]}" || tg_rc=$?
else
  terragrunt "$@" || tg_rc=$?
fi

# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is authoritative)
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
  "$SYNC" encrypt "$STACK_NAME"
  cd "$REPO_ROOT"
  state_file="state/stacks/$STACK_NAME/terraform.tfstate.enc"
  git add "$state_file"
  # Both commands are limited to the state file, so unrelated changes the
  # operator already staged are neither detected as "state changed" nor
  # swept into the state commit.
  if ! git diff --cached --quiet -- "$state_file"; then
    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_file"
  fi
fi

# Propagate terragrunt's result to the caller.
exit "$tg_rc"

View file

@ -0,0 +1,28 @@
#!/usr/bin/env bash
set -e

# Flip the "istio-injection" label value from <from> to <to> in every file
# under the current directory, run `terraform apply`, then restart the
# deployments of each affected namespace so the change takes effect.
# Usage: <script> <from> <to>
from=${1:-}
to=${2:-}
if [ -z "$from" ] || [ -z "$to" ]; then
  echo 'pass 2 positional parameters - $from and $to'
  exit 1
fi

# Both values are spliced into the sed expression below, so restrict them to
# characters that are special to neither sed nor the shell.
value_re='^[A-Za-z0-9_-]+$'
if ! [[ "$from" =~ $value_re && "$to" =~ $value_re ]]; then
  echo 'from/to may only contain letters, digits, "-" and "_"' >&2
  exit 1
fi

# Update terraform modules
# Candidate files: those with at least one matching line that has no '#'.
# The list is de-duplicated and read line by line, so a file with several
# matches is processed once and paths containing spaces stay intact.
needle="\"istio-injection\" : \"$from\""
namespaces=()
while IFS= read -r file; do
  [ -n "$file" ] || continue
  echo "$file"
  sed -i "s/istio-injection\" : \"$from\"/istio-injection\" : \"$to\"/" "$file"
  # Namespace is the 4th '/'-separated path component (./a/b/<ns>/...).
  ns=$(cut -d'/' -f 4 <<<"$file")
  if [ -z "$ns" ]; then
    echo "WARNING: cannot derive namespace from path: $file" >&2
    continue
  fi
  # Record each namespace once so its deployments are restarted once.
  seen=false
  for known in "${namespaces[@]}"; do
    if [ "$known" = "$ns" ]; then
      seen=true
      break
    fi
  done
  if [ "$seen" = false ]; then
    namespaces+=("$ns")
  fi
done < <(grep -rniF -- "$needle" . | grep -v '#' | cut -d':' -f1 | sort -u)

# Apply changes
terraform apply -auto-approve

# Restart deployments
# kubectl is invoked directly rather than through a command string handed to
# `bash -c`; `xargs -r` skips the restart when a namespace has no deployments.
for ns in "${namespaces[@]}"; do
  echo "kubectl -n $ns get deployments --no-headers | awk '{print \$1}' | xargs kubectl -n $ns rollout restart deployment"
  kubectl -n "$ns" get deployments --no-headers \
    | awk '{print $1}' \
    | xargs -r kubectl -n "$ns" rollout restart deployment
done

123
scripts/update_k8s.sh Executable file
View file

@ -0,0 +1,123 @@
#!/usr/bin/env bash
#
# K8s component upgrader. Run on a single node (master OR worker) at a time.
# The caller is responsible for:
# - draining + uncordoning the node (this script does not touch kubectl)
# - sequencing nodes (master first, then workers one at a time)
# - pre-flight checks (etcd snapshot, halt-on-alert, etc)
#
# Used by:
# - the k8s-version-upgrade agent (infra/.claude/agents/k8s-version-upgrade.md)
# - manual operators following the runbook (infra/docs/runbooks/k8s-version-upgrade.md)
#
# Old manual orchestration loop (kept for reference — the agent does the
# equivalent now):
# for n in $(kbn | grep 'k8s-node' | awk '{print $1}'); do
# kb drain $n --ignore-daemonsets --delete-emptydir-data
# s wizard@$n 'bash -s' < update_k8s.sh --role worker --release 1.34.5
# kb uncordon $n
# done
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# Filled in by the argument parser below; both are required.
ROLE=""
RELEASE=""
# usage — print the command-line help to stdout. The heredoc delimiter is
# unquoted, so $0 expands to the script name, while \$MINOR is escaped and is
# printed literally.
usage() {
cat <<EOF
Usage: $0 --role <master|worker> --release <X.Y.Z>
--role master|worker (required)
--release kubeadm/kubelet/kubectl target patch version, e.g. 1.34.5
Behavior:
- Rewrites /etc/apt/sources.list.d/kubernetes.list to the v\$MINOR/deb repo
derived from --release (so a 1.34.x release uses v1.34/deb, 1.35.x uses
v1.35/deb, etc).
- apt-get install kubeadm=<release>-* (apt-mark unhold first).
- master: kubeadm upgrade plan && kubeadm upgrade apply v<release> -y
- worker: kubeadm upgrade node
- apt-get install kubelet=<release>-* kubectl=<release>-* then re-hold.
- systemctl daemon-reload && systemctl restart kubelet
EOF
}
# Parse --role / --release. A flag given without its value is reported
# explicitly; otherwise `set -u` would kill the script with an opaque
# "unbound variable" error on "$2".
while [[ $# -gt 0 ]]; do
  case "$1" in
    --role | --release)
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value" >&2
        usage
        exit 2
      fi
      if [[ "$1" == "--role" ]]; then
        ROLE="$2"
      else
        RELEASE="$2"
      fi
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 2
      ;;
  esac
done

if [[ -z "$ROLE" || -z "$RELEASE" ]]; then
  echo "ERROR: --role and --release are required" >&2
  usage
  exit 2
fi
if [[ "$ROLE" != "master" && "$ROLE" != "worker" ]]; then
  echo "ERROR: --role must be 'master' or 'worker' (got: $ROLE)" >&2
  exit 2
fi
# RELEASE is spliced into the apt repo URL, the package version pins and the
# kubeadm target, so reject anything that is not a plain X.Y.Z (for example a
# leading "v", which would yield a "vv1.34" repo track).
if ! [[ "$RELEASE" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "ERROR: --release must look like X.Y.Z, e.g. 1.34.5 (got: $RELEASE)" >&2
  exit 2
fi
# Derive minor track (e.g. 1.34.5 → 1.34): first two dot-separated fields.
IFS=. read -r rel_major rel_minor _ <<<"$RELEASE"
STABLE_VERSION="${rel_major}.${rel_minor}"
echo "==> Upgrading $(hostname) ($ROLE) to v$RELEASE (track v$STABLE_VERSION)"

# The apt repo URL is pinned per minor track. The source list and the signing
# key are rewritten on every run: it is cheap and idempotent, and it covers a
# minor bump where the old track's repo no longer carries the target version.
keyring="/etc/apt/keyrings/kubernetes-apt-keyring.gpg"
repo_url="https://pkgs.k8s.io/core:/stable:/v$STABLE_VERSION/deb"
printf '%s\n' "deb [signed-by=$keyring] $repo_url/ /" \
  | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo mkdir -p /etc/apt/keyrings
curl -fsSL "$repo_url/Release.key" \
  | sudo gpg --dearmor -o "$keyring" --batch --yes

# Lift the holds, refresh the index, and install the target kubeadm first.
sudo apt-mark unhold kubeadm kubelet kubectl
sudo apt-get update
sudo apt-get install -y "kubeadm=$RELEASE-*"
case "$ROLE" in
  master)
    echo "==> Master path: kubeadm upgrade plan + apply"
    sudo kubeadm upgrade plan
    # Retry policy for `kubeadm upgrade apply` (at most 3 attempts, 30s apart):
    #   attempt 1   — full upgrade, etcd included.
    #   attempts 2+ — same command plus --etcd-upgrade=false.
    # Rationale carried over from the original notes: the first apply can fail
    # with "static Pod hash for component <X> did not change after 5m0s".
    # On patch upgrades the etcd image does not change, so kubeadm writes an
    # identical manifest and waits for a hash change that never comes.
    # Skipping the etcd phase on the retry is safe only if etcd is already on
    # the right version, which is the case in which this timeout fires.
    attempt=1
    extra_flags=""
    # shellcheck disable=SC2086 — extra_flags is intentionally word-split.
    until sudo kubeadm upgrade apply "v$RELEASE" -y $extra_flags; do
      if [[ "$attempt" -ge 3 ]]; then
        echo "ERROR: kubeadm upgrade apply failed after 3 attempts" >&2
        exit 1
      fi
      echo "==> kubeadm apply attempt $attempt failed. Retrying with --etcd-upgrade=false (etcd image is unchanged for patch upgrades; kubeadm's static-pod-hash watch is the only thing failing)."
      extra_flags="--etcd-upgrade=false"
      sleep 30
      attempt=$((attempt + 1))
    done
    echo "==> kubeadm upgrade apply succeeded on attempt $attempt (flags: '$extra_flags')"
    ;;
  *)
    echo "==> Worker path: kubeadm upgrade node"
    sudo kubeadm upgrade node
    ;;
esac

# Bring kubelet/kubectl to the same release, re-pin all three packages, and
# restart the kubelet so it runs the new binary.
sudo apt-get install -y "kubelet=$RELEASE-*" "kubectl=$RELEASE-*"
sudo apt-mark hold kubeadm kubelet kubectl
sudo systemctl daemon-reload
sudo systemctl restart kubelet
echo "==> Done: $(hostname) is on v$RELEASE"

14
scripts/update_node.sh Normal file
View file

@ -0,0 +1,14 @@
#!/usr/bin/env bash
#
# OS-major upgrade (Ubuntu do-release-upgrade). NOT in the auto-upgrade
# pipeline — minor apt patches are handled by unattended-upgrades + kured;
# K8s component bumps are handled by the k8s-version-upgrade agent. Run this
# script manually when bumping Ubuntu LTS major versions.
#
# See:
# - infra/docs/runbooks/k8s-node-auto-upgrades.md (apt + reboot)
# - infra/docs/runbooks/k8s-version-upgrade.md (kubeadm/kubelet/kubectl)
# sudo apt update && sudo apt autoremove -y && sudo apt upgrade -y
# Step 1: release upgrade. The script does not enable `set -e`, so the apt
# chain below still runs when this command exits non-zero.
sudo do-release-upgrade
# Step 2: refresh the package index, drop orphaned packages, then upgrade;
# each stage runs only if the previous one succeeded (&& chain).
sudo apt update && sudo apt autoremove -y && sudo apt upgrade -y

619
scripts/upgrade_state.sh Executable file
View file

@ -0,0 +1,619 @@
#!/usr/bin/env bash
#
# upgrade_state.sh — survey the three autonomous-upgrade pipelines.
#
# Companion to cluster_healthcheck.sh, surfaced via the /upgrade-state skill.
# Read-only by design — no --fix.
#
# The three pipelines:
# 1. Apps — Keel polls registries hourly and rolls Deployments tagged
# keel.sh/policy. Metrics on container :9300/metrics.
# 2. OS — unattended-upgrades patches in-release per node; kured
# reboots within a daily 02:00-06:00 London window.
# 3. K8s — k8s-version-check CronJob (Sun 12:00 UTC) detects new
# kubeadm patch/minor releases; Job-chain drains+upgrades
# node-by-node. Pushgateway holds k8s_upgrade_* gauges.
#
# Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail
# --- Colors ---
# ANSI escape sequences stored with literal backslashes; they are interpreted
# when printed through `echo -e` (see log()).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'
# --- Globals ---
# JSON: true switches log() to silent mode (set by --json).
JSON=false
# Kubeconfig resolution order: $KUBECONFIG, then ~/.kube/config; if that file
# does not exist, fall back to the repo-local config path.
KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="/home/wizard/code/infra/config"
# KUBECTL is assembled in parse_args once the final kubeconfig path is known.
KUBECTL=""
# name:ip pairs; the collectors split each entry on ':'.
NODES=(k8s-master:10.0.20.100 k8s-node1:10.0.20.101 k8s-node2:10.0.20.102 k8s-node3:10.0.20.103 k8s-node4:10.0.20.104)
# Non-interactive SSH with a 5s connect timeout and no host-key prompt.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no)
# Single timestamp for the whole run so every reported age uses the same "now".
NOW_EPOCH=$(date -u +%s)
HIGHEST_EXIT=0 # 0 healthy, 1 attention, 2 stalled
# Results — collectors fill these.
APPS_STATUS_ICON=""; APPS_STATUS_TEXT=""
APPS_LAST_CHECK=""; APPS_NEXT=""; APPS_NOTES=""
APPS_ENROLLED=0; APPS_PENDING=0; APPS_UPDATES_LINE=""; APPS_ERROR_LINE=""
OS_STATUS_ICON=""; OS_STATUS_TEXT=""
OS_LAST_CHECK=""; OS_NEXT=""; OS_NOTES=""
OS_DISTRO_SUMMARY=""; OS_KERNEL_SUMMARY=""
OS_PENDING_REBOOT_NODES=""; OS_HELD_DETAIL=""
OS_LAST_UU=""; OS_LAST_KURED=""
K8S_STATUS_ICON=""; K8S_STATUS_TEXT=""
K8S_LAST_CHECK=""; K8S_NEXT=""; K8S_NOTES=""
K8S_RUNNING=""; K8S_PATCH=""; K8S_MINOR=""
K8S_LAST_DETECT_LINE=""; K8S_IN_FLIGHT="no"; K8S_LAST_CHAIN=""
# --- Helpers ---
# log <text...>
# Prints the arguments (joined by spaces, backslash escapes interpreted) to
# stdout. Prints nothing when JSON mode is on.
log() {
  if [[ "$JSON" == true ]]; then
    return 0
  fi
  echo -e "$*"
}
# raise_exit <level>
# Raises the global HIGHEST_EXIT to <level> if <level> is greater; never
# lowers it. Always returns 0 so it is safe to call under `set -e`.
raise_exit() {
  local level="$1"
  if ! [[ "$level" -gt "$HIGHEST_EXIT" ]]; then
    return 0
  fi
  HIGHEST_EXIT="$level"
  return 0
}
# usage — print the command-line help to stdout. The heredoc delimiter is
# unquoted, so $0 expands to the script name.
usage() {
cat <<EOF
Usage: $0 [--json] [--kubeconfig <path>]
Read-only audit of the three autonomous-upgrade pipelines (apps, OS, k8s).
--json machine-readable JSON
--kubeconfig PATH override kubeconfig
Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
EOF
}
# parse_args <args...>
# Handles --json, --kubeconfig <path> and -h/--help, then builds the global
# KUBECTL command string from the final KUBECONFIG_PATH.
# Exits 1 on an unknown option or when --kubeconfig has no value, and exits 0
# after printing help.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --json)
        JSON=true
        shift
        ;;
      --kubeconfig)
        # Without this check a trailing "--kubeconfig" dies on "$2" with an
        # "unbound variable" error (set -u) instead of a usable message.
        if [[ $# -lt 2 || -z "${2:-}" ]]; then
          echo "Option --kubeconfig requires a path" >&2
          exit 1
        fi
        KUBECONFIG_PATH="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  # KUBECTL is later expanded unquoted as a command prefix, so the path must
  # not contain whitespace.
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}
# Prometheus query — Prometheus + reload + backup share a network namespace,
# so reaching localhost:9090 works from any of the three sidecars.
#
# prom_q <promql>
# Runs an instant query through wget inside the prometheus-server container
# and prints the raw JSON response. Best-effort: errors are swallowed and the
# function returns 0 with whatever output (possibly none) was produced.
#
# The query is percent-encoded before it is placed in the URL. PromQL routinely
# contains spaces, braces, quotes and '=' which are not valid in a raw query
# string, and the in-container wget cannot be relied on to escape them.
prom_q() {
  local q="$1" encoded="" ch i
  # Byte-wise processing so multi-byte characters are encoded per byte.
  local LC_ALL=C
  for ((i = 0; i < ${#q}; i++)); do
    ch="${q:i:1}"
    case "$ch" in
      [A-Za-z0-9._~-]) encoded+="$ch" ;;
      *)
        printf -v ch '%%%02X' "'$ch"
        encoded+="$ch"
        ;;
    esac
  done
  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "http://localhost:9090/api/v1/query?query=${encoded}" 2>/dev/null || true
}
# pg_metrics
# Prints the Pushgateway /metrics page, fetched with wget from inside the
# prometheus-server container. Best-effort: stderr is discarded and the
# function returns 0 even when the fetch fails.
pg_metrics() {
  local url="http://prometheus-prometheus-pushgateway:9091/metrics"
  if ! $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- wget -qO- "$url" 2>/dev/null; then
    return 0
  fi
}
# ssh_node <ip> <remote command...>
# Runs the remote command as user "wizard" on <ip> using SSH_OPTS and prints
# its stdout. Best-effort: local stderr is discarded and the function returns
# 0 even when ssh or the remote command fails.
ssh_node() {
  local target="wizard@$1"
  shift
  if ! ssh "${SSH_OPTS[@]}" "$target" "$@" 2>/dev/null; then
    return 0
  fi
}
# human_age <seconds>
# Prints a coarse age string without a trailing newline: "<n>s ago" below one
# minute, then whole minutes, hours or days (integer division, rounded down).
human_age() {
  local secs="$1"
  local divisor unit
  if [[ "$secs" -lt 60 ]]; then
    printf '%ds ago' "$secs"
    return
  fi
  if [[ "$secs" -ge 86400 ]]; then
    divisor=86400 unit=d
  elif [[ "$secs" -ge 3600 ]]; then
    divisor=3600 unit=h
  else
    divisor=60 unit=m
  fi
  printf '%d%s ago' $((secs / divisor)) "$unit"
}
# Pushgateway emits floats and scientific notation — coerce to integer
# epoch seconds.
#
# to_epoch_int <value>
# Prints int(float(<value>)), or 0 when the value is empty, exactly "0", or
# cannot be converted by python3.
to_epoch_int() {
  local raw="${1:-}"
  case "$raw" in
    '' | 0)
      echo 0
      return
      ;;
  esac
  if ! python3 -c "import sys; v=sys.argv[1]; print(int(float(v)))" "$raw" 2>/dev/null; then
    echo 0
  fi
}
# --- 1. Apps (Keel) ---
# collect_apps
# Fills the APPS_* result globals: enrolled Deployment count, pending
# approvals, scrape age, 24h update count, recent non-benign errors, and the
# overall status (down / attn / healthy). Calls raise_exit 2 when no Keel pod
# is Running and raise_exit 1 when approvals are pending or errors were seen.
collect_apps() {
  local pending tracked enrolled updates_24h errors all_errors error_count

  # Enrolled: count Deployments with keel.sh/policy != never (Keel itself
  # is policy=never). The Kyverno auto-injection labels namespaces
  # keel.sh/enrolled=true, but the annotation is what Keel watches.
  enrolled=$($KUBECTL get deploy -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
n = sum(1 for d in data["items"]
        if (d["metadata"].get("annotations") or {}).get("keel.sh/policy", "never") != "never")
print(n)
' 2>/dev/null || echo 0)
  APPS_ENROLLED="$enrolled"

  # Pending approvals (sum across Keel pods).
  pending=$(prom_q 'sum(pending_approvals)' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(int(float(r[0]["value"][1])) if r else 0)
except Exception:
    print(0)
' 2>/dev/null || echo 0)
  APPS_PENDING="$pending"

  # Tracked images — proxy for "is the scrape live?".
  tracked=$(prom_q 'count(count by (image) (registries_scanned_total))' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(int(float(r[0]["value"][1])) if r else 0)
except Exception:
    print(0)
' 2>/dev/null || echo 0)

  # Last scrape age — `up{job="kubernetes-pods", app="keel"}` is 1 if the
  # most recent scrape succeeded. We surface the wallclock age via a tiny
  # `time() - timestamp(up{...})` query.
  APPS_LAST_CHECK=$(prom_q 'time()-timestamp(up{job="kubernetes-pods",app="keel"})' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    if not r:
        print("scrape not live")
    else:
        secs = int(float(r[0]["value"][1]))
        if secs < 60:
            print(f"{secs}s ago")
        elif secs < 3600:
            print(f"{secs//60}m ago")
        else:
            print(f"{secs//3600}h ago")
except Exception:
    print("?")
' 2>/dev/null || echo "?")

  # Recent updates: count lines in Keel logs that report a successful
  # rollout. Keel logs an "update completed" message per rollout.
  local log_24h
  log_24h=$($KUBECTL -n keel logs deploy/keel --since=24h --tail=2000 2>/dev/null || true)
  updates_24h=$(echo "$log_24h" | grep -cE 'update completed|successfully updated|deployment updated' 2>/dev/null || true)
  [[ -z "$updates_24h" ]] && updates_24h=0
  APPS_UPDATES_LINE="$updates_24h in last 24h (tracked images: $tracked)"

  # Known-benign Keel error patterns to suppress. Each is a real error
  # line Keel emits, but the surrounding behaviour is fine, so flagging
  # them in /upgrade-state is just noise.
  # - `bot.Run(): can not get configuration for bot [slack]` — Keel
  #   1.2.0 registers a Slack socket-mode bot whenever SLACK_BOT_TOKEN
  #   is set, then fails because we don't supply an `xapp-` app-level
  #   token. We don't want the interactive bot (no approvals; opt-out
  #   auto-update). The Slack NOTIFICATION sender works independently
  #   of the bot, so rollout messages still post to #general.
  # - `failed to check digest` with a transient network error —
  #   Keel polls ~175 image manifests against public registries
  #   hourly. Occasional `i/o timeout` / `connection refused` /
  #   `TLS handshake timeout` / `no such host` / `EOF` /
  #   `context deadline exceeded` are inherent to public-internet
  #   polling at that scale and auto-recover on the next poll.
  #   Actionable digest-check failures surface as HTTP 401/404
  #   (auth, removed-tag) — those are NOT filtered.
  # - `failed to check digest` with HTTP 5xx — upstream registry
  #   having a problem (DockerHub maintenance, Forgejo restart,
  #   etc.). Same recovery pattern as network errors: next hourly
  #   poll succeeds once upstream is back. Persistent 5xx for >24h
  #   would indicate a real registry-side issue, but that surfaces
  #   via the registry's own monitoring (e.g. forgejo-integrity-probe
  #   + RegistryCatalogInaccessible), not via Keel logs.
  local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]'
  benign_re+='|SLACK_APP_TOKEN must have the (previf|prefix)'
  benign_re+='|failed to check digest.*(i/o timeout|connection refused|connection reset|context deadline exceeded|TLS handshake timeout|no such host|: EOF)'
  benign_re+='|failed to check digest.*non-successful response \(status=5[0-9][0-9]'

  # Count over every non-benign error line in the fetched window, and keep
  # only the newest three for display. The count is computed explicitly
  # because `echo "" | wc -l` yields 1, which used to report "1 recent
  # error(s)" when there were none, and counting after `tail -3` capped the
  # figure at 3.
  all_errors=$(echo "$log_24h" | grep -iE '"level":"(error|fatal)"|level=error' | grep -vE "$benign_re" || true)
  error_count=0
  if [[ -n "$all_errors" ]]; then
    error_count=$(grep -c '' <<<"$all_errors")
  fi
  errors=$(tail -3 <<<"$all_errors")
  if [[ "$error_count" -eq 0 ]]; then
    APPS_ERROR_LINE="(none in last 24h)"
  else
    APPS_ERROR_LINE="$error_count error(s); newest: $(tail -1 <<<"$errors" | cut -c1-120)"
  fi

  # Keel pod state.
  local pod_status
  pod_status=$($KUBECTL -n keel get pods -l app=keel -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)
  if [[ "$pod_status" != *"Running"* ]]; then
    APPS_STATUS_ICON="✗"; APPS_STATUS_TEXT="down"
    APPS_NOTES="Keel pod not Running ($pod_status)"
    raise_exit 2
  elif [[ "$pending" -gt 0 || "$error_count" -gt 0 ]]; then
    APPS_STATUS_ICON="⚠"; APPS_STATUS_TEXT="attn"
    APPS_NOTES="$enrolled enrolled; $pending pending; $error_count recent error(s)"
    raise_exit 1
  else
    APPS_STATUS_ICON="✓"; APPS_STATUS_TEXT="healthy"
    APPS_NOTES="$enrolled enrolled, 0 pending, 0 errors"
  fi
  APPS_NEXT="rolling, hourly poll"
}
# --- 2. OS (apt + kured) ---
# collect_os
# Fills the OS_* result globals: distro/kernel summaries from the node
# objects, per-node reboot-required and held-but-upgradable packages gathered
# over SSH, the newest unattended-upgrades log timestamp, the most recent node
# Ready transition, and the overall status. Calls raise_exit 2 when a kured
# pod is not Running and raise_exit 1 when held bumps or reboots are pending.
collect_os() {
  local distros kernels distro_uniq kernel_uniq
  distros=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.osImage}{"\n"}{end}' 2>/dev/null)
  kernels=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' 2>/dev/null)
  distro_uniq=$(echo "$distros" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  kernel_uniq=$(echo "$kernels" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  OS_DISTRO_SUMMARY="$distro_uniq"
  OS_KERNEL_SUMMARY="$kernel_uniq"

  # SSH fan-out — parallel background subshells, write per-node results to tmp files.
  local tmpdir; tmpdir=$(mktemp -d)
  # The trap removes itself after firing: a RETURN trap is shell-global, and
  # left in place it would later reference the then-unset local under set -u.
  trap 'rm -rf "$tmpdir"; trap - RETURN' RETURN
  local entry name ip
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"; ip="${entry##*:}"
    (
      local reboot held upgradable uu_log
      reboot=$(ssh_node "$ip" 'test -f /var/run/reboot-required && echo yes || echo no')
      held=$(ssh_node "$ip" 'apt-mark showhold 2>/dev/null')
      upgradable=$(ssh_node "$ip" 'apt list --upgradable 2>/dev/null | tail -n +2')
      uu_log=$(ssh_node "$ip" 'tail -1 /var/log/unattended-upgrades/unattended-upgrades.log 2>/dev/null')
      # Per-node result file: key=value lines plus <<<EOF ... EOF sections
      # for the multi-line values.
      printf 'reboot=%s\n' "$reboot" > "$tmpdir/$name"
      printf 'held<<<EOF\n%s\nEOF\n' "$held" >> "$tmpdir/$name"
      printf 'upgradable<<<EOF\n%s\nEOF\n' "$upgradable" >> "$tmpdir/$name"
      printf 'uu_log=%s\n' "$uu_log" >> "$tmpdir/$name"
    ) &
  done
  wait

  # Aggregate.
  local pending_reboots=() held_with_bumps_lines=() newest_uu_ts=0 newest_uu_iso=""
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"
    [[ -f "$tmpdir/$name" ]] || continue
    local reboot held upgradable uu_log uu_ts
    reboot=$(awk -F= '/^reboot=/{print $2}' "$tmpdir/$name")
    held=$(awk '/^held<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    upgradable=$(awk '/^upgradable<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    uu_log=$(awk -F= '/^uu_log=/{sub(/^uu_log=/,""); print}' "$tmpdir/$name")
    [[ "$reboot" == "yes" ]] && pending_reboots+=("$name")

    # Held + upgradable, excluding k8s components (managed by k8s pipeline).
    local pkg from to bump line
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      pkg=$(echo "$line" | awk -F/ '{print $1}')
      # Skip k8s and kernel/linux-image — the chain handles those.
      case "$pkg" in
        kubeadm|kubectl|kubelet) continue ;;
        linux-image-*|linux-headers-*|linux-modules-*|linux-generic|linux-headers-generic|linux-image-generic) continue ;;
      esac
      # Only flag if the package is held. -F: package names contain regex
      # metacharacters ('.', '+') and must match literally.
      if grep -Fqx -- "$pkg" <<<"$held"; then
        to=$(echo "$line" | awk '{print $2}')
        # Stop at ']' so the closing bracket of "[upgradable from: X]" is
        # never captured as part of the version.
        from=$(echo "$line" | sed -n 's/.*from: \([^] ]*\).*/\1/p')
        # Old and new version (Debian revision stripped), visibly separated.
        bump="$pkg ${from%-*} → ${to%-*}"
        held_with_bumps_lines+=("$name: $bump")
      fi
    done <<<"$upgradable"

    # Newest uu timestamp (ISO at start of log line).
    uu_ts=$(echo "$uu_log" | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}).*/\1/')
    if [[ -n "$uu_ts" ]]; then
      local epoch; epoch=$(date -u -d "$uu_ts" +%s 2>/dev/null || echo 0)
      if [[ "$epoch" -gt "$newest_uu_ts" ]]; then
        newest_uu_ts="$epoch"; newest_uu_iso="$uu_ts"
      fi
    fi
  done

  OS_PENDING_REBOOT_NODES="${pending_reboots[*]:-}"
  if [[ ${#held_with_bumps_lines[@]} -gt 0 ]]; then
    # Join with "; ". `paste -d '; '` would not do this: it treats the list
    # as single-character delimiters used in rotation (';', ' ', ';', ...).
    OS_HELD_DETAIL=$(printf '%s\n' "${held_with_bumps_lines[@]}" | sort -u \
      | awk 'NR > 1 { printf "; " } { printf "%s", $0 }')
  fi
  if [[ "$newest_uu_ts" -gt 0 ]]; then
    local age=$((NOW_EPOCH - newest_uu_ts))
    OS_LAST_UU="$newest_uu_iso UTC ($(human_age "$age"))"
    OS_LAST_CHECK="$(human_age "$age") (uu daily)"
  else
    OS_LAST_UU="(no uu log accessible)"
    OS_LAST_CHECK="?"
  fi

  # Last kured reboot — newest Ready transition across the nodes.
  # `Ready -> True` is what kured causes when the node returns; we surface
  # the most recent timestamp and the node it belongs to.
  # lastTransitionTime is UTC ("...Z"); it is parsed as timezone-aware UTC so
  # the epoch is correct whatever the local timezone of the machine is.
  local kured_raw kured_iso kured_node kured_ep kured_age
  kured_raw=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone
data = json.load(sys.stdin)
best = (0, "", "")
for n in data["items"]:
    name = n["metadata"]["name"]
    for c in n["status"].get("conditions", []):
        if c["type"] == "Ready":
            dt = datetime.strptime(c["lastTransitionTime"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
            ep = int(dt.timestamp())
            if ep > best[0]:
                best = (ep, name, c["lastTransitionTime"])
print(f"{best[0]}|{best[1]}|{best[2]}")
' 2>/dev/null || echo "0||")
  kured_ep="${kured_raw%%|*}"
  kured_node=$(echo "$kured_raw" | cut -d'|' -f2)
  kured_iso=$(echo "$kured_raw" | cut -d'|' -f3)
  if [[ "$kured_ep" -gt 0 ]]; then
    kured_age=$((NOW_EPOCH - kured_ep))
    OS_LAST_KURED="$kured_iso ($kured_node, $(human_age "$kured_age"))"
  else
    OS_LAST_KURED="?"
  fi
  OS_NEXT="daily 02:00-06:00 London"

  # Kured pod health: count phase lines that are not exactly "Running".
  local kured_pods kured_unhealthy
  kured_pods=$($KUBECTL -n kured get pods -l app.kubernetes.io/name=kured -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null)
  kured_unhealthy=$(echo "$kured_pods" | grep -cv '^Running$' 2>/dev/null || true)

  local notes=()
  [[ -n "$OS_HELD_DETAIL" ]] && notes+=("held with bumps: $OS_HELD_DETAIL")
  [[ -n "$OS_PENDING_REBOOT_NODES" ]] && notes+=("pending reboot: $OS_PENDING_REBOOT_NODES")
  if [[ "$kured_unhealthy" -gt 0 ]]; then
    OS_STATUS_ICON="✗"; OS_STATUS_TEXT="kured down"
    OS_NOTES="kured pods not all Running"
    raise_exit 2
  elif [[ ${#notes[@]} -gt 0 ]]; then
    OS_STATUS_ICON="⚠"; OS_STATUS_TEXT="attn"
    OS_NOTES="${notes[*]}"
    raise_exit 1
  else
    OS_STATUS_ICON="✓"; OS_STATUS_TEXT="healthy"
    OS_NOTES="distros uniform; no held bumps; no pending reboots"
  fi
}
# --- 3. K8s (kubeadm/kubelet/kubectl) ---
collect_k8s() {
  # Gather Kubernetes version and upgrade-chain state into the K8S_* globals.
  #
  # Reads:  KUBECTL (kubectl command line), NOW_EPOCH (current epoch seconds)
  # Calls:  pg_metrics, to_epoch_int, human_age, next_daily_noon_utc, raise_exit
  # Writes: K8S_RUNNING, K8S_LAST_CHECK, K8S_LAST_DETECT_LINE, K8S_PATCH,
  #         K8S_MINOR, K8S_IN_FLIGHT, K8S_LAST_CHAIN, K8S_NEXT,
  #         K8S_STATUS_ICON, K8S_STATUS_TEXT, K8S_NOTES
  local kver_list kver_uniq metrics target_patch target_minor last_run in_flight started
  kver_list=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' 2>/dev/null)
  kver_uniq=$(echo "$kver_list" | sort -u)

  # `echo "" | wc -l` is 1, so an empty node list must be detected explicitly
  # or it would be reported as " across 1/1 nodes".
  local n_nodes=0 n_uniq=0
  if [[ -n "$kver_list" ]]; then
    n_nodes=$(echo "$kver_list" | wc -l | tr -d ' ')
    n_uniq=$(echo "$kver_uniq" | wc -l | tr -d ' ')
  fi
  if [[ "$n_nodes" -eq 0 ]]; then
    K8S_RUNNING="unknown (kubectl returned no nodes)"
  elif [[ "$n_uniq" -eq 1 ]]; then
    K8S_RUNNING="$kver_uniq across $n_nodes/$n_nodes nodes"
  else
    # paste -d takes a LIST of delimiters that it cycles through, so
    # `-d', '` would alternate "," and " ". Join on "," and widen afterwards.
    K8S_RUNNING="mixed: $(echo "$kver_uniq" | paste -sd, - | sed 's/,/, /g')"
  fi
  local running_ver; running_ver=$(echo "$kver_uniq" | head -1)
  [[ -n "$running_ver" ]] || running_ver="unknown"

  metrics=$(pg_metrics)
  # All five may legitimately be absent (cluster never ran the upgrade
  # chain, kind="minor" not detected, etc.) — `|| true` keeps pipefail
  # from killing the script on no-match.
  target_patch=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="patch"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  target_minor=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="minor"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  # Pushgateway emits these with `{instance="",job="..."}` labels — the
  # `awk '$1 ~ /^name(\{|$)/'` form matches both bare and labelled metrics.
  last_run=$(echo "$metrics" | awk '$1 ~ /^k8s_version_check_last_run_timestamp(\{|$)/{print $2}' | head -1 || true)
  in_flight=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_in_flight(\{|$)/{print $2}' | head -1 || true)
  started=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_started_timestamp(\{|$)/{print $2}' | head -1 || true)
  # Pushgateway timestamps come back in scientific notation
  # (e.g. 1.779052159e+09) — convert to plain integer seconds.
  local last_run_int started_int
  last_run_int=$(to_epoch_int "$last_run")
  started_int=$(to_epoch_int "$started")

  if [[ "$last_run_int" -gt 0 ]]; then
    local age=$((NOW_EPOCH - last_run_int))
    K8S_LAST_CHECK="$(human_age "$age") (daily cron)"
    if [[ -n "$target_patch" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_patch (patch)"
    elif [[ -n "$target_minor" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_minor (minor)"
    else
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): no upgrade available"
    fi
  else
    K8S_LAST_CHECK="(metric missing)"
    K8S_LAST_DETECT_LINE="(no k8s_version_check_last_run_timestamp in Pushgateway)"
  fi
  K8S_PATCH="${target_patch:-none}"
  K8S_MINOR="${target_minor:-none}"

  # In-flight / last chain.
  if [[ "${in_flight:-0}" == "1" ]]; then
    K8S_IN_FLIGHT="yes"
    local since=0
    [[ "$started_int" -gt 0 ]] && since=$((NOW_EPOCH - started_int))
    K8S_LAST_CHAIN="in-flight (started $(human_age "$since"))"
  else
    K8S_IN_FLIGHT="no"
    if [[ "$started_int" -gt 0 ]]; then
      local age=$((NOW_EPOCH - started_int))
      K8S_LAST_CHAIN="$(human_age "$age")"
    else
      K8S_LAST_CHAIN="never (or zeroed)"
    fi
  fi
  K8S_NEXT="$(next_daily_noon_utc)"

  # Status logic.
  local stalled=0
  if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then
    # K8sUpgradeStalled fires after 5400s (90m) per monitoring stack.
    local since=$((NOW_EPOCH - started_int))
    [[ "$since" -gt 5400 ]] && stalled=1
  fi
  # A missing detection timestamp counts as "infinitely old" so it trips the
  # staleness branch below.
  local last_run_age=999999999
  [[ "$last_run_int" -gt 0 ]] && last_run_age=$((NOW_EPOCH - last_run_int))

  if [[ "$stalled" == "1" ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="stalled"
    K8S_NOTES="K8sUpgradeStalled would fire — chain in-flight >90m"
    raise_exit 2
  elif [[ "$last_run_age" -gt $((9*86400)) ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale"
    K8S_NOTES="last detection >9d ago"
    raise_exit 2
  elif [[ "${in_flight:-0}" == "1" ]]; then
    K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight"
    K8S_NOTES="upgrade chain running"
    raise_exit 1
  elif [[ -n "$target_patch" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_patch"
    K8S_NOTES="running $running_ver → v$target_patch (patch) available"
    raise_exit 1
  elif [[ -n "$target_minor" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_minor"
    K8S_NOTES="running $running_ver → v$target_minor (minor) available"
    raise_exit 1
  else
    K8S_STATUS_ICON="✓"; K8S_STATUS_TEXT="current"
    K8S_NOTES="running $running_ver, nothing newer"
  fi
}
# Next daily 12:00 UTC — pure bash date math, no croniter. Schedule was
# weekly Sunday until 2026-05-18; now `0 12 * * *` in the
# k8s-version-upgrade stack. If we're still before today's 12:00 UTC,
# the next run is today; otherwise it's tomorrow.
#
# Outputs: one line on stdout, formatted "%a %Y-%m-%d 12:00 UTC".
next_daily_noon_utc() {
  local hr days_ahead
  hr=$(date -u +%H)
  # %H is zero-padded. Bash arithmetic treats a leading 0 as octal, so "08"
  # and "09" are invalid numbers unless the base is forced with 10#.
  if (( 10#$hr < 12 )); then days_ahead=0; else days_ahead=1; fi
  date -u -d "+$days_ahead days" +"%a %Y-%m-%d 12:00 UTC"
}
# --- Renderers ---
# The table uses `column -t` so we don't have to compute visual widths
# manually (the status icons are multi-byte UTF-8 and ANSI escapes don't
# play nice with `printf %-Xs`). Trade-off: no in-cell colour, but the
# icon character already carries the signal.
render_table() {
echo
printf "${BOLD}Upgrade state — %s${NC}\n" "$(date -u +'%Y-%m-%d %H:%M UTC')"
echo
{
echo "Layer|Status|Last check|Next upgrade|Notes"
echo "-----|------|----------|------------|-----"
printf 'Apps|%s %s|%s|%s|%s\n' "$APPS_STATUS_ICON" "$APPS_STATUS_TEXT" "$APPS_LAST_CHECK" "$APPS_NEXT" "$APPS_NOTES"
printf 'OS |%s %s|%s|%s|%s\n' "$OS_STATUS_ICON" "$OS_STATUS_TEXT" "$OS_LAST_CHECK" "$OS_NEXT" "$OS_NOTES"
printf 'K8s |%s %s|%s|%s|%s\n' "$K8S_STATUS_ICON" "$K8S_STATUS_TEXT" "$K8S_LAST_CHECK" "$K8S_NEXT" "$K8S_NOTES"
} | column -t -s '|' -o ' | '
echo
printf "${BOLD}--- Apps (Keel) ---${NC}\n"
echo "Enrolled deployments: $APPS_ENROLLED"
echo "Recent rollouts: $APPS_UPDATES_LINE"
echo "Pending approvals: $APPS_PENDING"
echo "Last Keel error: $APPS_ERROR_LINE"
echo
printf "${BOLD}--- OS (apt + kured) ---${NC}\n"
echo "Ubuntu per node: $OS_DISTRO_SUMMARY"
echo "Kernel per node: $OS_KERNEL_SUMMARY"
echo "Pending reboot: ${OS_PENDING_REBOOT_NODES:-none}"
echo "Held packages with upstream bumps: ${OS_HELD_DETAIL:-none (excluding k8s components)}"
echo "Last uu run (newest across nodes): $OS_LAST_UU"
echo "Last kured reboot (newest Ready transition): $OS_LAST_KURED"
echo "Next kured window: $OS_NEXT"
echo
printf "${BOLD}--- K8s (kubeadm/kubelet/kubectl) ---${NC}\n"
echo "Running: $K8S_RUNNING"
echo "Latest patch (apt): ${K8S_PATCH}"
echo "Next minor available: ${K8S_MINOR}"
echo "Detection: $K8S_LAST_DETECT_LINE"
echo "In-flight: $K8S_IN_FLIGHT | Last chain start: $K8S_LAST_CHAIN"
echo "Next detection: $K8S_NEXT"
echo
}
render_json() {
  # Print the report as a JSON document on stdout.
  #
  # Every value reaches the embedded Python through the environment (the
  # per-command VAR=... prefixes below) instead of being spliced into the
  # program text, so quotes or backslashes inside error lines cannot break it.
  APPS_STATUS_ICON="$APPS_STATUS_ICON" \
  APPS_STATUS_TEXT="$APPS_STATUS_TEXT" \
  APPS_LAST_CHECK="$APPS_LAST_CHECK" \
  APPS_NEXT="$APPS_NEXT" \
  APPS_NOTES="$APPS_NOTES" \
  APPS_ENROLLED="$APPS_ENROLLED" \
  APPS_PENDING="$APPS_PENDING" \
  APPS_UPDATES_LINE="$APPS_UPDATES_LINE" \
  APPS_ERROR_LINE="$APPS_ERROR_LINE" \
  OS_STATUS_ICON="$OS_STATUS_ICON" \
  OS_STATUS_TEXT="$OS_STATUS_TEXT" \
  OS_LAST_CHECK="$OS_LAST_CHECK" \
  OS_NEXT="$OS_NEXT" \
  OS_NOTES="$OS_NOTES" \
  OS_DISTRO_SUMMARY="$OS_DISTRO_SUMMARY" \
  OS_KERNEL_SUMMARY="$OS_KERNEL_SUMMARY" \
  OS_PENDING_REBOOT_NODES="$OS_PENDING_REBOOT_NODES" \
  OS_HELD_DETAIL="$OS_HELD_DETAIL" \
  OS_LAST_UU="$OS_LAST_UU" \
  OS_LAST_KURED="$OS_LAST_KURED" \
  K8S_STATUS_ICON="$K8S_STATUS_ICON" \
  K8S_STATUS_TEXT="$K8S_STATUS_TEXT" \
  K8S_LAST_CHECK="$K8S_LAST_CHECK" \
  K8S_NEXT="$K8S_NEXT" \
  K8S_NOTES="$K8S_NOTES" \
  K8S_RUNNING="$K8S_RUNNING" \
  K8S_PATCH="$K8S_PATCH" \
  K8S_MINOR="$K8S_MINOR" \
  K8S_LAST_DETECT_LINE="$K8S_LAST_DETECT_LINE" \
  K8S_IN_FLIGHT="$K8S_IN_FLIGHT" \
  K8S_LAST_CHAIN="$K8S_LAST_CHAIN" \
  HIGHEST_EXIT="$HIGHEST_EXIT" \
  python3 -c '
import json
import os
from datetime import datetime, timezone


def text(name):
    # Environment value as a string; a missing variable reads as "".
    return os.environ.get(name, "")


def count(name):
    # Integer counter; an empty value counts as 0.
    return int(text(name) or 0)


def layer(prefix, details):
    # The five summary fields every layer shares, followed by its own details.
    summary = {
        "status": text(prefix + "_STATUS_ICON"),
        "status_text": text(prefix + "_STATUS_TEXT"),
        "last_check": text(prefix + "_LAST_CHECK"),
        "next_upgrade": text(prefix + "_NEXT"),
        "notes": text(prefix + "_NOTES"),
    }
    summary.update(details)
    return summary


report = {
    "as_of_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "highest_exit": int(text("HIGHEST_EXIT")),
    "apps": layer("APPS", {
        "enrolled": count("APPS_ENROLLED"),
        "pending_approvals": count("APPS_PENDING"),
        "updates_line": text("APPS_UPDATES_LINE"),
        "errors_line": text("APPS_ERROR_LINE"),
    }),
    "os": layer("OS", {
        "distros": text("OS_DISTRO_SUMMARY"),
        "kernels": text("OS_KERNEL_SUMMARY"),
        "pending_reboot_nodes": text("OS_PENDING_REBOOT_NODES"),
        "held_with_bumps": text("OS_HELD_DETAIL"),
        "last_uu_run": text("OS_LAST_UU"),
        "last_kured_reboot": text("OS_LAST_KURED"),
    }),
    "k8s": layer("K8S", {
        "running": text("K8S_RUNNING"),
        "patch_target": text("K8S_PATCH"),
        "minor_target": text("K8S_MINOR"),
        "last_detection_line": text("K8S_LAST_DETECT_LINE"),
        "in_flight": text("K8S_IN_FLIGHT"),
        "last_chain": text("K8S_LAST_CHAIN"),
    }),
}
print(json.dumps(report, indent=2))
'
}
main() {
  # Parse the CLI arguments, run the three collectors in order, render the
  # report in the requested format, then exit with $HIGHEST_EXIT.
  parse_args "$@"

  local collector
  for collector in collect_apps collect_os collect_k8s; do
    "$collector"
  done

  case "$JSON" in
    true) render_json ;;
    *) render_table ;;
  esac

  exit "$HIGHEST_EXIT"
}
# Script entry point: hand the command-line arguments to main.
main "$@"

10
scripts/vault-kubeconfig Executable file
View file

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Generate a short-lived kubeconfig from Vault K8s secrets engine.
# Requires: vault login -method=oidc (or VAULT_TOKEN set)
set -euo pipefail
# `jq -e` exits non-zero when the selected field is null/missing. Without it,
# jq would print the literal string "null" and that would be stored as the
# credential. The explicit check also turns a Vault failure into a clear error.
if ! TOKEN=$(vault write -format=json kubernetes/creds/local-admin kubernetes_namespace=default \
  | jq -er '.data.service_account_token') || [[ -z "$TOKEN" ]]; then
  echo "ERROR: Vault did not return a service account token" >&2
  exit 1
fi
kubectl config set-credentials vault-admin --token="$TOKEN"
kubectl config set-context vault --cluster=kubernetes --user=vault-admin
kubectl config use-context vault
echo "Kubeconfig set with 1h token"

View file

@ -0,0 +1,9 @@
[Unit]
Description=Renew the periodic Vault/OpenBao token in ~/.vault-token
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/vault-token-renew.sh
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
ExecStart=%h/.local/bin/vault-token-renew

View file

@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Renew the long-lived PERIODIC Vault/OpenBao token stored in ~/.vault-token.
#
# Background: wizard@devvm used to hold a 7-day OIDC login token (re-auth weekly
# via `vault login -method=oidc`). On 2026-06-05 that was replaced with a
# periodic, orphan token so it never expires. Periodic tokens have no max-TTL;
# they only need renewing within each `period` (768h / 32d here). This unit
# renews daily, so the token stays alive indefinitely with huge margin. If the
# box is ever decommissioned and this stops running, the token self-expires
# within ~32 days (unlike a root token, which would live forever).
#
# Token was minted with (vault-admin = path "*" sudo; sops-admin = transit for SOPS):
# vault token create -orphan -period=768h \
# -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard
# To recreate if ever lost: `vault login -method=oidc`, run the above with
# `-field=token > ~/.vault-token`, then `chmod 600 ~/.vault-token`.
#
# Source of truth: infra/scripts/vault-token-renew.sh (deployed to
# ~/.local/bin/vault-token-renew). Driven by the systemd USER units
# vault-token-renew.{service,timer}. Deploy + recovery runbook:
# infra/docs/runbooks/vault-token-renew-devvm.md
# Display name the drift guard requires on the token in ~/.vault-token.
# NOTE(review): the mint command above passes -display-name=devvm-wizard, so
# Vault presumably adds the "token-" prefix itself — confirm against a lookup.
EXPECTED_DN="token-devvm-wizard"
# Policy that must appear among the token's policies for vtr_drift_ok to pass.
REQUIRED_POLICY="vault-admin"
# vtr_display_name <lookup-json>
# Prints .data.display_name from a `vault token lookup` JSON document, or an
# empty line when the field is absent.
vtr_display_name() {
  local lookup_json=$1
  jq -r '.data.display_name // ""' <<<"$lookup_json"
}
# vtr_policies_csv <lookup-json>
# Prints the token policies followed by the identity policies as one
# comma-joined list. Both lists are needed: a token minted via OIDC carries
# vault-admin only in identity_policies while .data.policies shows just
# [default] (misleading on its own — see memory id=4211), whereas our periodic
# token carries them as token policies.
vtr_policies_csv() {
  local lookup_json=$1
  jq -r '((.data.policies // []) + (.data.identity_policies // [])) | join(",")' <<<"$lookup_json"
}
# vtr_drift_ok <display_name> <policies-csv> -> 0 if this is OUR periodic admin
# token (display name equals $EXPECTED_DN AND $REQUIRED_POLICY is one of the
# comma-separated policies), 1 otherwise.
#
# The comma fencing makes the policy match exact (so "vault-admin-ro" never
# matches). The comparison is a LITERAL substring test: the policy name is
# quoted inside the [[ ]] pattern, so characters such as "." or "*" in
# $REQUIRED_POLICY are not treated as wildcards.
vtr_drift_ok() {
  local dn="$1" pols="$2"
  [[ "$dn" == "$EXPECTED_DN" ]] || return 1
  [[ ",$pols," == *",$REQUIRED_POLICY,"* ]] || return 1
}
vtr_main() {
  # Look up the token in use, refuse to continue unless it is our periodic
  # admin token, then renew it. Each outcome appends one line to
  # ${XDG_STATE_HOME:-$HOME/.local/state}/vault-token-renew.log; every failure
  # path ends in `exit 1`.
  set -euo pipefail
  export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"

  local state_log lookup_json display_name policy_csv renew_json lease_ttl
  state_log="${XDG_STATE_HOME:-$HOME/.local/state}/vault-token-renew.log"
  mkdir -p "$(dirname "$state_log")"

  if ! lookup_json=$(vault token lookup -format=json 2>&1); then
    printf '%s FAIL: token lookup: %s\n' "$(date -Is)" "$lookup_json" >>"$state_log"
    exit 1
  fi
  display_name=$(vtr_display_name "$lookup_json")
  policy_csv=$(vtr_policies_csv "$lookup_json")

  # Drift guard (added 2026-06-07): the renewer must NOT keep a FOREIGN token alive.
  # On 2026-06-05 a stray `vault login -method=kubernetes` overwrote ~/.vault-token
  # with a read-only woodpecker token, and this script then silently renewed THAT
  # for two days — masking the loss of write access. So before renewing, confirm
  # the token is our periodic admin token; if it has drifted, fail loudly (systemd
  # marks the unit failed) instead of keeping someone else's token alive.
  if ! vtr_drift_ok "$display_name" "$policy_csv"; then
    printf '%s DRIFT: ~/.vault-token is dn=%q policies=%q (expected dn=%q with %q). Refusing to renew a foreign token. Re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
      "$(date -Is)" "$display_name" "$policy_csv" "$EXPECTED_DN" "$REQUIRED_POLICY" >>"$state_log"
    exit 1
  fi

  # `vault token renew` with no argument renews the calling token (renew-self).
  if ! renew_json=$(vault token renew -format=json 2>&1); then
    printf '%s FAIL: %s\n' "$(date -Is)" "$renew_json" >>"$state_log"
    exit 1
  fi

  # On success, log only the new TTL (never the raw JSON — it contains the token).
  lease_ttl=$(printf '%s' "$renew_json" | jq -r '.auth.lease_duration' 2>/dev/null || echo '?')
  printf '%s OK renewed (dn=%s ttl=%ss)\n' "$(date -Is)" "$display_name" "$lease_ttl" >>"$state_log"
}
# Run main only when executed directly, so the test can source the pure functions.
# BASH_SOURCE[0] equals $0 only when this file is the script being run; when the
# file is sourced the comparison is false and vtr_main is not called.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
vtr_main "$@"
fi

View file

@ -0,0 +1,10 @@
[Unit]
Description=Daily renewal of the periodic Vault token in ~/.vault-token
[Timer]
OnCalendar=daily
Persistent=true
RandomizedDelaySec=300
[Install]
WantedBy=timers.target

View file

@ -0,0 +1,121 @@
#!/usr/bin/env bash
# Programmatically register a Forgejo repo in Woodpecker without needing the
# Web UI's OAuth flow.
#
# Earlier we believed only the OAuth login could create a working webhook
# because the webhook URL contains a JWT signed with a server-side key.
# That's true for the JWT, BUT the webhook is created server-side when the
# repo is activated through POST /api/repos — Woodpecker handles the JWT
# generation internally. We just need to call that endpoint as the right
# user (the one whose forge OAuth token can read the repo).
#
# The Woodpecker admin token (mine, ViktorBarzin@github) is a session JWT
# of the form `{"type":"user","user-id":"1"}` signed with the user's
# `hash` column (per-user, stored in the `users` table). Forge-API calls
# made on behalf of that user use the user's stored OAuth `access_token`
# from the same row. My GitHub admin can't read Forgejo repos, so the
# admin token can't activate Forgejo repos.
#
# The fix: mint a session JWT for the Forgejo `viktor` user (user_id=2)
# using `viktor`'s `hash`. Then POST /api/repos as viktor — viktor's
# stored Forgejo OAuth token has the access needed.
#
# Usage:
# ./woodpecker-register-forgejo-repo.sh <forgejo-org/repo> [<forgejo-org/repo> ...]
# Example:
# ./woodpecker-register-forgejo-repo.sh viktor/broker-sync viktor/freedify
#
# Requires:
# - vault CLI logged in (oidc or token), with read access to
# secret/database/static-creds/pg-woodpecker AND a Forgejo PAT in
# secret/viktor/forgejo_admin_token (or pass FORGEJO_TOKEN env var)
# - kubectl with cluster access (for the temporary psql pod)
# - openssl
set -euo pipefail
NS=${NS:-woodpecker}
WP_URL=${WP_URL:-https://ci.viktorbarzin.me}
FORGEJO_URL=${FORGEJO_URL:-https://forgejo.viktorbarzin.me}
FORGEJO_USER_LOGIN=${FORGEJO_USER_LOGIN:-viktor}
if [ "$#" -lt 1 ]; then
  echo "usage: $0 <org/repo> [<org/repo> ...]" >&2
  exit 1
fi

# Pull viktor's `hash` from the woodpecker DB (used to sign the session JWT)
# and OAuth access_token (sanity check it exists).
# The static creds are read ONCE so the username and password always come
# from the same Vault response instead of two separate round trips.
WP_DB_CREDS=$(vault read -format=json database/static-creds/pg-woodpecker)
WP_DB_USER=$(jq -r .data.username <<<"$WP_DB_CREDS")
WP_DB_PASS=$(jq -r .data.password <<<"$WP_DB_CREDS")
PG_POD=tmp-wp-register-$$

# Install the cleanup trap BEFORE creating the pod so an interruption right
# after `kubectl apply` cannot leak it; deleting a pod that was never created
# is harmless because the command's failure is ignored.
trap 'kubectl delete pod -n "$NS" "$PG_POD" --wait=false >/dev/null 2>&1 || true' EXIT

# NOTE(review): the DB password is written into the pod spec as a plain env
# value, so it is readable via the pod object for the pod's lifetime.
cat <<EOF | kubectl apply -f - >/dev/null
apiVersion: v1
kind: Pod
metadata: { name: $PG_POD, namespace: $NS }
spec:
  restartPolicy: Never
  containers:
    - name: psql
      image: postgres:15
      env: [{name: PGPASSWORD, value: "$WP_DB_PASS"}]
      command: ["sleep", "300"]
EOF

# Wait up to ~30s for the helper pod, then verify it really is Running rather
# than falling through to a confusing `kubectl exec` failure.
PHASE=""
for _ in {1..30}; do
  PHASE=$(kubectl get pod -n "$NS" "$PG_POD" -o jsonpath='{.status.phase}' 2>/dev/null || true)
  [ "$PHASE" = "Running" ] && break
  sleep 1
done
if [ "$PHASE" != "Running" ]; then
  echo "ERROR: helper pod $PG_POD did not reach Running (phase: ${PHASE:-unknown})" >&2
  exit 1
fi

VIKTOR_HASH=$(kubectl exec -n "$NS" "$PG_POD" -- psql -h pg-cluster-rw.dbaas -U "$WP_DB_USER" -d woodpecker -tA -c \
  "SELECT hash FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2" | tr -d '[:space:]')
if [ -z "$VIKTOR_HASH" ]; then
  echo "ERROR: no woodpecker user found for forge_id=2 login=$FORGEJO_USER_LOGIN" >&2
  echo " (have they ever logged in via Forgejo OAuth?)" >&2
  exit 1
fi
# Mint a session JWT (HS256) for that user.
# b64: stdin -> single-line, URL-safe base64 with the '=' padding removed.
b64() { openssl base64 -A | tr '+/' '-_' | tr -d '='; }

# Resolve the user id as its own statement. When the lookup was nested inside
# the printf argument, a failed `kubectl exec` was swallowed and a token with
# an empty user-id was minted; here `set -e`/pipefail and the emptiness check
# stop the script instead.
WP_USER_ID=$(kubectl exec -n "$NS" "$PG_POD" -- psql -h pg-cluster-rw.dbaas -U "$WP_DB_USER" -d woodpecker -tA -c \
  "SELECT id FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2" | tr -d '[:space:]')
if [ -z "$WP_USER_ID" ]; then
  echo "ERROR: could not resolve woodpecker user id for forge_id=2 login=$FORGEJO_USER_LOGIN" >&2
  exit 1
fi

HEADER=$(printf '%s' '{"alg":"HS256","typ":"JWT"}' | b64)
PAYLOAD=$(printf '{"type":"user","user-id":"%s"}' "$WP_USER_ID" | b64)
# The signature is the HMAC-SHA256 of "<header>.<payload>" keyed with the user's hash.
SIG=$(printf '%s.%s' "$HEADER" "$PAYLOAD" | openssl dgst -sha256 -hmac "$VIKTOR_HASH" -binary | b64)
TOKEN="$HEADER.$PAYLOAD.$SIG"
# Sanity check: am I really logged in as viktor?
# `|| true`: with `set -e` + pipefail a rejected token (curl -f exits non-zero)
# would otherwise terminate the script on this line, before the explanatory
# error below could be printed. On failure ME is simply empty.
ME=$(curl -sf "$WP_URL/api/user" -H "Authorization: Bearer $TOKEN" | jq -r '.login' || true)
if [ "$ME" != "$FORGEJO_USER_LOGIN" ]; then
  echo "ERROR: minted token authenticates as '$ME', not '$FORGEJO_USER_LOGIN'" >&2
  exit 1
fi
echo "Authenticated as: $ME"

# Activate each repo via POST /api/repos?forge_remote_id=N
# Forgejo repo ID is fetched via the Forgejo API.
# FORGEJO_TOKEN from the environment wins; otherwise fall back to the
# forgejo_admin_token field of secret/viktor in Vault.
FORGEJO_AUTH="${FORGEJO_TOKEN:-$(vault kv get -field=forgejo_admin_token secret/viktor 2>/dev/null || true)}"
if [ -z "$FORGEJO_AUTH" ]; then
  echo "ERROR: set FORGEJO_TOKEN env or seed secret/viktor/forgejo_admin_token in vault" >&2
  exit 1
fi
# Response bodies go to a private mktemp file instead of a predictable
# /tmp/wp-add-<id>.json path that another local user could pre-create.
WP_RESP=$(mktemp)
for repo in "$@"; do
  FRID=$(curl -sf "$FORGEJO_URL/api/v1/repos/$repo" -H "Authorization: token $FORGEJO_AUTH" | jq -r .id 2>/dev/null || true)
  if [ -z "$FRID" ] || [ "$FRID" = "null" ]; then
    echo " $repo: ERROR resolving Forgejo repo id" >&2
    continue
  fi
  # Start every iteration from an empty body so a failed request cannot
  # replay the previous repo's response.
  : >"$WP_RESP"
  # `|| true`: a transport error is reported through the HTTP code printed by
  # -w (handled by the catch-all arm) instead of aborting the remaining repos.
  HTTP=$(curl -s -X POST "$WP_URL/api/repos?forge_remote_id=$FRID" \
    -H "Authorization: Bearer $TOKEN" \
    -o "$WP_RESP" -w "%{http_code}") || true
  case "$HTTP" in
    200) echo " $repo: activated (id=$(jq -r .id "$WP_RESP"))" ;;
    409) echo " $repo: already active" ;;
    *) echo " $repo: HTTP $HTTP$(cat "$WP_RESP")" ;;
  esac
done
rm -f -- "$WP_RESP"

3
scripts/workstation/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
__pycache__/
.pytest_cache/
*.pyc

View file

@ -0,0 +1,3 @@
{
"claudeMd": "# Viktor Barzin homelab — shared multi-user Claude Code Workstation (devvm)\n\nYou are running as a specific OS user on a SHARED devvm Workstation, not as the admin. These org-wide rules apply to EVERY user and sit at the top of settings precedence (they cannot be overridden by a user's own config):\n\n- Respect your permission tier. Your kubectl, Vault, and infra access are scoped to your RBAC tier (admin / power-user / namespace-owner). Do not attempt to escalate privileges or reach another user's resources.\n- Secrets are per-user. Never read another user's home directory, credentials, tokens, or ~/.claude secrets. Your own secrets live in your home at mode 600.\n- Infrastructure changes go through Terraform/Terragrunt (scripts/tg apply) — never direct kubectl apply/edit/patch. Pushing to git does NOT deploy; applies are manual and admin-gated, so your edits cannot take effect without an admin apply.\n- Follow the engineering rules in ~/.claude/rules/ (execution, planning, quality) and every CLAUDE.md in the repo tree.\n- The monorepo is at ~/code. Non-admins get a git-crypt-LOCKED clone: secret files read as ciphertext — that is expected, not an error."
}

View file

@ -0,0 +1,26 @@
# Declarative host toolset for the devvm Workstation (apt packages, one per line).
# Consumed by setup-devvm.sh: apt-get install -y $(grep -vE '^\s*(#|$)' packages.txt)
# Comments (#) and blank lines are ignored. Tools NOT in the standard apt repos
# are listed below as comments with their real install path (handled explicitly
# in setup-devvm.sh) so this manifest stays a safe argument to `apt-get install`.
git
zsh
tmux
ripgrep
fd-find
jq
curl
ca-certificates
python3
python3-yaml
python3-pip
podman
# --- installed by setup-devvm.sh via NON-apt paths (not apt-installable) ---
# nodejs + npm -> NodeSource repo (claude-code needs node >= 18; distro nodejs is too old)
# @anthropic-ai/claude-code -> npm install -g
# kubectl -> k8s apt repo OR pinned binary (already present on devvm)
# vault -> HashiCorp apt repo OR pinned binary (already present on devvm)
# kubelogin (kubectl oidc-login) -> `kubectl krew install oidc-login` or int128/kubelogin release.
# NOTE: the apt package literally named "kubelogin" is the AZURE
# tool, NOT the OIDC one we need -- do not apt-install it.

View file

@ -0,0 +1,21 @@
# THE single source of truth for the devvm Workstation lifecycle (onboard -> offboard).
# Consumed by roster_engine.py (derive/validate) + t3-provision-users.sh (apply).
#
# os_user (the map KEY, pinned) -> authentik_user . k8s_user . tier . namespaces
# The three identifiers differ per person (verified 2026-06-08) -- no email->username
# derivation; record each explicitly.
#
# Tiers: admin | power-user | namespace-owner
# admin - cluster-admin, unlocked tree, secrets (groups: sudo,docker,code-shared)
# power-user - cluster-wide READ (no Secrets) via oidc-power-user-readonly; locked clone
# namespace-owner - admin in their own namespace(s) only; locked clone
#
# wizard IS listed (as admin): the reconcile REGENERATES /etc/ttyd-user-map +
# dispatch.json from this file, so omitting him would drop his t3 instance. The
# provisioner skips account/group/clone mutations for already-existing users, so
# listing him is safe (he keeps his unlocked tree + cluster-admin untouched).
users:
wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin} # base config author + cluster-admin
emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user} # NET-NEW k8s_users entry (add as power-user before provisioning)
ancamilea: {authentik_user: ancaelena98, k8s_user: anca, tier: namespace-owner, namespaces: [plotting-book]} # ALREADY provisioned in-cluster -- assert, don't re-create
# gheorghe: {authentik_user: vabbit81, k8s_user: vabbit81, tier: namespace-owner, namespaces: [vabbit81]} # already a cluster ns-owner; uncomment to give him a devvm workstation

View file

@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""Pure derivation + offboarding-diff engine for the devvm Workstation roster.
Functional core (this module, unit-tested) / imperative shell (the bash
provisioner that consumes the JSON this emits and performs the host mutations).
No host I/O lives in the tested functions. See PRD ViktorBarzin/infra#9.
The roster (`roster.yaml`) is the single source of truth for the workstation
lifecycle. `os_user` is the pinned key; `authentik_user` / `k8s_user` differ
per person and are recorded explicitly (no email->username derivation).
"""
from __future__ import annotations
import json
import sys
from dataclasses import dataclass, field
from typing import Iterable
import yaml
# First port tried for a user with no existing port; _allocate_ports probes
# upward from here until it finds one that is not in use.
BASE_PORT = 3773
# The only tier names _parse_user accepts.
VALID_TIERS = ("admin", "power-user", "namespace-owner")
# Tier -> supplementary groups the reconcile ENSURES (additive-only; never stripped).
TIER_GROUPS: dict[str, tuple[str, ...]] = {
"admin": ("code-shared", "docker", "sudo"),
"power-user": (),
"namespace-owner": (),
}
# Shell recorded on every Account built by derive_desired_state.
DEFAULT_SHELL = "/bin/zsh"
# Offboarding steps that offboard_plan emits with reversible=True, in this
# order, for every user dropped from the roster.
_REVERSIBLE_OFFBOARD_KINDS = (
"disable_instance",
"unmap_dispatch",
"remove_from_t3_group",
"lock_login",
"revoke_cluster_rbac",
)
class RosterError(ValueError):
    """Raised when the roster is structurally invalid."""
@dataclass(frozen=True)
class User:
    """One roster entry: a person's three identifiers plus their tier."""

    os_user: str  # the roster map key
    authentik_user: str
    k8s_user: str
    tier: str  # checked against VALID_TIERS by _parse_user
    namespaces: tuple[str, ...] = ()  # set only for namespace-owner entries
@dataclass(frozen=True)
class Roster:
    """The parsed roster: every User keyed by os_user."""

    users: dict[str, User] = field(default_factory=dict)
@dataclass(frozen=True)
class Account:
    """Desired host-account settings for one roster user."""

    os_user: str
    tier: str
    shell: str
    login_locked: bool
    groups: tuple[str, ...]  # supplementary groups to ensure (see TIER_GROUPS)
@dataclass(frozen=True)
class DesiredState:
    """Everything derive_desired_state computes from a roster."""

    accounts: dict[str, Account]  # os_user -> Account
    ttyd_user_map: str  # full text of the generated ttyd user map
    dispatch: dict[str, dict]  # authentik_user -> {"os_user", "port"}
    ports: dict[str, int]  # os_user -> port
@dataclass(frozen=True)
class OffboardAction:
    """One offboarding step to perform for a user dropped from the roster."""

    os_user: str
    kind: str  # a _REVERSIBLE_OFFBOARD_KINDS entry or "userdel_archive"
    reversible: bool
# --------------------------------------------------------------------------
# Parsing + structural validation
# --------------------------------------------------------------------------
def _parse_user(os_user: str, spec: dict) -> User:
    """Validate one roster entry and build its User.

    Args:
        os_user: the roster map key for this entry.
        spec: the mapping of fields recorded under that key.

    Returns:
        The validated User.

    Raises:
        RosterError: if the entry is not a mapping, lacks a required field,
            names an unknown tier, gives ``namespaces`` as anything but a
            list, or combines tier and namespaces inconsistently.
    """
    # An entry such as `emo:` with no value parses to None; report it as a
    # roster problem instead of letting the `in` test below raise TypeError.
    if not isinstance(spec, dict):
        raise RosterError(
            f"user {os_user!r}: expected a mapping of fields, got {type(spec).__name__}"
        )
    for required in ("authentik_user", "k8s_user", "tier"):
        if required not in spec:
            raise RosterError(f"user {os_user!r}: missing required field {required!r}")
    tier = spec["tier"]
    if tier not in VALID_TIERS:
        raise RosterError(
            f"user {os_user!r}: unknown tier {tier!r} (valid: {list(VALID_TIERS)})"
        )
    raw_namespaces = spec.get("namespaces") or ()
    # A scalar (`namespaces: plotting-book`) would otherwise be split into
    # single characters by tuple().
    if not isinstance(raw_namespaces, (list, tuple)):
        raise RosterError(f"user {os_user!r}: namespaces must be a list of names")
    namespaces = tuple(raw_namespaces)
    if tier == "namespace-owner" and not namespaces:
        raise RosterError(f"user {os_user!r}: namespace-owner requires namespaces")
    if tier != "namespace-owner" and namespaces:
        raise RosterError(f"user {os_user!r}: only namespace-owner may set namespaces")
    return User(os_user, spec["authentik_user"], spec["k8s_user"], tier, namespaces)
def load_roster(text: str) -> Roster:
    """Parse roster YAML text into a Roster.

    An empty document, or one without ``users``, yields an empty Roster.

    Raises:
        RosterError: if the document or its ``users`` value is not a mapping,
            or if any entry is rejected by _parse_user.
    """
    data = yaml.safe_load(text) or {}
    # Fail loudly with a roster-level message rather than an AttributeError
    # from `.get` / `.items` when the YAML has the wrong shape.
    if not isinstance(data, dict):
        raise RosterError("roster: top level must be a mapping with a 'users' key")
    users_raw = data.get("users") or {}
    if not isinstance(users_raw, dict):
        raise RosterError("roster: 'users' must be a mapping of os_user -> fields")
    return Roster({name: _parse_user(name, spec) for name, spec in users_raw.items()})
def load_roster_file(path: str) -> Roster:
    """Read the UTF-8 file at ``path`` and parse it with load_roster."""
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return load_roster(text)
# --------------------------------------------------------------------------
# Tier validation against live k8s_users (fail-loud)
# --------------------------------------------------------------------------
@dataclass(frozen=True)
class ValidationIssue:
    """One finding produced by validate_tiers."""

    os_user: str
    severity: str  # "error" = tier conflict (abort) | "warn" = absent (grant pending)
    message: str
def validate_tiers(
    roster: Roster, k8s_user_tiers: dict[str, str]
) -> list[ValidationIssue]:
    """Check every non-admin roster user against the live ``k8s_users`` map.

    * k8s_user missing from the map -> "warn" (net-new user; onboarding
      proceeds, the kubectl grant is still pending).
    * tier in the map differs from the roster tier -> "error" (abort).

    Admins are skipped (cluster-admin is granted out of band). An empty
    result means the roster is consistent with the cluster.
    """
    found: list[ValidationIssue] = []
    non_admins = (u for u in roster.users.values() if u.tier != "admin")
    for member in non_admins:
        cluster_tier = k8s_user_tiers.get(member.k8s_user)
        if cluster_tier is None:
            severity = "warn"
            message = (
                f"{member.os_user}: tier {member.tier} but k8s_user {member.k8s_user!r} "
                f"absent from k8s_users (kubectl grant pending — add the entry)"
            )
        elif cluster_tier != member.tier:
            severity = "error"
            message = (
                f"{member.os_user}: roster tier {member.tier} != k8s_users tier "
                f"{cluster_tier} for {member.k8s_user!r}"
            )
        else:
            continue
        found.append(ValidationIssue(member.os_user, severity, message))
    return found
def has_blocking_errors(issues: list[ValidationIssue]) -> bool:
    """Return True when at least one issue has severity "error"."""
    for issue in issues:
        if issue.severity == "error":
            return True
    return False
# --------------------------------------------------------------------------
# Desired-state derivation (sticky ports, ttyd map, dispatch, accounts)
# --------------------------------------------------------------------------
def _allocate_ports(roster: Roster, existing_ports: dict[str, int]) -> dict[str, int]:
    """Map every roster user to a port, keeping ports that already exist.

    Users present in ``existing_ports`` keep their port. The rest are handled
    in sorted os_user order and each receives the lowest port >= BASE_PORT
    that is not yet taken.
    """
    assigned = {
        name: existing_ports[name] for name in roster.users if name in existing_ports
    }
    taken = set(assigned.values())
    # Every port below next_free is already taken, so the search can resume
    # where the previous one stopped instead of restarting at BASE_PORT.
    next_free = BASE_PORT
    for name in sorted(roster.users):
        if name in assigned:
            continue
        while next_free in taken:
            next_free += 1
        assigned[name] = next_free
        taken.add(next_free)
    return assigned
# Comment header written at the top of the generated ttyd user map.
_TTYD_MAP_HEADER = (
    "# Generated from roster.yaml by roster_engine.py — DO NOT EDIT BY HAND.\n"
    "# <authentik_user>=<os_user>; consumed by t3-dispatch.\n"
)


def derive_desired_state(
    roster: Roster, existing_ports: dict[str, int]
) -> DesiredState:
    """Compute accounts, ttyd map, dispatch table and ports for a roster.

    The ttyd map lines and the dispatch entries are ordered by allocated
    port; accounts follow the roster's own order.
    """
    ports = _allocate_ports(roster, existing_ports)
    by_port = sorted(roster.users.values(), key=lambda member: ports[member.os_user])

    map_lines = []
    dispatch = {}
    for member in by_port:
        map_lines.append(f"{member.authentik_user}={member.os_user}")
        dispatch[member.authentik_user] = {
            "os_user": member.os_user,
            "port": ports[member.os_user],
        }
    ttyd_user_map = _TTYD_MAP_HEADER + "\n".join(map_lines) + "\n"

    accounts = {}
    for member in roster.users.values():
        accounts[member.os_user] = Account(
            os_user=member.os_user,
            tier=member.tier,
            shell=DEFAULT_SHELL,
            login_locked=True,
            groups=TIER_GROUPS[member.tier],
        )
    return DesiredState(accounts, ttyd_user_map, dispatch, ports)
def groups_to_add(desired: Iterable[str], current: Iterable[str]) -> list[str]:
    """Return, sorted, the desired groups the user is not yet in.

    Additive-only: these are the groups to `gpasswd -a`. A removal is never
    proposed, so a routine reconcile can't strip a pre-existing user's legacy
    groups.
    """
    wanted = set(desired)
    held = set(current)
    return sorted(wanted.difference(held))
# --------------------------------------------------------------------------
# Offboarding diff (staged: reversible cut, then gated destructive removal)
# --------------------------------------------------------------------------
def to_deprovision(old: Roster, new: Roster) -> list[str]:
    """Return, sorted, the os_users present in ``old`` but absent from ``new``."""
    dropped = set(old.users).difference(new.users)
    return sorted(dropped)
def offboard_plan(
    old: Roster, new: Roster, *, include_destructive: bool
) -> list[OffboardAction]:
    """Staged offboarding actions for every user dropped between `old` and `new`.

    For each removed user (in `to_deprovision` order) the plan holds one
    reversible action per kind in `_REVERSIBLE_OFFBOARD_KINDS`. The
    irreversible `userdel_archive` action is appended ONLY when
    `include_destructive` is true, so a plain reconcile can never auto-apply it.
    """
    actions: list[OffboardAction] = []
    for name in to_deprovision(old, new):
        for kind in _REVERSIBLE_OFFBOARD_KINDS:
            actions.append(OffboardAction(name, kind, True))
        if not include_destructive:
            continue
        actions.append(OffboardAction(name, "userdel_archive", False))
    return actions
# --------------------------------------------------------------------------
# CLI adapter (imperative shell entrypoint — consumed by t3-provision-users.sh)
# --------------------------------------------------------------------------
def _desired_state_to_dict(ds: DesiredState) -> dict:
    """Flatten a `DesiredState` into plain dicts and lists.

    Each account becomes a dict of its fields (with `groups` copied into a
    list); `ttyd_user_map`, `dispatch` and `ports` are passed through as-is.
    """
    accounts = {}
    for name, account in ds.accounts.items():
        accounts[name] = {
            "os_user": account.os_user,
            "tier": account.tier,
            "shell": account.shell,
            "login_locked": account.login_locked,
            "groups": list(account.groups),
        }
    return {
        "accounts": accounts,
        "ttyd_user_map": ds.ttyd_user_map,
        "dispatch": ds.dispatch,
        "ports": ds.ports,
    }
def _main(argv: list[str]) -> int:
    """CLI entry point with two subcommands, both taking `--roster`.

    `validate` reads `--k8s-users-json`, prints every issue to stderr as
    `SEVERITY: message`, and returns 1 when `has_blocking_errors` says so,
    otherwise 0. `derive` reads `--ports-json` and writes the desired state to
    stdout as sorted, indented JSON, returning 0.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Workstation roster engine")
    commands = parser.add_subparsers(dest="cmd", required=True)

    validate_cmd = commands.add_parser(
        "validate", help="exit 1 if roster tiers diverge from k8s_users"
    )
    validate_cmd.add_argument("--roster", required=True)
    validate_cmd.add_argument(
        "--k8s-users-json", required=True, help="JSON map {k8s_user: tier}"
    )

    derive_cmd = commands.add_parser("derive", help="emit desired state as JSON")
    derive_cmd.add_argument("--roster", required=True)
    derive_cmd.add_argument(
        "--ports-json", required=True, help="JSON map {os_user: port}"
    )

    opts = parser.parse_args(argv)
    roster = load_roster_file(opts.roster)

    if opts.cmd != "validate":
        # The only other subcommand argparse accepts is `derive`.
        with open(opts.ports_json, encoding="utf-8") as handle:
            existing_ports = json.load(handle)
        desired = derive_desired_state(roster, existing_ports)
        json.dump(
            _desired_state_to_dict(desired), sys.stdout, indent=2, sort_keys=True
        )
        sys.stdout.write("\n")
        return 0

    with open(opts.k8s_users_json, encoding="utf-8") as handle:
        live_tiers = json.load(handle)
    issues = validate_tiers(roster, live_tiers)
    for issue in issues:
        print(f"{issue.severity.upper()}: {issue.message}", file=sys.stderr)
    if has_blocking_errors(issues):
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(_main(sys.argv[1:]))

View file

@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Idempotent machine-wide host base for the devvm Claude Code Workstation.
# Run as root. Sets up ONLY machine-wide state: the apt toolset, node + claude-code,
# kubelogin, the ENFORCED managed Claude config, and /etc/skel defaults (launcher,
# tmux UX, and live config-inheritance symlinks into the shared config base).
#
# PER-USER provisioning (accounts, per-tier groups, kubeconfig, secrets, infra
# clone) lives in t3-provision-users.sh — NOT here. Safe to re-run.
set -euo pipefail

# Absolute directory holding this script; sibling assets are resolved from it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The shared config base every user inherits from (live, chezmoi-versioned).
# Coupled to the admin's home today; set WORKSTATION_CONFIG_BASE to relocate it
# to a neutral path.
CONFIG_BASE="${WORKSTATION_CONFIG_BASE:-/home/wizard/.claude}"

# Everything below writes machine-wide state, so refuse to run unprivileged.
if (( EUID != 0 )); then
  echo "setup-devvm.sh: must run as root" >&2
  exit 1
fi
# Print a progress line to stdout, prefixed with the script tag.
# Arguments: the message words (joined with spaces).
log() {
  printf '[setup-devvm] %s\n' "$*"
}
# 1) apt toolset (declarative manifest; comments/blank lines stripped)
manifest="$HERE/packages.txt"
# grep runs inside a process substitution, so neither `set -e` nor pipefail
# sees it fail: without this check a missing manifest would silently yield an
# empty package list and the run would still report success.
if [[ ! -r "$manifest" ]]; then
  echo "setup-devvm.sh: cannot read package manifest $manifest" >&2
  exit 1
fi
mapfile -t PKGS < <(grep -vE '^[[:space:]]*(#|$)' "$manifest")
log "apt: ensuring ${#PKGS[@]} packages present"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
# Skip the install when the manifest lists nothing: expanding an empty array
# under `set -u` is an "unbound variable" error on bash older than 4.4.
if (( ${#PKGS[@]} > 0 )); then
  apt-get install -y "${PKGS[@]}" >/dev/null
fi
# 2) node >= 18 + claude-code (claude-code requires node >= 18)
# need_node stays 1 unless a node binary exists AND its major version is >= 18.
need_node=1
if command -v node >/dev/null \
  && [[ "$(node -v | sed 's/^v\([0-9]*\).*/\1/')" -ge 18 ]]; then
  need_node=0
fi
if (( need_node == 1 )); then
  log "node: installing NodeSource 22.x"
  curl -fsSL https://deb.nodesource.com/setup_22.x | bash - >/dev/null
  apt-get install -y nodejs >/dev/null
fi
# claude-code is installed only when no `claude` command is found.
if ! command -v claude >/dev/null; then
  log "npm: installing @anthropic-ai/claude-code"
  npm install -g @anthropic-ai/claude-code >/dev/null
fi
# 3) kubelogin (kubectl oidc-login) system-wide — NOT the apt 'kubelogin' (= Azure tool)
# Skipped entirely when an executable /usr/local/bin/kubelogin already exists.
if [[ ! -x /usr/local/bin/kubelogin ]]; then
  log "kubelogin: installing int128/kubelogin"
  tmp="$(mktemp -d)"
  # `set -e` aborts on a failed download/extract/install before the explicit
  # cleanup below is reached, so also remove the scratch dir from an EXIT trap.
  trap 'rm -rf -- "$tmp"' EXIT
  curl -fsSL -o "$tmp/kl.zip" https://github.com/int128/kubelogin/releases/latest/download/kubelogin_linux_amd64.zip
  # Try unzip first; if it is missing or fails, extract with python's zipfile.
  ( cd "$tmp" && { unzip -o kl.zip kubelogin >/dev/null 2>&1 || python3 -m zipfile -e kl.zip .; } )
  install -m 0755 "$tmp/kubelogin" /usr/local/bin/kubelogin
  # Second name for the same binary, so it is also reachable as kubectl-oidc_login.
  ln -sf /usr/local/bin/kubelogin /usr/local/bin/kubectl-oidc_login
  rm -rf -- "$tmp"
  trap - EXIT
fi
# 4) machine-wide ENFORCED Claude config (org claudeMd; top precedence; NO secrets)
managed_dir=/etc/claude-code
install -d -m 0755 "$managed_dir"
install -m 0644 "$HERE/managed-settings.json" "$managed_dir/managed-settings.json"
log "managed-settings.json -> /etc/claude-code/ (enforced org claudeMd)"

# 5) /etc/skel for NEW accounts: launcher + tmux UX + live-inheritance symlinks.
#    `useradd -m` copies a symlink found in /etc/skel into each new home as a
#    symlink, so new users' ~/.claude/{skills,rules,...} resolve to the shared
#    base and pick up the admin's edits live. Secrets + hooks are per-user
#    (written by the provisioner), NEVER symlinked here.
skel_dir=/etc/skel
install -d -m 0755 "$skel_dir"
install -m 0755 "$HERE/skel/start-claude.sh" "$skel_dir/start-claude.sh"
install -m 0644 "$HERE/skel/tmux.conf" "$skel_dir/.tmux.conf"
install -d -m 0755 "$skel_dir/.claude"
# Link only the shared subdirectories that actually exist under CONFIG_BASE.
for shared in skills rules agents commands; do
  if [[ -d "$CONFIG_BASE/$shared" ]]; then
    ln -sfn "$CONFIG_BASE/$shared" "$skel_dir/.claude/$shared"
  fi
done
log "skel: launcher + tmux + inheritance symlinks (base=$CONFIG_BASE)"
# 6) deploy the roster-driven provisioner to /usr/local/bin (run hourly by
#    t3-provision-users.timer). Re-deployed on every run so its logic stays
#    reproducible.
provisioner_src="$HERE/../t3-provision-users.sh"
install -m 0755 "$provisioner_src" /usr/local/bin/t3-provision-users
log "t3-provision-users -> /usr/local/bin/ (roster-driven)"

# 7) harden the admin's unlocked tree: it holds git-crypt-DECRYPTED secrets, so
#    it must NOT be world-readable — only the admin + code-shared. Without this,
#    ANY devvm user (even outside code-shared) could read decrypted secrets by
#    path. ADMIN_CODE may be preset in the environment to point elsewhere.
: "${ADMIN_CODE:=/home/wizard/code}"
if [[ -d "$ADMIN_CODE" ]]; then
  chmod o-rx "$ADMIN_CODE"
  log "hardened $ADMIN_CODE (o-rx — not world-readable)"
fi

log "OK (idempotent)"

View file

@ -0,0 +1,42 @@
#!/bin/bash
# Per-user Claude Code Workstation launcher (devvm). Lands the user in their OWN
# ~/code clone (NOT a hardcoded /home/wizard/code) and names the Claude session
# after the tmux session so /resume, the prompt box, and the terminal title line
# up. Deployed via /etc/skel by setup-devvm.sh, so new accounts get it on
# `useradd -m`. Existing users are repointed to this during their migration.
# Greeting banner.
printf '\n Welcome, %s! 🚀\n\n' "$(id -un)"
printf ' Starting Claude Code in %s/code ...\n' "$HOME"
printf ' (Right-click for tmux menu, or Ctrl+B then | or - to split)\n\n'

# Inside tmux, pass the tmux session name to Claude via --name; outside tmux
# (or if the name cannot be read) no --name argument is passed.
name_args=()
if [[ -n "${TMUX:-}" ]]; then
  sess="$(tmux display-message -p '#{session_name}' 2>/dev/null)"
  if [[ -n "$sess" ]]; then
    name_args=(--name "$sess")
  fi
fi

# Work from the user's own ~/code clone; fall back to $HOME if it is missing.
if ! cd "$HOME/code" 2>/dev/null; then
  cd "$HOME"
fi
# Run Claude Code with the given arguments and return its exit status.
# Uses the system-wide `claude` (installed by setup-devvm.sh) when it is
# available, otherwise goes through npx.
launch() {
  if ! command -v claude >/dev/null 2>&1; then
    npx @anthropic-ai/claude-code "$@"
    return
  fi
  claude "$@"
}
# Claude is run as a child rather than via `exec` so its exit status can be
# inspected: a clean quit ends the pane (ttyd closes the terminal), while a
# crash falls through to a login shell so the tmux session isn't
# destroyed-and-recreated in a ttyd auto-reconnect loop.
claude_flags=(--dangerously-skip-permissions --model claude-opus-4-8)
launch "${claude_flags[@]}" "${name_args[@]}"
code=$?
if (( code == 0 )); then
  exit 0
fi
printf '\n claude exited abnormally (status %s). Dropping to a shell — your tmux session is preserved.\n' "$code"
printf ' Re-launch any time with: ~/start-claude.sh\n\n'
exec "${SHELL:-/bin/bash}" -l

View file

@ -0,0 +1,51 @@
# Workstation base tmux config (deployed to /etc/skel/.tmux.conf by
# setup-devvm.sh; new accounts inherit it). Uses $HOME (which tmux expands from
# the environment when it loads this file) so it works for ANY user — never a
# hardcoded /home/<name>.
#
# NOTE: the tmux-resurrect/continuum "persistence" block is owned by the separate
# terminal-lobby tool, which appends its own managed section + installs tpm. This
# base file intentionally omits it so a fresh account isn't left with broken
# `run ~/.tmux/plugins/tpm/tpm` references before terminal-lobby runs.
# Launch the per-user Claude launcher in every new pane/window (lands in ~/code).
set -g default-command "$HOME/start-claude.sh"
# Mouse support — click panes, drag to resize, scroll with wheel
set -g mouse on
# Easy splits: Ctrl+b then | for vertical, - for horizontal
# (`c` is rebound too, so splits and new windows all start in the current
# pane's working directory.)
bind | split-window -h -c "#{pane_current_path}"
bind - split-window -v -c "#{pane_current_path}"
bind c new-window -c "#{pane_current_path}"
# Right-click context menu — clickable actions popup
# Each entry is: label, shortcut key, tmux command; "" inserts a separator.
# The "Shell" entry passes /bin/zsh explicitly, so that pane gets a plain shell
# instead of the default-command launcher set above.
bind -n MouseDown3Pane display-menu -T "#[align=centre]Terminal Menu" -x M -y M \
"New Claude" w "new-window -c '#{pane_current_path}'" \
"Split Horizontal" h "split-window -v -c '#{pane_current_path}'" \
"Split Vertical" v "split-window -h -c '#{pane_current_path}'" \
"" \
"Shell" s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
"" \
"Close Pane" x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
"Close Window" X "confirm-before -p 'Close window? (y/n)' kill-window" \
"" \
"Detach" d "detach-client"
# Clickable [+] button in the status bar — left-click to open the same menu
# NOTE: this copy is positioned with `-y S` rather than `-y M` and has no
# "Detach" entry; otherwise its entries match the right-click menu above.
set -g status-right '#[fg=black bg=green] [+] #[default] #[fg=cyan]Right-click for menu '
set -g status-right-length 60
bind -n MouseDown1StatusRight display-menu -T "#[align=centre]Terminal Menu" -x M -y S \
"New Claude" w "new-window -c '#{pane_current_path}'" \
"Split Horizontal" h "split-window -v -c '#{pane_current_path}'" \
"Split Vertical" v "split-window -h -c '#{pane_current_path}'" \
"" \
"Shell" s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
"" \
"Close Pane" x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
"Close Window" X "confirm-before -p 'Close window? (y/n)' kill-window"
# Status bar styling + 1-based numbering
set -g status-style 'bg=colour235 fg=colour136'
set -g status-left '#[fg=green][#S] '
set -g base-index 1
setw -g pane-base-index 1

View file

@ -0,0 +1,280 @@
"""Unit tests for the pure roster derivation + offboarding-diff engine.
These exercise external behaviour only (parse -> validate -> derive -> diff);
no host I/O is touched. Mirrors the pure-core pytest style used elsewhere in
the monorepo. See PRD ViktorBarzin/infra#9 (modules #1 roster engine, #5
offboarding diff).
"""
import textwrap
import pytest
import roster_engine as eng
def _roster(yaml_text: str) -> "eng.Roster":
    """Dedent `yaml_text` and parse it with `eng.load_roster`."""
    dedented = textwrap.dedent(yaml_text)
    return eng.load_roster(dedented)
# --------------------------------------------------------------------------
# load_roster: parsing + structural validation (module #1)
# --------------------------------------------------------------------------
# A single power-user entry: the mapping key becomes os_user, the listed fields
# are carried through unchanged, and namespaces comes back as an empty tuple.
def test_parses_user_fields_and_tier():
r = _roster(
"""
users:
emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
"""
)
u = r.users["emo"]
assert u.os_user == "emo"
assert u.authentik_user == "emil.barzin"
assert u.k8s_user == "emo"
assert u.tier == "power-user"
assert u.namespaces == ()
# A namespace-owner's `namespaces` list is exposed on the parsed user as a tuple.
def test_namespace_owner_carries_namespaces():
r = _roster(
"""
users:
ancamilea: {authentik_user: ancaelena98, k8s_user: anca,
tier: namespace-owner, namespaces: [plotting-book]}
"""
)
assert r.users["ancamilea"].namespaces == ("plotting-book",)
def test_admin_tier_is_accepted():
    """An `admin` tier entry loads and keeps its tier."""
    roster = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    wizard = roster.users["wizard"]
    assert wizard.tier == "admin"
def test_rejects_unknown_tier():
    """An unrecognised tier raises RosterError with "tier" in the message."""
    bad_yaml = "users: {bob: {authentik_user: b, k8s_user: b, tier: wizard-king}}"
    with pytest.raises(eng.RosterError, match="tier"):
        _roster(bad_yaml)
def test_rejects_missing_required_field():
    """Omitting `authentik_user` raises RosterError naming that field."""
    bad_yaml = "users: {bob: {k8s_user: b, tier: power-user}}"
    with pytest.raises(eng.RosterError, match="authentik_user"):
        _roster(bad_yaml)
def test_namespace_owner_requires_namespaces():
    """A namespace-owner without `namespaces` raises RosterError."""
    bad_yaml = "users: {bob: {authentik_user: b, k8s_user: b, tier: namespace-owner}}"
    with pytest.raises(eng.RosterError, match="namespace"):
        _roster(bad_yaml)
def test_non_namespace_owner_must_not_set_namespaces():
    """Setting `namespaces` on a non-namespace-owner tier raises RosterError."""
    bad_yaml = (
        "users: {bob: {authentik_user: b, k8s_user: b, tier: power-user, "
        "namespaces: [x]}}"
    )
    with pytest.raises(eng.RosterError, match="namespace"):
        _roster(bad_yaml)
def test_empty_roster_is_valid():
    """An explicit empty `users` mapping parses to an empty roster."""
    roster = _roster("users: {}")
    assert roster.users == {}
def test_missing_users_key_is_valid_empty():
    """A document with no `users` key at all parses to an empty roster."""
    roster = _roster("{}")
    assert roster.users == {}
# --------------------------------------------------------------------------
# validate_tiers: roster tier vs live k8s_users (fail-loud, module #1)
# --------------------------------------------------------------------------
def test_validate_ok_when_tiers_match():
    """No issues are reported when roster tier and live tier agree."""
    roster = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, "
        "tier: namespace-owner, namespaces: [plotting-book]}}"
    )
    live_tiers = {"anca": "namespace-owner"}
    assert eng.validate_tiers(roster, live_tiers) == []
def test_validate_flags_tier_mismatch_as_error():
    """Roster says power-user, cluster says namespace-owner: a real conflict,
    reported as a single ERROR naming both tiers."""
    roster = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
    )
    issues = eng.validate_tiers(roster, {"anca": "namespace-owner"})
    assert len(issues) == 1
    issue = issues[0]
    assert issue.severity == "error"
    assert issue.os_user == "ancamilea"
    assert "power-user" in issue.message
    assert "namespace-owner" in issue.message
def test_validate_flags_netnew_absent_as_warn():
    """emo is power-user in the roster but has no k8s_users entry yet.
    Onboarding the workstation should still proceed while the kubectl grant is
    pending, so this is a WARN rather than an error."""
    roster = _roster(
        "users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}"
    )
    issues = eng.validate_tiers(roster, {})
    assert len(issues) == 1
    issue = issues[0]
    assert issue.severity == "warn"
    assert "emo" in issue.message
    assert "k8s_users" in issue.message
def test_validate_skips_admin_tier():
    """wizard (admin) is cluster-admin via a separate mechanism, not k8s_users,
    so an absent k8s_users entry produces no issue."""
    roster = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    issues = eng.validate_tiers(roster, {})
    assert issues == []
def test_has_blocking_errors_distinguishes_mismatch_from_absent():
    """A tier mismatch is blocking; a user merely absent from k8s_users is not."""
    mismatch = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
    )
    absent = _roster(
        "users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}"
    )
    mismatch_issues = eng.validate_tiers(mismatch, {"anca": "namespace-owner"})
    absent_issues = eng.validate_tiers(absent, {})
    assert eng.has_blocking_errors(mismatch_issues) is True
    assert eng.has_blocking_errors(absent_issues) is False
# --------------------------------------------------------------------------
# derive_desired_state: accounts, sticky ports, ttyd map, dispatch (module #1)
# --------------------------------------------------------------------------
# Shared fixture: a three-user roster with one user per tier (admin,
# power-user, namespace-owner), parsed via _roster(THREE) in the tests below.
THREE = """
users:
wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}
emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
ancamilea: {authentik_user: ancaelena98, k8s_user: anca, tier: namespace-owner, namespaces: [plotting-book]}
"""
# Shared fixture: an existing os_user -> port map covering all three users.
LIVE_PORTS = {"wizard": 3773, "emo": 3774, "ancamilea": 3775}
def test_derive_preserves_existing_sticky_ports():
    """Users that already have a port keep exactly that port."""
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    expected = {"wizard": 3773, "emo": 3774, "ancamilea": 3775}
    assert desired.ports == expected
def test_derive_allocates_next_free_port_for_new_user():
    """emo and ancamilea are new, so they take the next free ports after the
    already-used 3773."""
    desired = eng.derive_desired_state(_roster(THREE), {"wizard": 3773})
    assert desired.ports["wizard"] == 3773
    new_ports = sorted(desired.ports[name] for name in ("emo", "ancamilea"))
    assert new_ports == [3774, 3775]
def test_derive_dispatch_keyed_by_authentik_user():
    """The dispatch table maps each authentik_user to its os_user and port."""
    expected = {
        "vbarzin": {"os_user": "wizard", "port": 3773},
        "emil.barzin": {"os_user": "emo", "port": 3774},
        "ancaelena98": {"os_user": "ancamilea", "port": 3775},
    }
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    assert desired.dispatch == expected
def test_derive_ttyd_map_has_one_mapping_per_user():
    """Ignoring blank and `#` comment lines, the ttyd map holds exactly one
    `<authentik_user>=<os_user>` line per roster user."""
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    mappings = set()
    for line in desired.ttyd_user_map.splitlines():
        if not line.strip() or line.lstrip().startswith("#"):
            continue
        mappings.add(line)
    assert mappings == {"vbarzin=wizard", "emil.barzin=emo", "ancaelena98=ancamilea"}
def test_derive_accounts_assign_tier_groups_and_shell():
    """The admin gets the three admin groups, the other tiers get none, and
    the derived shell is /bin/zsh."""
    accounts = eng.derive_desired_state(_roster(THREE), LIVE_PORTS).accounts
    assert accounts["wizard"].groups == ("code-shared", "docker", "sudo")
    for name in ("emo", "ancamilea"):
        assert accounts[name].groups == ()
    assert accounts["emo"].shell == "/bin/zsh"
def test_derive_is_deterministic():
    """Deriving twice from the same roster and ports yields equal results."""
    roster = _roster(THREE)
    first = eng.derive_desired_state(roster, LIVE_PORTS)
    second = eng.derive_desired_state(roster, LIVE_PORTS)
    assert first == second
# --------------------------------------------------------------------------
# groups_to_add: the additive-only invariant (module #1)
# --------------------------------------------------------------------------
def test_groups_to_add_returns_only_missing():
    """Only the desired groups not already held are returned, sorted."""
    desired = ("sudo", "docker", "code-shared")
    current = ("docker",)
    assert eng.groups_to_add(desired, current) == ["code-shared", "sudo"]
def test_groups_to_add_never_proposes_removal_of_extra_groups():
    """emo currently has code-shared+docker (legacy). A power-user reconcile
    wants no groups, and must NOT strip anything (additive-only invariant)."""
    legacy_groups = ("code-shared", "docker")
    additions = eng.groups_to_add((), legacy_groups)
    assert additions == []
def test_groups_to_add_idempotent_when_all_present():
    """Nothing is proposed when every desired group is already held."""
    additions = eng.groups_to_add(("sudo",), ("sudo", "docker"))
    assert additions == []
# --------------------------------------------------------------------------
# offboarding diff: staged plan, destructive never auto (module #5)
# --------------------------------------------------------------------------
# Dropping ancamilea from the three-user roster makes her the only user
# reported for deprovisioning.
def test_to_deprovision_is_old_minus_new():
old = _roster(THREE)
new = _roster(
"""
users:
wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}
emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
"""
)
assert eng.to_deprovision(old, new) == ["ancamilea"]
def test_to_deprovision_empty_when_nothing_removed():
    """Comparing a roster with itself deprovisions nobody."""
    roster = _roster(THREE)
    removed = eng.to_deprovision(roster, roster)
    assert removed == []
def test_offboard_plan_reversible_cut_targets_exactly_the_removed_user():
    """Without the destructive stage, the plan covers exactly the removed
    users and every action in it is reversible."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=False)
    targeted = {action.os_user for action in plan}
    assert targeted == {"emo", "ancamilea"}
    assert all(action.reversible for action in plan)
def test_offboard_plan_excludes_destructive_by_default():
    """With include_destructive=False no `userdel_archive` action appears."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=False)
    kinds = [action.kind for action in plan]
    assert "userdel_archive" not in kinds
def test_offboard_plan_includes_destructive_only_when_explicitly_requested():
    """With include_destructive=True each removed user gets a
    `userdel_archive` action, and none of those is marked reversible."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=True)
    destructive = [action for action in plan if action.kind == "userdel_archive"]
    assert {action.os_user for action in destructive} == {"emo", "ancamilea"}
    assert not any(action.reversible for action in destructive)