stem95su: scheduled Drive->site sync CronJob (every 10m)
CronJob stem95su-gdrive-sync (*/10) mounts the content PVC RW and rclone-syncs the read-only Drive folder "claude" (stem claude/files) onto it (rclone/rclone:1.74.3, scope=drive.readonly, empty-source guard + --max-delete 25). ESO ExternalSecret stem95su-rclone <- Vault secret/stem95su. Requires the GCP OAuth app published to Production or the refresh token expires ~weekly. Lands the gdrive-sync stack on master (it had landed on a feature branch by accident on the shared devvm checkout). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
05b50d2b96
commit
6d224861c4
1168 changed files with 120 additions and 358547 deletions
|
|
@ -1,12 +0,0 @@
|
|||
[Unit]
|
||||
Description=Apply per-VM I/O caps via qm set (idempotent)
|
||||
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/apply-mbps-caps.sh
|
||||
After=pve-cluster.service
|
||||
Wants=pve-cluster.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/apply-mbps-caps.sh
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=apply-mbps-caps
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Apply per-VM I/O caps via `qm set` on the PVE host.
|
||||
#
|
||||
# - Reads each target VM's current boot-disk options.
|
||||
# - Appends/normalises `mbps_rd=<N>,mbps_wr=<N>`.
|
||||
# - Re-applies via `qm set` (live, no reboot needed).
|
||||
# - Idempotent: re-running with no drift is a no-op at the storage
|
||||
# level (proxmox config rewrite is cheap).
|
||||
# - Continues on per-VM failures so one missing/stopped VM doesn't
|
||||
# skip the rest — designed to be safe under the systemd timer.
|
||||
#
|
||||
# Backed by `apply-mbps-caps.{service,timer}` (hourly + 5min-after-boot).
|
||||
# Why these values: see beads code-9v2j + memory id=2726 (alloy IO storm)
|
||||
# + memory id=1575 (VMs intentionally out of TF).
|
||||
|
||||
set -uo pipefail # NOT -e — keep going if a single VM step fails.
|
||||
|
||||
# vmid:disk_slot:mbps_rd:mbps_wr (Linux VMs only — skipping 101 pfsense BSD, 300 Windows)
|
||||
TARGETS=(
|
||||
"102:scsi0:60:60" # devvm
|
||||
"103:sata0:40:40" # home-assistant
|
||||
"200:scsi0:100:60" # k8s-master (alloy storm origin — firmest clip)
|
||||
"201:scsi1:150:120" # k8s-node1 (GPU + many CSI disks; boots from scsi1)
|
||||
"202:scsi0:150:120" # k8s-node2
|
||||
"203:scsi0:150:120" # k8s-node3
|
||||
"204:scsi0:150:120" # k8s-node4
|
||||
"220:scsi0:40:40" # docker-registry
|
||||
)
|
||||
|
||||
#######################################
# Apply the I/O cap described by one TARGETS entry.
# Arguments: $1 - spec "vmid:disk_slot:mbps_rd:mbps_wr"
# Outputs:   progress lines on stdout
# Returns:   0 on success, skip or no-op; 1 if `qm set` fails
#######################################
apply_one() {
  local spec="$1"
  local vmid slot rd wr
  IFS=: read -r vmid slot rd wr <<<"$spec"

  # Skip non-existent VMs cleanly (e.g. node decommissioned, never rebuilt).
  if ! qm status "$vmid" >/dev/null 2>&1; then
    echo "vmid $vmid: not present on this host — skipping"
    return 0
  fi

  local current cleaned newvalue
  current=$(qm config "$vmid" | awk -v s="$slot:" '$1==s {sub(/^[^ ]+ /, ""); print; exit}')
  if [[ -z "$current" ]]; then
    echo "vmid $vmid: no $slot line in config — skipping"
    return 0
  fi

  # Compare the cap VALUES, not the whole option string: the config line may
  # list other keys after the caps (e.g. ",size=32G"), in which case a string
  # comparison against "<cleaned>,mbps_rd=..,mbps_wr=.." never matches and
  # `qm set` would be re-run on every timer tick.
  local cur_rd="" cur_wr=""
  local rd_re=',mbps_rd=([0-9.]+)(,|$)'
  local wr_re=',mbps_wr=([0-9.]+)(,|$)'
  if [[ "$current" =~ $rd_re ]]; then
    cur_rd="${BASH_REMATCH[1]}"
  fi
  if [[ "$current" =~ $wr_re ]]; then
    cur_wr="${BASH_REMATCH[1]}"
  fi

  # Skip the qm-set call entirely when state already matches — keeps
  # journal noise low under the hourly timer.
  if [[ "$cur_rd" == "$rd" && "$cur_wr" == "$wr" ]]; then
    echo "vmid $vmid: $slot already at mbps_rd=${rd},mbps_wr=${wr} — no-op"
    return 0
  fi

  # Strip any existing caps (integer or decimal) before appending ours.
  cleaned=$(sed -E 's/,mbps_rd=[0-9.]+//g; s/,mbps_wr=[0-9.]+//g' <<<"$current")
  newvalue="${cleaned},mbps_rd=${rd},mbps_wr=${wr}"

  echo "vmid $vmid: updating $slot"
  echo " before: $current"
  echo " after: $newvalue"
  if qm set "$vmid" "--$slot" "$newvalue"; then
    echo " ok"
  else
    echo " FAILED: qm set returned non-zero"
    return 1
  fi
}
|
||||
|
||||
# Walk every target. A failure is remembered but does not stop the loop,
# so one broken VM never prevents the others from being capped. The exit
# status is 1 if any target failed, 0 otherwise.
rc=0
for entry in "${TARGETS[@]}"; do
  if ! apply_one "$entry"; then
    rc=1
  fi
done

exit "$rc"
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
[Unit]
|
||||
Description=Re-apply per-VM I/O caps periodically + after PVE boot
|
||||
|
||||
[Timer]
|
||||
# After every PVE host reboot — caps survive in /etc/pve/qemu-server/<vmid>.conf
|
||||
# normally, but a config restore from backup can drop them (see 2026-05-26
|
||||
# incident where we restored 202.conf + 203.conf from /mnt/backup/pve-config/).
|
||||
OnBootSec=5min
|
||||
|
||||
# Hourly during normal operation — catches manual `qm set` drift or fresh
|
||||
# VM clones that haven't had caps applied yet.
|
||||
OnCalendar=hourly
|
||||
|
||||
Persistent=true
|
||||
RandomizedDelaySec=2min
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Enforce the inline-comment convention for ingress_factory auth tiers.
|
||||
|
||||
Every `auth = "app"` or `auth = "none"` line under a stack must have an
|
||||
immediately-preceding comment block containing `# auth = "<tier>":`
|
||||
that documents what gates the app (for "app") or why the endpoint is
|
||||
intentionally public (for "none").
|
||||
|
||||
This is the static guard for the anti-exposure rule documented in
|
||||
`infra/.claude/CLAUDE.md` "Auth" section. It's invoked by `scripts/tg`
|
||||
before every plan/apply/destroy/refresh, so it fires regardless of who
|
||||
or what is running terragrunt — local laptop, CI, headless agent.
|
||||
|
||||
Stack-scoped by design: only checks the .tf files under the stack
|
||||
being acted on. Other stacks' historical violations don't block work
|
||||
on the current stack; each stack documents itself the next time it's
|
||||
edited.
|
||||
|
||||
Usage:
|
||||
check-ingress-auth-comments.py <stack-path> # scan one stack
|
||||
check-ingress-auth-comments.py --all # scan every stack
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# An `auth = "app"` / `auth = "none"` assignment. A trailing inline comment
# (`#` or `//`) is tolerated so that such lines are still checked instead of
# slipping past the guard unnoticed.
AUTH_LINE = re.compile(r'^\s*auth\s*=\s*"(app|none)"\s*(?:(?:#|//).*)?$')
# A line that is entirely a `#` comment.
COMMENT_LINE = re.compile(r'^\s*#')
# The tier named inside a documenting comment.
COMMENT_TIER = re.compile(r'auth\s*=\s*"(app|none)"')


def scan_dir(path):
    """Return convention violations found in every `.tf` file under `path`.

    A violation is an `auth = "app"` / `auth = "none"` line that is not
    directly preceded by a contiguous block of `#` comment lines in which at
    least one line names the same tier.

    Args:
        path: Directory to walk recursively.

    Returns:
        A list of `(file_path, line_number, tier)` tuples (line numbers are
        1-based), ordered by directory, then file name, then line number.
    """
    violations = []
    for root, dirs, files in os.walk(path):
        # Sort in place so traversal (and therefore the report) is stable.
        dirs.sort()
        for f in sorted(files):
            if not f.endswith('.tf'):
                continue
            full = os.path.join(root, f)
            try:
                # errors='replace': a stray non-UTF-8 byte must not crash the
                # guard; the auth/comment patterns are plain ASCII anyway.
                with open(full, encoding='utf-8', errors='replace') as fh:
                    lines = fh.readlines()
            except OSError:
                # Unreadable file: nothing to check.
                continue
            for i, line in enumerate(lines):
                m = AUTH_LINE.match(line)
                if not m:
                    continue
                tier = m.group(1)
                # Walk backwards through contiguous comment lines.
                # Pass if ANY of them documents the matching tier.
                ok = False
                j = i - 1
                while j >= 0 and COMMENT_LINE.match(lines[j]):
                    cm = COMMENT_TIER.search(lines[j])
                    if cm and cm.group(1) == tier:
                        ok = True
                        break
                    j -= 1
                if not ok:
                    violations.append((full, i + 1, tier))
    return violations
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Scans either one stack directory (positional `path`) or `stacks/`
    relative to the current directory (`--all`).

    Exit status: returns normally when no violation is found, exits 1 after
    printing a report to stderr when violations exist, and exits 2 when the
    directory to scan does not exist.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # no description instead of crashing.
    doc_lines = (__doc__ or "").splitlines()
    ap = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    g = ap.add_mutually_exclusive_group(required=True)
    g.add_argument('path', nargs='?', help='Stack directory to scan')
    g.add_argument('--all', action='store_true', help='Scan every stack under stacks/')
    args = ap.parse_args()

    if args.all:
        # os.walk() on a missing directory yields nothing, which would make
        # the guard pass silently when run from the wrong working directory.
        if not os.path.isdir('stacks'):
            print(
                "ERROR: stacks is not a directory (run --all from the repository root)",
                file=sys.stderr,
            )
            sys.exit(2)
        scan_paths = ['stacks']
    else:
        if not os.path.isdir(args.path):
            print(f"ERROR: {args.path} is not a directory", file=sys.stderr)
            sys.exit(2)
        scan_paths = [args.path]

    violations = []
    for p in scan_paths:
        violations.extend(scan_dir(p))

    if not violations:
        return

    print(
        "\n"
        "==============================================================\n"
        "ingress_factory auth-comment convention violated\n"
        "==============================================================\n"
        "\n"
        "Every `auth = \"app\"` or `auth = \"none\"` line must have a\n"
        "preceding comment line documenting what gates the app (for\n"
        "\"app\") or why the endpoint is intentionally public (for\n"
        "\"none\"). This guard prevents accidentally exposing private\n"
        "services. See infra/.claude/CLAUDE.md Auth section.\n"
        "\n"
        "Add a comment line directly above the auth line:\n"
        "\n"
        " # auth = \"app\": <what gates the app, e.g. NextAuth + OAuth>\n"
        " auth = \"app\"\n"
        "\n"
        "or:\n"
        "\n"
        " # auth = \"none\": <why public, e.g. webhook receiver, CalDAV>\n"
        " auth = \"none\"\n"
        "\n"
        "Violations:",
        file=sys.stderr,
    )
    for path, line_no, tier in violations:
        print(
            f" {path}:{line_no}: auth = \"{tier}\" missing preceding "
            f"`# auth = \"{tier}\":` comment",
            file=sys.stderr,
        )
    print(file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,277 +0,0 @@
|
|||
import asyncio
|
||||
import click
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Union, Optional
|
||||
from kubernetes_asyncio import client, config
|
||||
from kubernetes_asyncio.client.api_client import ApiClient
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def wait_for_healthy(
    api_instance: client.AppsV1Api,
    resource_type: str,
    namespace: str,
    name: str,
    target_replicas: int,
    timeout: int = 300,
) -> None:
    """Poll a workload every 5 seconds until it reports the target replica count.

    For a "deployment" (case-insensitive) both ready and updated replicas must
    equal `target_replicas`; any other `resource_type` is read as a
    StatefulSet, for which only ready replicas are compared.

    Returns None in every case: on success an info line is logged, on timeout
    an error line is logged and the function simply returns. Errors raised by
    a status read are logged at debug level and the poll is retried.
    """
    # Monotonic clock: the elapsed-time check must not be affected by
    # wall-clock adjustments (NTP step, manual date change).
    start_time = time.monotonic()
    logger.info(
        f"Waiting for {resource_type} {name} to reach {target_replicas} replicas..."
    )

    while True:
        if time.monotonic() - start_time > timeout:
            logger.error(f"❌ Timeout reached for {resource_type} {name}")
            return

        try:
            if resource_type.lower() == "deployment":
                res = await api_instance.read_namespaced_deployment_status(
                    name, namespace
                )
                # The API reports None rather than 0 when no replica is ready.
                ready = res.status.ready_replicas or 0
                updated = res.status.updated_replicas or 0
                if ready == target_replicas and updated == target_replicas:
                    break
            else:  # StatefulSet
                res = await api_instance.read_namespaced_stateful_set_status(
                    name, namespace
                )
                ready = res.status.ready_replicas or 0
                if ready == target_replicas:
                    break

        except Exception as e:
            logger.debug(f"Retrying status check for {name}: {e}")

        await asyncio.sleep(5)

    logger.info(f"✅ {resource_type} {name} is now healthy.")
|
||||
|
||||
|
||||
async def wait_for_zero(
    api: client.AppsV1Api, kind: str, ns: str, name: str, timeout: int
) -> tuple[str, str]:
    """Poll a workload every 3 seconds until it has no ready replicas.

    `kind` "deployment" (case-insensitive) is read as a Deployment, anything
    else as a StatefulSet.

    Returns `(ns, name)` in every case so the caller can tell which wait
    finished: when ready replicas reach 0, when the API answers 404 (resource
    no longer exists), or when `timeout` seconds elapse (an error is logged).
    """
    loop = asyncio.get_running_loop()
    start_time = loop.time()
    while (loop.time() - start_time) < timeout:
        try:
            res = await (
                api.read_namespaced_deployment_status(name, ns)
                if kind.lower() == "deployment"
                else api.read_namespaced_stateful_set_status(name, ns)
            )
            if (res.status.ready_replicas or 0) == 0:
                return ns, name
        except Exception as e:
            # Only "not found" means the workload is gone. Any other failure
            # (network blip, API server restart) says nothing about the pods,
            # so keep polling instead of releasing the next tier early.
            # NOTE(review): relies on the client's exception exposing the
            # HTTP code as `.status` — confirm against kubernetes_asyncio.
            if getattr(e, "status", None) == 404:
                return ns, name
            logger.warning(f"Retrying status check for {kind} {ns}/{name}: {e}")
        await asyncio.sleep(3)
    logger.error(f"Timeout: {kind} {ns}/{name} still has running pods.")
    return ns, name
|
||||
|
||||
|
||||
async def scale_resource(
    api_instance: client.AppsV1Api,
    resource_type: str,
    namespace: str,
    name: str,
    replicas: int,
) -> None:
    """Patch the scale subresource of a workload to `replicas`.

    "deployment" (case-insensitive) patches a Deployment; every other
    `resource_type` patches a StatefulSet. Failures are logged, not raised.
    """
    try:
        patch_scale = (
            api_instance.patch_namespaced_deployment_scale
            if resource_type.lower() == "deployment"
            else api_instance.patch_namespaced_stateful_set_scale
        )
        await patch_scale(name, namespace, {"spec": {"replicas": replicas}})
    except Exception as e:
        logger.error(f"Failed to scale {resource_type} {name}: {e}")
|
||||
|
||||
|
||||
async def run_stop_tier(
    api_v1: client.AppsV1Api, label: str, output_file: str, timeout: int
) -> None:
    """Shut down one tier: record replica counts, scale to 0, wait for drain.

    Deployments and StatefulSets matching `label` are listed across all
    namespaces except kube-system, kube-public and kube-node-lease. For each
    one a "Kind Namespace Name Replicas" line is appended to `output_file`
    before it is scaled to zero.
    """
    skip_namespaces = ("kube-system", "kube-public", "kube-node-lease")
    listers = (
        ("Deployment", api_v1.list_deployment_for_all_namespaces),
        ("StatefulSet", api_v1.list_stateful_set_for_all_namespaces),
    )

    # Discover the workloads that belong to this tier.
    selected = []
    for kind, lister in listers:
        listing = await lister(label_selector=label)
        for item in listing.items:
            if item.metadata.namespace in skip_namespaces:
                continue
            selected.append((kind, item))

    if not selected:
        logger.warning(f"No resources found for label: {label}")
        return

    pending: set[tuple[str, str]] = set()
    waiters = []

    # Append mode: earlier tiers have already written their lines.
    with open(output_file, "a") as state:
        for kind, item in selected:
            ns = item.metadata.namespace
            name = item.metadata.name
            replicas = item.spec.replicas or 0
            state.write(f"{kind} {ns} {name} {replicas}\n")
            pending.add((ns, name))

            await scale_resource(api_v1, kind, ns, name, 0)
            waiters.append(wait_for_zero(api_v1, kind, ns, name, timeout))

    # The whole tier must drain before the caller moves to the next one.
    logger.info(f"Tier [{label}]: Waiting for {len(pending)} resources to stop...")
    for finished in asyncio.as_completed(waiters):
        done_ns, done_name = await finished
        pending.discard((done_ns, done_name))
        if not pending:
            continue
        remaining_ns = sorted({ns for ns, _ in pending})
        logger.info(
            f"[{label}] Pending: {len(pending)} | Namespaces: {', '.join(remaining_ns)}"
        )

    logger.info(f"✅ Tier [{label}] successfully shut down.")
|
||||
|
||||
|
||||
@click.group()
|
||||
def cli():
|
||||
pass
|
||||
|
||||
|
||||
@cli.command()
@click.argument("labels", nargs=-1, required=True)
@click.option("--output", "-o", default="resources.txt", help="Output state file")
@click.option("--timeout", "-t", default=3600)
def stop(labels: List[str], output: str, timeout: int):
    """Stop tiers sequentially. Usage: stop 'app=web' 'app=db'"""

    # Tiers are processed strictly in the order given on the command line.
    async def _shutdown_all():
        await config.load_kube_config()
        # Start from an empty state file; each tier then appends to it.
        with open(output, "w"):
            pass

        async with ApiClient() as api_client:
            apps_api = client.AppsV1Api(api_client)
            for tier_label in labels:
                logger.info(f"🚀 Processing Shutdown Tier: {tier_label}")
                await run_stop_tier(apps_api, tier_label, output, timeout)
            logger.info("🏁 Sequence complete. Cluster is gracefully stopped.")

    asyncio.run(_shutdown_all())
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("labels", nargs=-1, required=True)
|
||||
@click.option("--file", "-f", default="resources.txt")
|
||||
@click.option("--timeout", "-t", default=3600, help="Seconds to wait per resource")
|
||||
def start(labels: List[str], file: str, timeout: int):
|
||||
asyncio.run(run_start_sequence(labels, file, timeout))
|
||||
|
||||
|
||||
async def run_start_sequence(labels: List[str], file_path: str, timeout: int) -> None:
    """Restore tiers, in the order given, to the replica counts in the snapshot.

    The snapshot file holds one "Kind Namespace Name Replicas" line per
    workload. For each label, only workloads that both carry the label in the
    cluster and appear in the snapshot are scaled back up.

    A missing snapshot file is logged and ends the run. Lines that do not
    have exactly four fields, or whose replica count is not an integer, are
    logged and skipped so one damaged line cannot abort the restore.
    """
    await config.load_kube_config()

    async with ApiClient() as api_client:
        apps_v1 = client.AppsV1Api(api_client)

        # 1. Load the entire snapshot into memory for filtering
        try:
            with open(file_path, "r") as f:
                # Format: Kind Namespace Name Replicas
                raw_lines = [line.strip().split() for line in f if line.strip()]
        except FileNotFoundError:
            logger.error(f"Snapshot file {file_path} not found.")
            return

        snapshot_lines = []
        for fields in raw_lines:
            try:
                kind, ns, name, reps = fields
                snapshot_lines.append((kind, ns, name, int(reps)))
            except ValueError:
                # Wrong field count or non-numeric replica count.
                logger.warning(
                    f"Skipping malformed snapshot line: {' '.join(fields)}"
                )

        # 2. Iterate through labels in the order provided
        for label in labels:
            logger.info(f"🚀 Starting Tier: {label}")

            # Find resources in this tier by querying K8s for the label
            # then matching against our snapshot file data
            tier_resources = await get_resources_by_label(apps_v1, label)

            # Cross-reference: Only start things that are in BOTH the K8s label query AND our file
            # This ensures we restore them to the CORRECT previous replica count
            tier_keys = {(r["ns"], r["name"]) for r in tier_resources}
            to_restore = [
                (kind, ns, name, reps)
                for kind, ns, name, reps in snapshot_lines
                if (ns, name) in tier_keys
            ]

            if not to_restore:
                logger.warning(f"No resources found in snapshot for tier: {label}")
                continue

            # 3. Scale and Wait for this specific tier
            await process_start_tier(apps_v1, to_restore, timeout, label)

        logger.info("🏁 All tiers started successfully.")
|
||||
|
||||
|
||||
async def get_resources_by_label(api: client.AppsV1Api, label: str) -> List[dict]:
    """List Deployments, then StatefulSets, in all namespaces matching `label`.

    Returns one `{"ns": ..., "name": ...}` dict per workload found.
    """
    listers = (
        api.list_deployment_for_all_namespaces,
        api.list_stateful_set_for_all_namespaces,
    )
    matches = []
    for lister in listers:
        listing = await lister(label_selector=label)
        matches.extend(
            {"ns": obj.metadata.namespace, "name": obj.metadata.name}
            for obj in listing.items
        )
    return matches
|
||||
|
||||
|
||||
async def process_start_tier(
    api: client.AppsV1Api, resources: list, timeout: int, label: str
):
    """Scale every workload of a tier back up, then wait for each to be healthy.

    `resources` is a list of `(kind, namespace, name, replicas)` tuples. All
    scale requests are issued concurrently; progress is logged as the
    individual health waits finish.
    """

    # Health wait that reports which workload it belonged to.
    async def _wait_and_tag(kind, ns, name, target, t_out):
        await wait_for_healthy(api, kind, ns, name, target, t_out)
        return (ns, name)

    pending = {(ns, name) for _, ns, name, _ in resources}
    scalers = [
        scale_resource(api, kind, ns, name, reps)
        for kind, ns, name, reps in resources
    ]
    waiters = [
        _wait_and_tag(kind, ns, name, reps, timeout)
        for kind, ns, name, reps in resources
    ]

    # Fire all scale requests for this tier together.
    await asyncio.gather(*scalers)

    # Report remaining work each time one workload becomes healthy.
    for finished in asyncio.as_completed(waiters):
        done_ns, done_name = await finished
        pending.discard((done_ns, done_name))

        if not pending:
            continue
        remaining_ns = sorted({ns for ns, _ in pending})
        logger.info(
            f"[{label}] Pending Health: {len(pending)} | Namespaces: {', '.join(remaining_ns)}"
        )

    logger.info(f"✅ Tier [{label}] is healthy.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/daily-backup
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=daily-backup
|
||||
# 4h budget — the snapshot mount + LUKS decrypt + rsync + sqlite scan loop
|
||||
# scales with the number of PVCs (118 today). Hit the 1h ceiling around week
|
||||
# 18 of 2026 and silently SIGTERM'd for 10 days. Bumped to 4h with margin.
|
||||
TimeoutStartSec=14400
|
||||
|
|
@ -1,424 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# daily-backup — 3-2-1 backup: PVC file copy + SQLite + pfsense + PVE config to sda
|
||||
# Deploy to PVE host at /usr/local/bin/daily-backup
|
||||
# Schedule: Daily 05:00 via systemd timer
|
||||
set -euo pipefail
|
||||
|
||||
# --- Configuration ---
|
||||
BACKUP_ROOT="/mnt/backup"
|
||||
PVC_MOUNT="/tmp/pvc-mount"
|
||||
PUSHGATEWAY="${DAILY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}"
|
||||
PUSHGATEWAY_JOB="daily-backup"
|
||||
LOCKFILE="/run/daily-backup.lock"
|
||||
MANIFEST="${BACKUP_ROOT}/.changed-files"
|
||||
MAPPING_CACHE="${BACKUP_ROOT}/.lv-pvc-mapping.json"
|
||||
KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
|
||||
export KUBECONFIG
|
||||
|
||||
# --- Logging ---
|
||||
# Timestamped logging helpers: log writes to stdout, warn and die to stderr.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
warn() { log "WARN: $*" >&2; }

# Log a fatal message, push a failure metric when possible, and exit 1.
# die can run before push_metrics has been defined (the lock check sits
# above that definition). Under `set -e` the resulting "command not found"
# would end the script with status 127 before `exit 1` is reached, so the
# metric push is skipped until the function exists.
die() {
  log "FATAL: $*" >&2
  if declare -F push_metrics >/dev/null; then
    push_metrics 1 0
  fi
  exit 1
}
|
||||
|
||||
# --- Manifest append helper ---
|
||||
# Both daily-backup and nfs-mirror append to /mnt/backup/.changed-files.
|
||||
# If their runs overlap (e.g. nfs-mirror Mon 04:11 still running when
|
||||
# daily-backup starts Mon 05:00) the appends can interleave mid-line.
|
||||
# `flock -x` on a sibling lock file makes appends atomic across processes.
|
||||
MANIFEST_LOCK="${MANIFEST}.lock"
|
||||
# Append stdin to the manifest while holding an exclusive lock on
# ${MANIFEST_LOCK}. The lock is tied to file descriptor 200, which is open
# only for the lifetime of the subshell, so it is released when the
# subshell ends.
manifest_append() {
  (
    flock --exclusive 200
    cat >>"${MANIFEST}"
  ) 200>"${MANIFEST_LOCK}"
}
|
||||
|
||||
# Cap manifest size to prevent unbounded growth (e.g. Synology unreachable
|
||||
# for many days, every daily-backup keeps appending). At >500k lines,
|
||||
# `--files-from=` rsync becomes pathological — fall back to a full Step 1
|
||||
# sync by signalling offsite-sync to ignore the manifest this round.
|
||||
MANIFEST_MAX_LINES=500000
|
||||
# Flag the next offsite sync as "full" when the manifest has grown past
# MANIFEST_MAX_LINES. Does nothing when the manifest does not exist.
check_manifest_size() {
  if [ ! -f "${MANIFEST}" ]; then
    return 0
  fi

  local line_count
  line_count=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)

  # Still within budget: nothing to flag.
  if ! [ "${line_count:-0}" -gt "${MANIFEST_MAX_LINES}" ]; then
    return 0
  fi

  warn "manifest at ${line_count} lines (>${MANIFEST_MAX_LINES}) — flagging next offsite-sync as full"
  touch "${BACKUP_ROOT}/.force-full-sync"
}
|
||||
|
||||
# --- Locking ---
|
||||
# Track whether we got SIGTERM/SIGINT so cleanup can push a non-success metric.
|
||||
# Without this, a systemd timeout-kill leaves WeeklyBackupFailing alerts blind:
|
||||
# the script never reaches the success push at the end and the metric goes stale
|
||||
# silently. (Root cause of 2026-04-30 → 2026-05-09 silent-failure run.)
|
||||
KILLED=""
|
||||
|
||||
# EXIT-trap handler: release everything this run holds.
# Globals:  LOCKFILE, PVC_MOUNT, KILLED, TOTAL_BYTES (all read)
# The trap is installed before the lock is taken, so this also runs when
# lock acquisition FAILS. In that case the lockfile, the snapshot mount and
# the LUKS mappers belong to the instance that is still running and must
# not be touched — hence the ownership check below.
cleanup() {
  local lock_owner="" m
  if [ -f "${LOCKFILE}" ]; then
    lock_owner=$(cat "${LOCKFILE}" 2>/dev/null || true)
  fi

  if [ "${lock_owner}" = "$$" ]; then
    # Recursively unmount /tmp/pvc-mount: previous SIGTERM'd runs left snapshot
    # mounts stacked here, which made every subsequent run start with an
    # already-occupied mountpoint and time out before reaching its own umount.
    while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
      umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
    done
    # Close any LUKS mappers we opened (or that were left over from a prior crash).
    for m in /dev/mapper/pvc-snap-*; do
      [ -e "$m" ] || continue
      cryptsetup close "$(basename "$m")" 2>/dev/null || true
    done
    rm -f "${LOCKFILE}"
  fi

  if [ -n "${KILLED}" ]; then
    # status=2 = aborted (matches lvm-pvc-snapshot's convention)
    push_metrics 2 "${TOTAL_BYTES:-0}"
  fi
}
|
||||
trap cleanup EXIT
|
||||
trap 'KILLED=1; exit 143' TERM INT
|
||||
|
||||
if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
|
||||
die "Another instance is running (PID $(cat "${LOCKFILE}" 2>/dev/null || echo unknown))"
|
||||
fi
|
||||
|
||||
# Belt-and-braces: if a previous run was SIGTERM'd before its trap completed,
|
||||
# /tmp/pvc-mount may have stacked mounts and stale LUKS mappers. The lock above
|
||||
# guarantees we're alone, so it's safe to clean these up now.
|
||||
while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
|
||||
umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
|
||||
done
|
||||
for m in /dev/mapper/pvc-snap-*; do
|
||||
[ -e "$m" ] || continue
|
||||
cryptsetup close "$(basename "$m")" 2>/dev/null || true
|
||||
done
|
||||
|
||||
# --- Metrics ---
|
||||
# Push run timestamp, status and byte count to the Pushgateway.
# Arguments: $1 - status (default 0), $2 - bytes synced (default 0)
# Best-effort: curl failures are ignored so metrics never fail the backup.
push_metrics() {
  local status="${1:-0}" bytes="${2:-0}"
  local url="${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}"

  printf 'daily_backup_last_run_timestamp %s\ndaily_backup_last_status %s\ndaily_backup_bytes_synced %s\n' \
    "$(date +%s)" "${status}" "${bytes}" \
    | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${url}" 2>/dev/null || true
}
|
||||
|
||||
# --- PVC name resolution ---
|
||||
# Print "namespace/claim" for every PV in ${MAPPING_CACHE} whose CSI volume
# handle ends with the given LV name. jq errors are suppressed, so a missing
# or unreadable cache prints nothing.
# Arguments: $1 - LV name
resolve_pvc_name() {
  local lv_name="$1"
  local filter='
.items[] |
select(.spec.csi.volumeHandle // "" | endswith($lv)) |
"\(.spec.claimRef.namespace)/\(.spec.claimRef.name)"
'
  jq -r --arg lv "${lv_name}" "${filter}" "${MAPPING_CACHE}" 2>/dev/null
}
|
||||
|
||||
# --- NFS Export Health Check ---
|
||||
# Verify NFS exports are healthy before starting backup.
|
||||
# Detects: missing /etc/exports, incorrect fsid=0 flag, unexpected exports.
|
||||
# Added 2026-04-14 [PM-2026-04-14]: backup script accessed NFS causing stale handle
|
||||
# propagation during the fsid=0 outage. Early check prevents cascading failures.
|
||||
# Pre-flight check of the NFS server on this host.
# Returns 1 (after logging why) when /etc/exports is missing, when /srv/nfs
# is exported with fsid=0, when nfs-server is not active, or when no
# /srv/nfs export is listed by `exportfs -s`. Returns 0 otherwise.
check_nfs_exports() {
  local exports_file="/etc/exports"
  local active_exports

  [ -f "${exports_file}" ] || {
    log "WARN: ${exports_file} does not exist — NFS exports may be unconfigured"
    return 1
  }

  # Check for dangerous fsid=0 on /srv/nfs (breaks NFSv4 subdirectory path resolution)
  if grep -E '^/srv/nfs[[:space:]].*fsid=0' "${exports_file}" 2>/dev/null; then
    log "ERROR: /etc/exports contains fsid=0 on /srv/nfs — this will break all k8s NFS mounts!"
    log "ERROR: Remove fsid=0 and run: exportfs -ra && systemctl restart nfs-server"
    return 1
  fi

  # Verify NFS server is active
  systemctl is-active --quiet nfs-server 2>/dev/null || {
    log "WARN: nfs-server is not running — NFS mounts will fail"
    return 1
  }

  # Verify exports are actually loaded (exportfs -s lists active exports)
  active_exports=$(exportfs -s 2>/dev/null | grep -c '/srv/nfs' || true)
  if [ "${active_exports:-0}" -eq 0 ]; then
    log "WARN: No /srv/nfs exports active in kernel — run: exportfs -ra"
    return 1
  fi

  log "NFS export health check passed (${active_exports} /srv/nfs export(s) active)"
  return 0
}
|
||||
|
||||
# --- Main ---
|
||||
log "=== daily-backup starting ==="

if ! mountpoint -q "${BACKUP_ROOT}"; then
  die "${BACKUP_ROOT} is not mounted"
fi

# Initialise run state BEFORE the health check: the check records its
# failure in STATUS, and initialising afterwards would reset it to 0 and
# hide the failure from the final metric.
STATUS=0
TOTAL_BYTES=0

# NFS export health check — warn but don't abort (backup can proceed with block storage PVCs)
check_nfs_exports || {
  log "WARN: NFS export health check failed — NFS-backed PVC backups may fail"
  STATUS=1
}
|
||||
|
||||
# DO NOT truncate the manifest here.
|
||||
#
|
||||
# Truncation lives in offsite-sync-backup (only on successful sync). If
|
||||
# offsite-sync failed yesterday — Synology unreachable, transient error —
|
||||
# the manifest holds yesterday's unconsumed file list. Truncating at the
|
||||
# start of today's daily-backup would silently lose those entries; they'd
|
||||
# only reach Synology on the next monthly full sync.
|
||||
#
|
||||
# Appending duplicates across multiple runs is harmless — rsync transfers
|
||||
# each file once. If the manifest grows pathologically (Synology down for
|
||||
# weeks), the OffsiteBackupSync{Stale,Failing} alerts catch it.
|
||||
|
||||
# NFS data is synced to Synology via two paths: nfs-mirror → sda → Step 1
|
||||
# for the curated subset, and inotify + Step 2 for the sda-bypass list.
|
||||
|
||||
# ============================================================
|
||||
# STEP 1: PVC file-level copy from LVM thin snapshots
|
||||
# ============================================================
|
||||
log "--- Step 1: PVC file copy from snapshots ---"
|
||||
WEEK=$(date +%Y-%W)
|
||||
PREV=$(ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | tail -1 || true)
|
||||
|
||||
# Cache LV→PVC mapping (fallback if kubectl is down next time)
|
||||
if kubectl get pv -o json > /tmp/pv-list.json 2>/dev/null; then
|
||||
cp /tmp/pv-list.json "${MAPPING_CACHE}"
|
||||
rm -f /tmp/pv-list.json
|
||||
fi
|
||||
|
||||
if [ ! -f "${MAPPING_CACHE}" ]; then
|
||||
warn "No PV mapping cache and kubectl unavailable — skipping PVC copy"
|
||||
STATUS=1
|
||||
else
|
||||
mkdir -p "${PVC_MOUNT}"
|
||||
PVC_COUNT=0
|
||||
PVC_FAIL=0
|
||||
|
||||
# Iterate origin LVs (not snapshots), find latest snapshot for each
|
||||
for origin_lv in $(lvs --noheadings -o lv_name pve 2>/dev/null | grep 'vm-9999-pvc-' | grep -v '_snap_' | tr -d ' '); do
|
||||
# Find latest snapshot for this origin
|
||||
snap=$(lvs --noheadings -o lv_name pve 2>/dev/null | tr -d ' ' | grep "^${origin_lv}_snap_" | sort | tail -1 || true)
|
||||
[ -z "${snap}" ] && continue
|
||||
|
||||
# Resolve human-readable name
|
||||
ns_pvc=$(resolve_pvc_name "${origin_lv}")
|
||||
if [ -z "${ns_pvc}" ] || [ "${ns_pvc}" = "null/null" ]; then
|
||||
warn "Cannot resolve PVC name for ${origin_lv}, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Skip-list: PVCs we deliberately don't keep offsite copies of.
|
||||
# nextcloud-data-proxmox — orphaned pre-encryption PV (Released,
|
||||
# Retain). Nextcloud moved to nextcloud-data-encrypted on 2026-04-13;
|
||||
# this old unencrypted PV lingers (Retain) and was still being backed
|
||||
# up weekly, filling the offsite Synology. Stop copying it (2026-06-01).
|
||||
case "${ns_pvc}" in
|
||||
nextcloud/nextcloud-data-proxmox)
|
||||
log " skip ${ns_pvc} (orphaned pre-encryption PVC)"
|
||||
continue ;;
|
||||
esac
|
||||
|
||||
# Detect LUKS-encrypted volumes and set up mount device
|
||||
LUKS_NAME=""
|
||||
MOUNT_DEV="/dev/pve/${snap}"
|
||||
MOUNT_OPTS="ro"
|
||||
if blkid -o value -s TYPE "/dev/pve/${snap}" 2>/dev/null | grep -q 'crypto_LUKS'; then
|
||||
# Clean up any stale LUKS mapping for this snapshot from a previous crashed run
|
||||
STALE_LUKS="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
|
||||
if [ -e "/dev/mapper/${STALE_LUKS}" ]; then
|
||||
umount "/dev/mapper/${STALE_LUKS}" 2>/dev/null || true
|
||||
cryptsetup close "${STALE_LUKS}" 2>/dev/null || true
|
||||
fi
|
||||
LUKS_KEY="/root/.luks-backup-key"
|
||||
LUKS_NAME="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
|
||||
if [ -f "${LUKS_KEY}" ] && cryptsetup open --type luks --key-file "${LUKS_KEY}" --readonly "/dev/pve/${snap}" "${LUKS_NAME}" 2>&1; then
|
||||
MOUNT_DEV="/dev/mapper/${LUKS_NAME}"
|
||||
MOUNT_OPTS="ro,noload" # noload skips ext4 journal replay on read-only LUKS
|
||||
log " LUKS: decrypted ${snap} → ${LUKS_NAME}"
|
||||
else
|
||||
warn "Failed to decrypt LUKS snapshot ${snap}"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
# Mount snapshot read-only, rsync files
|
||||
if timeout 30 mount -o "${MOUNT_OPTS}" "${MOUNT_DEV}" "${PVC_MOUNT}" 2>&1; then
|
||||
dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
|
||||
mkdir -p "${dst}"
|
||||
rsync_rc=0
|
||||
# Per-PVC rsync timeout (30 min). Without this, a single hung
|
||||
# PVC blocks the entire backup until systemd's TimeoutStartSec
|
||||
# kills the script (4h ceiling), leaving every later PVC
|
||||
# unbacked and silently triggering WeeklyBackupFailing. Picked
|
||||
# 30 min as well above the largest PVC's normal copy time
|
||||
# (immich-postgres ~10 GiB, ~3 min on local ext4) and well
|
||||
# below the unit-level budget so we still have headroom to
|
||||
# finish the rest.
|
||||
timeout 1800 rsync -a --delete \
|
||||
${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
|
||||
"${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
|
||||
if [ "$rsync_rc" -eq 0 ]; then
|
||||
PVC_COUNT=$((PVC_COUNT + 1))
|
||||
elif [ "$rsync_rc" -eq 23 ] && [ -n "${LUKS_NAME}" ]; then
|
||||
# rsync 23 = partial transfer; expected for LUKS noload mounts
|
||||
# (in-flight writes have corrupt metadata from skipped journal replay)
|
||||
PVC_COUNT=$((PVC_COUNT + 1))
|
||||
log " partial rsync (LUKS noload) for ${ns_pvc} — OK"
|
||||
elif [ "$rsync_rc" -eq 124 ]; then
|
||||
# `timeout` exit 124 = wall-clock killed the rsync. Track
|
||||
# separately so the next run still produces a metric and
|
||||
# doesn't pretend nothing happened.
|
||||
warn "rsync timed out for ${ns_pvc} after 30 min — moving on"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
else
|
||||
warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
fi
|
||||
|
||||
# Auto-detect and safely backup SQLite databases from snapshot
|
||||
if command -v sqlite3 &>/dev/null; then
|
||||
find "${PVC_MOUNT}" -maxdepth 3 \
|
||||
\( -name '*.db' -o -name '*.sqlite' -o -name '*.sqlite3' \) \
|
||||
-size +0 -type f 2>/dev/null | while read -r dbfile; do
|
||||
# Verify it's actually SQLite (magic number check)
|
||||
if head -c 15 "$dbfile" 2>/dev/null | grep -q 'SQLite format 3'; then
|
||||
relpath="${dbfile#${PVC_MOUNT}/}"
|
||||
dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
|
||||
mkdir -p "$(dirname "${dest_file}")"
|
||||
# 5-min sqlite timeout — same hang-prevention idea
|
||||
# as rsync above. A corrupted SQLite or one held
|
||||
# open by a writer in the snapshot can otherwise
|
||||
# block .backup indefinitely.
|
||||
if timeout 300 sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
|
||||
log " SQLite: ${ns_pvc}/${relpath}"
|
||||
else
|
||||
cp "${dbfile}" "${dest_file}" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || true
|
||||
else
|
||||
warn "Failed to mount snapshot ${snap}"
|
||||
PVC_FAIL=$((PVC_FAIL + 1))
|
||||
fi
|
||||
|
||||
# Close LUKS device if we opened one
|
||||
if [ -n "${LUKS_NAME}" ]; then
|
||||
cryptsetup close "${LUKS_NAME}" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
log " PVC copy: ${PVC_COUNT} OK, ${PVC_FAIL} failed"
|
||||
[ "${PVC_FAIL}" -gt 0 ] && STATUS=1
|
||||
|
||||
# Add PVC files to manifest (locked append)
|
||||
if [ -d "${BACKUP_ROOT}/pvc-data/${WEEK}" ]; then
|
||||
find "${BACKUP_ROOT}/pvc-data/${WEEK}" -type f 2>/dev/null | \
|
||||
sed "s|^${BACKUP_ROOT}/||" | manifest_append
|
||||
fi
|
||||
|
||||
# Prune old weekly versions (keep 4)
|
||||
ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
|
||||
ls -1d "${BACKUP_ROOT}/sqlite-backup"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
|
||||
|
||||
PVC_BYTES=$(du -sb "${BACKUP_ROOT}/pvc-data/${WEEK}" 2>/dev/null | cut -f1 || true)
|
||||
TOTAL_BYTES=$((TOTAL_BYTES + ${PVC_BYTES:-0}))
|
||||
fi
|
||||
|
||||
# ============================================================
|
||||
# STEP 3: pfsense backup (config.xml + full tar)
|
||||
# ============================================================
|
||||
log "--- Step 3: pfsense backup ---"
|
||||
PFSENSE_DEST="${BACKUP_ROOT}/pfsense"
|
||||
DATE=$(date +%Y%m%d)
|
||||
PFSENSE_STATUS=0
|
||||
mkdir -p "${PFSENSE_DEST}"
|
||||
|
||||
# pfsense backup body. A cheap SSH probe runs first so an unreachable firewall
# is reported once ("Cannot SSH") instead of as separate scp/tar failures.
# Sets STATUS and PFSENSE_STATUS to 1 on any failure; PFSENSE_STATUS feeds the
# pfsense-backup metrics pushed right after this block.
if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/dev/null; then
  # config.xml — primary restore artifact.
  # ConnectTimeout only bounds the connection setup; the outer `timeout` bounds
  # the whole copy so a stalled session cannot hang the backup run (same
  # hang-prevention idea as the per-PVC rsync timeout).
  if timeout 120 scp -o ConnectTimeout=10 root@10.0.20.1:/cf/conf/config.xml "${PFSENSE_DEST}/config-${DATE}.xml" 2>/dev/null; then
    log " OK: config.xml"
    echo "pfsense/config-${DATE}.xml" | manifest_append
  else
    warn "Failed to copy pfsense config.xml"
    STATUS=1
    PFSENSE_STATUS=1
  fi

  # Full filesystem tar — Sundays only (weekly).
  # config.xml is the primary restore artifact and runs daily above; the
  # full filesystem tar is for forensic / package-state recovery only and is
  # rarely needed. Re-tarring 100M+ daily writes ~3G/month to sda + Synology
  # for unchanged content. Keep one fresh tarball per week instead.
  if [ "$(date +%u)" = "7" ]; then
    PFSENSE_TAR="${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz"
    # 30 min wall-clock cap on the streamed tar, mirroring the rsync budget.
    if timeout 1800 ssh -o ConnectTimeout=10 root@10.0.20.1 \
        "tar czf - --exclude=/dev --exclude=/proc --exclude=/tmp --exclude=/var/run /" \
        > "${PFSENSE_TAR}" 2>/dev/null; then
      log " OK: weekly full tar ($(du -sh "${PFSENSE_TAR}" | cut -f1))"
      echo "pfsense/pfsense-full-${DATE}.tar.gz" | manifest_append
    else
      warn "Failed to tar pfsense filesystem"
      # The redirect creates the file even when ssh never produced a byte.
      # Drop an empty artifact so the keep-4 retention below does not evict a
      # good older tarball in favour of a zero-byte one. Non-empty partial
      # output is left in place, as before.
      [ -s "${PFSENSE_TAR}" ] || rm -f -- "${PFSENSE_TAR}"
      STATUS=1
      PFSENSE_STATUS=1
    fi
  else
    log " skip weekly full tar (only runs Sundays)"
  fi

  # Retention: keep the 4 newest copies of each artifact
  ls -t "${PFSENSE_DEST}"/config-*.xml 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
  ls -t "${PFSENSE_DEST}"/pfsense-full-*.tar.gz 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
else
  warn "Cannot SSH to pfsense (10.0.20.1) — skipping"
  STATUS=1
  PFSENSE_STATUS=1
fi
|
||||
|
||||
# Push pfsense-backup metrics in BOTH success and failure paths so
|
||||
# PfsenseBackupStale + PfsenseBackupFailing alerts can fire instead of going
|
||||
# silent when ssh-to-pfsense is broken.
|
||||
{
|
||||
echo "backup_last_run_timestamp $(date +%s)"
|
||||
echo "backup_last_status ${PFSENSE_STATUS}"
|
||||
[ "${PFSENSE_STATUS}" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
|
||||
} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
|
||||
"${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: PVE host config backup
|
||||
# ============================================================
|
||||
log "--- Step 4: PVE host config ---"
|
||||
mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
|
||||
timeout 300 rsync -a --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
|
||||
for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
|
||||
[ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
|
||||
done
|
||||
find "${BACKUP_ROOT}/pve-config" -type f 2>/dev/null | sed "s|^${BACKUP_ROOT}/||" | manifest_append
|
||||
log " OK: PVE config"
|
||||
|
||||
check_manifest_size
|
||||
|
||||
# ============================================================
|
||||
# STEP 5: Prune LVM snapshots older than 7 days
|
||||
# ============================================================
|
||||
log "--- Step 5: Snapshot pruning (7-day retention) ---"
|
||||
/usr/local/bin/lvm-pvc-snapshot prune 2>&1 || { warn "Snapshot prune failed"; STATUS=1; }
|
||||
|
||||
# ============================================================
|
||||
# Done
|
||||
# ============================================================
|
||||
MANIFEST_LINES=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
|
||||
log "=== daily-backup complete (status=${STATUS}, ${TOTAL_BYTES} bytes, ${MANIFEST_LINES} files in manifest) ==="
|
||||
push_metrics "${STATUS}" "${TOTAL_BYTES}"
|
||||
exit "${STATUS}"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 05:00:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=300
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,372 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Extend disk storage on a Kubernetes node VM.
|
||||
# Drains the node, shuts down the VM, resizes the disk in Proxmox,
|
||||
# boots the VM, expands the filesystem, and uncordons the node.
|
||||
#
|
||||
# Usage: ./scripts/extend_vm_storage.sh <node-name> <size-increment>
|
||||
# Example: ./scripts/extend_vm_storage.sh k8s-node2 +64G
|
||||
|
||||
# --- Constants ---
|
||||
PROXMOX_HOST="root@192.168.1.127"
|
||||
VM_SSH_USER="wizard"
|
||||
KUBECTL="kubectl --kubeconfig $(pwd)/config"
|
||||
SHUTDOWN_TIMEOUT=300
|
||||
SSH_WAIT_TIMEOUT=300
|
||||
POLL_INTERVAL=5
|
||||
|
||||
# --- Colors ---
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[0;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Colored log helpers. Each prints "<colored tag> <message>" to stdout;
# `echo -e` expands the ANSI escape sequences stored in BLUE/GREEN/YELLOW/RED/NC.
info() {
  local tag="${BLUE}[INFO]${NC}"
  echo -e "${tag} $*"
}

ok() {
  local tag="${GREEN}[OK]${NC}"
  echo -e "${tag} $*"
}

warn() {
  local tag="${YELLOW}[WARN]${NC}"
  echo -e "${tag} $*"
}

error() {
  local tag="${RED}[ERROR]${NC}"
  echo -e "${tag} $*"
}
|
||||
|
||||
# --- Node-to-VMID mapping ---
|
||||
declare -A NODE_VMID=(
|
||||
[k8s-master]=200
|
||||
[k8s-node1]=201
|
||||
[k8s-node2]=202
|
||||
[k8s-node3]=203
|
||||
[k8s-node4]=204
|
||||
)
|
||||
|
||||
# --- Cleanup trap ---
|
||||
DRAINED_NODE=""
|
||||
# EXIT-trap handler. DRAINED_NODE is set just before the drain and cleared
# after a successful uncordon, so a non-empty value here means the script is
# exiting while the node may still be drained; print manual recovery steps.
cleanup() {
  if [[ -z "$DRAINED_NODE" ]]; then
    return 0
  fi

  local vmid="${NODE_VMID[$DRAINED_NODE]}"

  echo ""
  error "Script exited unexpectedly!"
  warn "The node '$DRAINED_NODE' may still be cordoned/drained."
  warn "Recovery steps:"
  warn " 1. Check VM status: ssh $PROXMOX_HOST 'qm status ${vmid}'"
  warn " 2. Start VM if stopped: ssh $PROXMOX_HOST 'qm start ${vmid}'"
  warn " 3. Uncordon node: $KUBECTL uncordon $DRAINED_NODE"
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# --- Input validation ---
|
||||
# Print the command-line help (valid node names come from the NODE_VMID keys)
# to stdout and terminate the script with exit status 1.
usage() {
  cat <<USAGE
Usage: $0 <node-name> <size-increment>

Arguments:
 node-name One of: ${!NODE_VMID[*]}
 size-increment Disk size increase, e.g. +64G, +128G

Example:
 $0 k8s-node2 +64G
USAGE
  exit 1
}
|
||||
|
||||
if [[ $# -ne 2 ]]; then
|
||||
usage
|
||||
fi
|
||||
|
||||
NODE_NAME="$1"
|
||||
SIZE_INCREMENT="$2"
|
||||
|
||||
if [[ -z "${NODE_VMID[$NODE_NAME]+x}" ]]; then
|
||||
error "Unknown node: '$NODE_NAME'"
|
||||
echo "Valid nodes: ${!NODE_VMID[*]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! "$SIZE_INCREMENT" =~ ^\+[0-9]+G$ ]]; then
|
||||
error "Invalid size increment: '$SIZE_INCREMENT'"
|
||||
echo "Must match pattern +<number>G, e.g. +64G"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
VMID="${NODE_VMID[$NODE_NAME]}"
|
||||
|
||||
# --- Resolve node IP via kubectl ---
|
||||
info "Resolving IP for node '$NODE_NAME'..."
|
||||
NODE_IP=$($KUBECTL get node "$NODE_NAME" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}' 2>/dev/null)
|
||||
if [[ -z "$NODE_IP" ]]; then
|
||||
error "Could not resolve IP for node '$NODE_NAME'. Is the cluster reachable?"
|
||||
exit 1
|
||||
fi
|
||||
ok "Node IP: $NODE_IP"
|
||||
|
||||
# --- Query current disk size ---
|
||||
info "Querying current disk size for VM $VMID..."
|
||||
SCSI0_LINE=$(ssh "$PROXMOX_HOST" "qm config $VMID" 2>/dev/null | grep '^scsi0:')
|
||||
if [[ -z "$SCSI0_LINE" ]]; then
|
||||
error "Could not read scsi0 config for VM $VMID."
|
||||
exit 1
|
||||
fi
|
||||
# Extract size value, e.g. "size=64G" from the config line
|
||||
CURRENT_SIZE=$(echo "$SCSI0_LINE" | sed -n 's/.*size=\([0-9]*G\).*/\1/p')
|
||||
if [[ -z "$CURRENT_SIZE" ]]; then
|
||||
error "Could not parse current disk size from: $SCSI0_LINE"
|
||||
exit 1
|
||||
fi
|
||||
CURRENT_SIZE_NUM=${CURRENT_SIZE%G}
|
||||
INCREMENT_NUM=${SIZE_INCREMENT//[+G]/}
|
||||
NEW_SIZE_NUM=$((CURRENT_SIZE_NUM + INCREMENT_NUM))
|
||||
ok "Current disk size: ${CURRENT_SIZE_NUM}G → New size: ${NEW_SIZE_NUM}G (${SIZE_INCREMENT})"
|
||||
|
||||
if [[ $NEW_SIZE_NUM -le $CURRENT_SIZE_NUM ]]; then
|
||||
error "New size (${NEW_SIZE_NUM}G) must be greater than current size (${CURRENT_SIZE_NUM}G)."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- Confirmation ---
|
||||
echo ""
|
||||
echo "========================================="
|
||||
echo " Extend VM Storage"
|
||||
echo "========================================="
|
||||
echo " Node: $NODE_NAME"
|
||||
echo " VMID: $VMID"
|
||||
echo " Node IP: $NODE_IP"
|
||||
echo " Current: ${CURRENT_SIZE_NUM}G"
|
||||
echo " Increment: $SIZE_INCREMENT"
|
||||
echo " New size: ${NEW_SIZE_NUM}G"
|
||||
echo " Proxmox: $PROXMOX_HOST"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
echo "This will:"
|
||||
echo " 1. Drain the node (evict pods)"
|
||||
echo " 2. Shut down the VM"
|
||||
echo " 3. Resize disk (scsi0) from ${CURRENT_SIZE_NUM}G to ${NEW_SIZE_NUM}G"
|
||||
echo " 4. Start the VM"
|
||||
echo " 5. Expand the filesystem inside the guest"
|
||||
echo " 6. Uncordon the node"
|
||||
echo ""
|
||||
read -rp "Proceed? [y/N] " confirm
|
||||
if [[ ! "$confirm" =~ ^[yY]$ ]]; then
|
||||
echo "Aborted."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- Step 1: Drain node ---
|
||||
info "Step 1/7: Draining node '$NODE_NAME'..."
|
||||
DRAINED_NODE="$NODE_NAME"
|
||||
if ! $KUBECTL drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=300s; then
|
||||
error "Failed to drain node '$NODE_NAME'."
|
||||
exit 1
|
||||
fi
|
||||
ok "Node drained."
|
||||
|
||||
# --- Step 2: Shutdown VM ---
|
||||
info "Step 2/7: Shutting down VM $VMID..."
|
||||
if ! ssh "$PROXMOX_HOST" "qm shutdown $VMID"; then
|
||||
error "Failed to send shutdown command to VM $VMID."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
info "Waiting for VM to stop (timeout: ${SHUTDOWN_TIMEOUT}s)..."
|
||||
elapsed=0
|
||||
while true; do
|
||||
status=$(ssh "$PROXMOX_HOST" "qm status $VMID" 2>/dev/null)
|
||||
if [[ "$status" == *"stopped"* ]]; then
|
||||
break
|
||||
fi
|
||||
if [[ $elapsed -ge $SHUTDOWN_TIMEOUT ]]; then
|
||||
error "VM $VMID did not stop within ${SHUTDOWN_TIMEOUT}s. Current status: $status"
|
||||
exit 1
|
||||
fi
|
||||
sleep "$POLL_INTERVAL"
|
||||
elapsed=$((elapsed + POLL_INTERVAL))
|
||||
done
|
||||
ok "VM stopped."
|
||||
|
||||
# --- Step 3: Resize disk ---
|
||||
info "Step 3/7: Resizing disk scsi0 by $SIZE_INCREMENT..."
|
||||
if ! ssh "$PROXMOX_HOST" "qm resize $VMID scsi0 $SIZE_INCREMENT"; then
|
||||
error "Failed to resize disk on VM $VMID."
|
||||
exit 1
|
||||
fi
|
||||
ok "Disk resized."
|
||||
|
||||
# --- Step 4: Start VM ---
|
||||
info "Step 4/7: Starting VM $VMID..."
|
||||
if ! ssh "$PROXMOX_HOST" "qm start $VMID"; then
|
||||
error "Failed to start VM $VMID."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
info "Waiting for SSH to become available at $NODE_IP (timeout: ${SSH_WAIT_TIMEOUT}s)..."
|
||||
elapsed=0
|
||||
while true; do
|
||||
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "true" 2>/dev/null; then
|
||||
break
|
||||
fi
|
||||
if [[ $elapsed -ge $SSH_WAIT_TIMEOUT ]]; then
|
||||
error "SSH not reachable on $NODE_IP within ${SSH_WAIT_TIMEOUT}s."
|
||||
exit 1
|
||||
fi
|
||||
sleep "$POLL_INTERVAL"
|
||||
elapsed=$((elapsed + POLL_INTERVAL))
|
||||
done
|
||||
ok "VM is up and SSH is reachable."
|
||||
|
||||
info "Waiting 10s for system stabilization..."
|
||||
sleep 10
|
||||
|
||||
# --- Step 5: Expand filesystem ---
|
||||
info "Step 5/7: Expanding filesystem inside the guest..."
|
||||
ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" 'bash -s' <<'REMOTE_SCRIPT'
|
||||
set -o pipefail
|
||||
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[0;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; }
|
||||
|
||||
ROOT_DEV=$(findmnt -n -o SOURCE /)
|
||||
ROOT_FSTYPE=$(findmnt -n -o FSTYPE /)
|
||||
info "Root device: $ROOT_DEV"
|
||||
info "Root filesystem: $ROOT_FSTYPE"
|
||||
|
||||
# Ensure growpart is available
|
||||
if ! command -v growpart &>/dev/null; then
|
||||
info "Installing growpart (cloud-guest-utils)..."
|
||||
sudo apt-get update -qq && sudo apt-get install -y -qq cloud-guest-utils
|
||||
fi
|
||||
|
||||
# resize_fs <device> <fstype>
# Grow the filesystem to fill its (already enlarged) block device.
#   ext2/ext3/ext4 -> resize2fs on the device
#   xfs            -> xfs_growfs on the / mount point
# Returns 0 on success, 1 if the tool fails or the filesystem type is not handled.
resize_fs() {
  local target_dev="$1"
  local fs_kind="$2"

  case "$fs_kind" in
    ext4|ext3|ext2)
      info "Running resize2fs on $target_dev..."
      sudo resize2fs "$target_dev" && return 0
      error "resize2fs failed on $target_dev"
      return 1
      ;;
    xfs)
      info "Running xfs_growfs on /..."
      sudo xfs_growfs / && return 0
      error "xfs_growfs failed"
      return 1
      ;;
    *)
      error "Unsupported filesystem type: $fs_kind"
      return 1
      ;;
  esac
}
|
||||
|
||||
# Check if root is on LVM (device-mapper)
|
||||
if [[ "$ROOT_DEV" == /dev/mapper/* || "$ROOT_DEV" == /dev/dm-* ]]; then
|
||||
info "LVM layout detected."
|
||||
|
||||
# Find the PV device
|
||||
PV_DEV=$(sudo pvs --noheadings -o pv_name | head -1 | tr -d ' ')
|
||||
if [[ -z "$PV_DEV" ]]; then
|
||||
error "Could not determine PV device."
|
||||
exit 1
|
||||
fi
|
||||
info "PV device: $PV_DEV"
|
||||
|
||||
# Parse disk and partition number (handles /dev/sdaX and /dev/nvmeXnXpX)
|
||||
if [[ "$PV_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
|
||||
DISK="${BASH_REMATCH[1]}"
|
||||
PARTNUM="${BASH_REMATCH[2]}"
|
||||
elif [[ "$PV_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
|
||||
DISK="${BASH_REMATCH[1]}"
|
||||
PARTNUM="${BASH_REMATCH[2]}"
|
||||
else
|
||||
error "Could not parse disk/partition from PV: $PV_DEV"
|
||||
exit 1
|
||||
fi
|
||||
info "Disk: $DISK, Partition: $PARTNUM"
|
||||
|
||||
# Grow partition
|
||||
info "Growing partition $DISK partition $PARTNUM..."
|
||||
sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
|
||||
|
||||
# Resize PV
|
||||
info "Resizing PV $PV_DEV..."
|
||||
if ! sudo pvresize "$PV_DEV"; then
|
||||
error "pvresize failed on $PV_DEV"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Resolve LV path if using /dev/dm-*
|
||||
if [[ "$ROOT_DEV" == /dev/dm-* ]]; then
|
||||
LV_PATH=$(sudo lvs --noheadings -o lv_path | head -1 | tr -d ' ')
|
||||
else
|
||||
LV_PATH="$ROOT_DEV"
|
||||
fi
|
||||
info "LV path: $LV_PATH"
|
||||
|
||||
# Extend LV
|
||||
info "Extending LV $LV_PATH to use all free space..."
|
||||
if ! sudo lvextend -l +100%FREE "$LV_PATH"; then
|
||||
warn "lvextend reported no change (LV may already use all space)."
|
||||
fi
|
||||
|
||||
# Resize filesystem
|
||||
resize_fs "$LV_PATH" "$ROOT_FSTYPE"
|
||||
if [[ $? -ne 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
info "Direct partition layout detected."
|
||||
|
||||
# Parse disk and partition number
|
||||
if [[ "$ROOT_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
|
||||
DISK="${BASH_REMATCH[1]}"
|
||||
PARTNUM="${BASH_REMATCH[2]}"
|
||||
elif [[ "$ROOT_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
|
||||
DISK="${BASH_REMATCH[1]}"
|
||||
PARTNUM="${BASH_REMATCH[2]}"
|
||||
else
|
||||
error "Could not parse disk/partition from: $ROOT_DEV"
|
||||
exit 1
|
||||
fi
|
||||
info "Disk: $DISK, Partition: $PARTNUM"
|
||||
|
||||
# Grow partition
|
||||
info "Growing partition $DISK partition $PARTNUM..."
|
||||
sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
|
||||
|
||||
# Resize filesystem
|
||||
resize_fs "$ROOT_DEV" "$ROOT_FSTYPE"
|
||||
if [[ $? -ne 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
ok "Filesystem expansion complete."
|
||||
df -h /
|
||||
REMOTE_SCRIPT
|
||||
|
||||
if [[ $? -ne 0 ]]; then
|
||||
error "Filesystem expansion failed on the guest."
|
||||
exit 1
|
||||
fi
|
||||
ok "Filesystem expanded."
|
||||
|
||||
# --- Step 6: Uncordon node ---
|
||||
info "Step 6/7: Uncordoning node '$NODE_NAME'..."
|
||||
if ! $KUBECTL uncordon "$NODE_NAME"; then
|
||||
error "Failed to uncordon node '$NODE_NAME'."
|
||||
exit 1
|
||||
fi
|
||||
DRAINED_NODE=""
|
||||
ok "Node uncordoned."
|
||||
|
||||
# --- Step 7: Verify ---
|
||||
info "Step 7/7: Verification"
|
||||
echo ""
|
||||
info "Disk usage on $NODE_NAME:"
|
||||
ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "df -h /"
|
||||
echo ""
|
||||
info "Node status:"
|
||||
$KUBECTL get node "$NODE_NAME"
|
||||
echo ""
|
||||
ok "Storage extension complete for $NODE_NAME."
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
# /etc/fan-control.env — config for the fan-control daemon (chmod 600).
|
||||
# Deployed manually to the PVE host; the real file holds a secret token and is
|
||||
# NOT committed. Copy this template, fill HA_TOKEN, scp to /etc/fan-control.env.
|
||||
|
||||
# Long-lived ha-sofia access token (Home Assistant -> Profile -> Security ->
|
||||
# Long-lived access tokens). Empty => presence disabled, daemon runs COOL-only.
|
||||
HA_TOKEN=
|
||||
|
||||
# --- optional overrides (defaults shown) ---
|
||||
# HA_URL=http://192.168.1.8:8123
|
||||
# GARAGE_ENTITY=sensor.garage_door_state_bg
|
||||
# GARAGE_OPEN_STATE=Отворена
|
||||
# HOLD_SECS=900 # quiet-mode hold after last garage activity (15 min)
|
||||
# LOOP_INTERVAL=15
|
||||
# PRESENCE_INTERVAL=30
|
||||
# DEADBAND=3
|
||||
# CEILING=83 # degC: hand back to Dell auto at/above this
|
||||
# RESUME_BELOW=75
|
||||
# RESUME_STABLE=120
|
||||
# MAX_IPMI_FAILS=3
|
||||
PUSHGATEWAY_URL=http://10.0.20.100:30091
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
[Unit]
|
||||
Description=Presence-aware IPMI fan controller (Dell R730, garage)
|
||||
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/fan-control.sh
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=-/etc/fan-control.env
|
||||
ExecStart=/usr/local/bin/fan-control
|
||||
# Belt-and-suspenders: whatever happens to the daemon, hand the fans back to
|
||||
# the iDRAC's own automatic curve so the box is never stuck in manual mode.
|
||||
ExecStopPost=/usr/bin/ipmitool raw 0x30 0x30 0x01 0x01
|
||||
Restart=on-failure
|
||||
RestartSec=10
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=fan-control
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
@ -1,262 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Presence-aware IPMI fan controller for the Dell R730 PVE host (192.168.1.127).
|
||||
#
|
||||
# The server lives in the GARAGE (memory id=1723). Two curves, picked by
|
||||
# whether someone is physically in the garage:
|
||||
# - COOL : garage empty -> minimise CPU temp, noise is free.
|
||||
# - QUIET : someone in the garage -> minimise noise, accept a warmer CPU.
|
||||
# Presence comes from the ha-sofia garage-door sensor: door open now, OR it
|
||||
# last changed within HOLD_SECS, => QUIET. Otherwise COOL.
|
||||
#
|
||||
# Safety (manual fan mode bypasses the iDRAC's own curve, so we backstop it):
|
||||
# - On ANY exit (crash/stop/TERM) the EXIT trap hands fans back to Dell
|
||||
# automatic control (raw 0x30 0x30 0x01 0x01). systemd ExecStopPost
|
||||
# repeats this belt-and-suspenders.
|
||||
# - CPU >= CEILING -> hand back to Dell auto until it recovers (RESUME_BELOW
|
||||
# held for RESUME_STABLE s). The firmware's own emergency cooling takes over.
|
||||
# - IPMI read failures (>= MAX_IPMI_FAILS) -> hand back to Dell auto.
|
||||
#
|
||||
# Deploy: scp to /usr/local/bin/fan-control (strip .sh) + install
|
||||
# fan-control.service + /etc/fan-control.env. Same pattern as apply-mbps-caps.
|
||||
# Tests: test-fan-control.sh (sources this file, exercises the pure functions).
|
||||
# Design: infra/docs/plans/2026-06-04-pve-fan-control-design.md
|
||||
# Runbook: infra/docs/runbooks/fan-control.md
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
# ---- configuration (override via /etc/fan-control.env) ----
|
||||
: "${IPMITOOL:=ipmitool}"
|
||||
: "${LOOP_INTERVAL:=15}" # seconds between temperature decisions
|
||||
: "${PRESENCE_INTERVAL:=30}" # seconds between ha-sofia garage-door polls
|
||||
: "${DEADBAND:=3}" # degC hysteresis applied to downward fan steps
|
||||
: "${CEILING:=83}" # degC: hand back to Dell auto at/above this
|
||||
: "${RESUME_BELOW:=75}" # degC: eligible to resume manual below this...
|
||||
: "${RESUME_STABLE:=120}" # ...once held that long
|
||||
: "${HOLD_SECS:=900}" # quiet-mode hold after last garage activity (15 min)
|
||||
: "${HA_URL:=http://192.168.1.8:8123}"
|
||||
: "${HA_TOKEN:=}" # long-lived ha-sofia token; empty => presence disabled (COOL only)
|
||||
: "${GARAGE_ENTITY:=sensor.garage_door_state_bg}"
|
||||
: "${GARAGE_OPEN_STATE:=Отворена}" # ha state string meaning "open"
|
||||
# HA control: a mode select + manual % the user drives from Home Assistant.
|
||||
# auto => garage-presence curve (default); cool/quiet => force that curve;
|
||||
# manual => hold MANUAL_ENTITY %. Empty HA_TOKEN or unreachable HA => auto.
|
||||
: "${MODE_ENTITY:=input_select.r730_fan_mode}"
|
||||
: "${MANUAL_ENTITY:=input_number.r730_fan_manual_pct}"
|
||||
: "${PUSHGATEWAY_URL:=}" # optional Prometheus Pushgateway base URL
|
||||
: "${MAX_IPMI_FAILS:=3}"
|
||||
: "${DRY_RUN:=0}" # 1 => log IPMI actions instead of executing
|
||||
: "${RUN_ONCE:=0}" # 1 => one iteration then exit (testing)
|
||||
|
||||
# Continuous LINEAR fan curve (2026-06-05): fan% ramps proportionally with CPU
|
||||
# temp between (T_LO,P_LO) and (T_HI,P_HI), clamped flat outside. Replaces the old
|
||||
# discrete step-bands (which flapped at band edges — e.g. 45<->65%). Both modes
|
||||
# reach 100% right at the 83°C ceiling. Anchors are env-tunable.
|
||||
# COOL (garage empty): 30% @50°C .. 100% @83°C (~2.1%/°C; equilibrium ~60°C/~51%)
|
||||
# QUIET (someone there): 20% @68°C .. 100% @83°C (near-silent until ~70°C)
|
||||
# Web-researched: a linear curve + 2-3°C hysteresis is the homelab standard; PID is
|
||||
# overkill for this slow thermal loop. See docs/plans/2026-06-04-pve-fan-control-design.md.
|
||||
: "${COOL_T_LO:=50}"; : "${COOL_P_LO:=30}"; : "${COOL_T_HI:=83}"; : "${COOL_P_HI:=100}"
|
||||
: "${QUIET_T_LO:=68}"; : "${QUIET_P_LO:=20}"; : "${QUIET_T_HI:=83}"; : "${QUIET_P_HI:=100}"
|
||||
: "${MIN_STEP:=3}" # min fan-% change worth an IPMI write (anti-jitter on the smooth curve)
|
||||
|
||||
# log <message...> -> "<local ISO-8601 timestamp with UTC offset> <message>" on stdout.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  printf '%s %s\n' "$stamp" "$*"
}
|
||||
|
||||
# ---- pure functions (no side effects; unit-tested) ----
|
||||
|
||||
# fc_curve <mode> <temp> -> fan percent
# Linear interpolation between the mode's (T_LO,P_LO) and (T_HI,P_HI) anchors,
# rounded to the nearest integer. Flat at P_LO at/below T_LO and flat at P_HI
# at/above T_HI. Mode "quiet" uses the QUIET_* anchors; any other mode uses COOL_*.
fc_curve() {
  local mode="$1" temp="$2"
  local t_min p_min t_max p_max

  case "$mode" in
    quiet)
      t_min=$QUIET_T_LO; p_min=$QUIET_P_LO
      t_max=$QUIET_T_HI; p_max=$QUIET_P_HI
      ;;
    *)
      t_min=$COOL_T_LO; p_min=$COOL_P_LO
      t_max=$COOL_T_HI; p_max=$COOL_P_HI
      ;;
  esac

  if (( temp <= t_min )); then
    echo "$p_min"
  elif (( temp >= t_max )); then
    echo "$p_max"
  else
    local span=$(( t_max - t_min ))
    # Adding span/2 before the integer division rounds to nearest instead of truncating.
    echo $(( p_min + ( (temp - t_min) * (p_max - p_min) + span / 2 ) / span ))
  fi
}
|
||||
|
||||
# fc_decide <mode> <temp> <current_pct> <deadband> -> fan percent
# Asymmetric hysteresis around fc_curve:
#   - no previous setting (current < 0) or the curve wants >= current: apply at once;
#   - otherwise step down only if the curve evaluated DEADBAND degrees hotter
#     would still be below the current percent; else hold the current percent.
fc_decide() {
  local mode="$1" temp="$2" current="$3" deadband="$4"
  local wanted wanted_if_hotter

  wanted="$(fc_curve "$mode" "$temp")"
  if (( current < 0 || wanted >= current )); then
    echo "$wanted"
    return 0
  fi

  wanted_if_hotter="$(fc_curve "$mode" "$((temp + deadband))")"
  if (( wanted_if_hotter >= current )); then
    echo "$current"
  else
    echo "$wanted"
  fi
}
|
||||
|
||||
# fc_presence_mode <state> <last_changed_epoch> <now_epoch> <hold_secs> <open_state> -> quiet|cool
# "quiet" when the door state equals <open_state>, or when it last changed less
# than <hold_secs> seconds ago; "cool" otherwise. The age is only evaluated
# when the door is not open.
fc_presence_mode() {
  local door_state="$1" changed_at="$2" now_ts="$3" hold_for="$4" open_label="$5"

  if [[ "$door_state" == "$open_label" ]] || (( now_ts - changed_at < hold_for )); then
    echo "quiet"
  else
    echo "cool"
  fi
}
|
||||
|
||||
# fc_parse_temp <ipmitool 'Temp' line> -> integer degC
# Prints the number in front of the first "<digits> degrees C" occurrence in
# the given text; prints nothing when no such reading is present.
fc_parse_temp() {
  printf '%s\n' "$1" \
    | grep -oE '[0-9]+ degrees C' \
    | sed -n '1s/ degrees C$//p'
}
|
||||
|
||||
# fc_json_str_field <json> <key> -> string value (first match; jq-free)
# Regex-based extraction of the first "<key>": "<value>" pair. Only string
# values without embedded double quotes can be matched. Prints nothing when
# the key is absent.
fc_json_str_field() {
  local json="$1" key="$2"
  local pair_re="\"${key}\"[[:space:]]*:[[:space:]]*\"[^\"]*\""

  # grep isolates the whole "key": "value" pair; sed then strips the quoted key,
  # the colon and the opening quote from the front and the closing quote from the end.
  printf '%s' "$json" \
    | grep -oE "$pair_re" \
    | head -1 \
    | sed -E 's/^"[^"]*"[[:space:]]*:[[:space:]]*"//; s/"$//'
}
|
||||
|
||||
# fc_pct_to_hex <pct> -> 0xNN
# Formats the integer as lowercase hex, zero-padded to at least two digits.
fc_pct_to_hex() {
  local pct="$1"
  printf '0x%02x' "$pct"
}
|
||||
|
||||
# fc_clamp <pct> -> 0..100
# Values below 0 become 0, values above 100 become 100, anything else passes through.
fc_clamp() {
  local pct="$1"
  if (( pct < 0 )); then
    pct=0
  elif (( pct > 100 )); then
    pct=100
  fi
  echo "$pct"
}
|
||||
|
||||
# fc_fan_watts <rpm> -> estimated TOTAL fan power in watts (integer, truncated).
# This is a model, not a measurement: power is taken as proportional to RPM^3
# and scaled by 205/1e13, i.e. 0.0205 * (rpm/1000)^3.
fc_fan_watts() {
  local scaled_cube=$(( $1 * $1 * $1 * 205 ))
  echo $(( scaled_cube / 10000000000000 ))
}
|
||||
|
||||
# fc_resolve <ha_mode> <temp> <manual_pct> <presence> <current> <deadband> -> pct
# Turns the mode requested from Home Assistant into a fan percent:
#   manual      -> fc_clamp(manual_pct), no hysteresis
#   auto        -> fc_decide on the presence-selected curve
#   other modes -> fc_decide on the curve named by ha_mode
# The temperature ceiling is not handled here; the caller enforces it.
fc_resolve() {
  local ha_mode="$1" temp="$2" manual_pct="$3" presence="$4" current="$5" deadband="$6"
  local curve_mode

  case "$ha_mode" in
    manual)
      fc_clamp "$manual_pct"
      return 0
      ;;
    auto)
      curve_mode="$presence"
      ;;
    *)
      curve_mode="$ha_mode"
      ;;
  esac

  fc_decide "$curve_mode" "$temp" "$current" "$deadband"
}
|
||||
|
||||
# ---- side-effecting wrappers ----
|
||||
|
||||
ipmi_manual_on=0
|
||||
|
||||
# set_manual <pct>
# Drive the fans at a fixed percent over IPMI.
# - DRY_RUN=1: only logs the action and marks manual mode as active.
# - Otherwise: the first call since the last restore sends `raw 0x30 0x30 0x01 0x00`
#   (presumably "enable manual fan control", the counterpart of the 0x01 0x01
#   hand-back used by restore_auto); every call then sends
#   `raw 0x30 0x30 0x02 0xff <hex pct>`.
# Returns 1 if the mode switch fails, else the exit status of the percent write.
set_manual() { # <pct>
  local percent="$1" hex_byte
  hex_byte="$(fc_pct_to_hex "$percent")"

  if (( DRY_RUN == 1 )); then
    log "DRY set fan ${percent}% (${hex_byte})"
    ipmi_manual_on=1
    return 0
  fi

  if (( ipmi_manual_on == 0 )); then
    if ! "$IPMITOOL" raw 0x30 0x30 0x01 0x00 >/dev/null 2>&1; then
      return 1
    fi
    ipmi_manual_on=1
  fi

  "$IPMITOOL" raw 0x30 0x30 0x02 0xff "$hex_byte" >/dev/null 2>&1
}
|
||||
|
||||
# restore_auto
# Hand fan control back to the Dell firmware (`raw 0x30 0x30 0x01 0x01`) and
# mark manual mode as inactive so the next set_manual re-enables it.
# Best effort: always returns 0, even if the ipmitool call fails.
# With DRY_RUN=1 the action is only logged.
restore_auto() {
  if (( DRY_RUN == 1 )); then
    log "DRY restore Dell auto fan control"
  else
    "$IPMITOOL" raw 0x30 0x30 0x01 0x01 >/dev/null 2>&1
  fi
  ipmi_manual_on=0
  return 0
}
|
||||
|
||||
# read_cpu_temp -> hottest CPU temperature in integer degC (empty if unreadable)
# Scans every sensor line of `ipmitool sdr type temperature` that starts with
# "Temp " and prints the highest reading. Taking the maximum rather than the
# first line keeps the fan curve and the CEILING check honest when more than
# one such sensor is reported (presumably one per CPU socket on a dual-socket
# host — verify on the target). Lines without a numeric reading are skipped.
# Prints nothing when no reading could be obtained.
read_cpu_temp() {
  local line reading hottest=""

  while IFS= read -r line; do
    reading="$(fc_parse_temp "$line")"
    [[ "$reading" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so a zero-padded reading is never parsed as octal.
    if [[ -z "$hottest" ]] || (( 10#$reading > 10#$hottest )); then
      hottest="$reading"
    fi
  done < <("$IPMITOOL" sdr type temperature 2>/dev/null | grep -E '^Temp ')

  if [[ -n "$hottest" ]]; then
    echo "$hottest"
  fi
  return 0
}
|
||||
|
||||
# read_fan_rpm -> Fan1 RPM as an integer (empty when no line starts with "Fan1")
# Takes the 5th '|'-separated column of the first Fan1 line from
# `ipmitool sdr type fan` and keeps only its digits. Fan1 is used as the
# representative fan, as noted by the original author (all fans are set together).
read_fan_rpm() {
  "$IPMITOOL" sdr type fan 2>/dev/null \
    | awk -F'|' '
        /^Fan1/ {
          reading = $5
          gsub(/[^0-9]/, "", reading)
          print reading + 0
          exit
        }'
}
|
||||
|
||||
presence_cache="cool"; presence_ts=0
|
||||
# get_presence -> quiet|cool (also stored in the global presence_cache)
# Polls the garage-door entity in Home Assistant at most once every
# PRESENCE_INTERVAL seconds and maps it to a curve via fc_presence_mode.
# Every path prints the current presence_cache: between polls, when HA_TOKEN
# is empty, when the request fails or when no state can be parsed, that is the
# previously cached value.
# NOTE: the throttle and the cache live in the globals presence_ts and
# presence_cache, so they only persist when this function runs in the calling
# shell; inside $(...) the updates are made in a subshell and lost.
get_presence() {
  local now; now="$(date +%s)"
  if (( now - presence_ts < PRESENCE_INTERVAL )); then echo "$presence_cache"; return 0; fi
  presence_ts="$now"
  [[ -z "$HA_TOKEN" ]] && { echo "$presence_cache"; return 0; }

  local resp state lc_iso lc_epoch
  resp="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
    "$HA_URL/api/states/$GARAGE_ENTITY" 2>/dev/null)" || { echo "$presence_cache"; return 0; }
  state="$(fc_json_str_field "$resp" state)"
  [[ -z "$state" ]] && { echo "$presence_cache"; return 0; }

  lc_iso="$(fc_json_str_field "$resp" last_changed)"
  # Fall back to "now" (treated as recent activity, hence quiet) when
  # last_changed is missing or unparseable. The explicit emptiness check
  # matters: GNU `date -d ""` succeeds and yields today 00:00, so an empty
  # string would otherwise bypass the fallback with a bogus timestamp.
  lc_epoch=""
  if [[ -n "$lc_iso" ]]; then
    lc_epoch="$(date -d "$lc_iso" +%s 2>/dev/null)" || lc_epoch=""
  fi
  [[ -n "$lc_epoch" ]] || lc_epoch="$now"

  presence_cache="$(fc_presence_mode "$state" "$lc_epoch" "$now" "$HOLD_SECS" "$GARAGE_OPEN_STATE")"
  echo "$presence_cache"
}
|
||||
|
||||
# ha_entity_state <entity> -> state string
# Fetches $HA_URL/api/states/<entity> with the bearer token and prints its
# "state" field. Prints nothing and still returns 0 when HA_TOKEN is empty or
# the request fails, so callers treat "empty" as "no answer".
ha_entity_state() {
  local entity="$1" body

  if [[ -z "$HA_TOKEN" ]]; then
    return 0
  fi

  if ! body="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
    "$HA_URL/api/states/$entity" 2>/dev/null)"; then
    return 0
  fi

  fc_json_str_field "$body" state
}
|
||||
|
||||
# push_metrics <temp> <pct> <mode> <ha_ok> <fallback> [fan_rpm] [fan_watts_est]
# Best-effort push of the controller gauges to the Prometheus Pushgateway
# (job=fan_control, instance=pve-r730). No-op when PUSHGATEWAY_URL is empty;
# curl failures are swallowed so monitoring can never break the control loop.
# <mode> is encoded as: quiet=1, cool=2, manual=3, anything else=0.
push_metrics() { # <temp> <pct> <mode> <ha_ok> <fallback> [fan_rpm] [fan_watts_est]
  if [[ -z "$PUSHGATEWAY_URL" ]]; then
    return 0
  fi

  local temp="$1" pct="$2" ha_ok="$4" fallback="$5"
  local rpm="${6:-0}" watts="${7:-0}"
  local mode_num=0
  case "$3" in
    quiet)  mode_num=1 ;;
    cool)   mode_num=2 ;;
    manual) mode_num=3 ;;
  esac

  curl -fsS --max-time 5 --data-binary @- \
    "$PUSHGATEWAY_URL/metrics/job/fan_control/instance/pve-r730" \
    >/dev/null 2>&1 <<METRICS || true
# TYPE pve_fan_control_cpu_temp_celsius gauge
pve_fan_control_cpu_temp_celsius $temp
# TYPE pve_fan_control_fan_percent gauge
pve_fan_control_fan_percent $pct
# TYPE pve_fan_control_mode gauge
pve_fan_control_mode $mode_num
# TYPE pve_fan_control_ha_reachable gauge
pve_fan_control_ha_reachable $ha_ok
# TYPE pve_fan_control_fallback gauge
pve_fan_control_fallback $fallback
# TYPE pve_fan_control_fan_rpm gauge
pve_fan_control_fan_rpm $rpm
# TYPE pve_fan_control_fan_watts_est gauge
pve_fan_control_fan_watts_est $watts
METRICS
}
|
||||
|
||||
# main — the control loop. Every LOOP_INTERVAL seconds:
#   1. read the CPU temperature; after MAX_IPMI_FAILS consecutive unreadable
#      samples hand the fans back to Dell auto;
#   2. at/above CEILING hand back to Dell auto ("fallback") and stay there until
#      the temperature has been below RESUME_BELOW for RESUME_STABLE seconds;
#   3. otherwise resolve the HA-requested mode (auto/cool/quiet/manual) into a
#      fan percent and write it when it is the first write or differs from the
#      current setting by at least MIN_STEP;
#   4. push metrics.
# RUN_ONCE=1 performs a single iteration (used by tests). An EXIT trap restores
# Dell auto control when the shell exits.
main() {
  log "fan-control start (loop=${LOOP_INTERVAL}s presence=${PRESENCE_INTERVAL}s hold=${HOLD_SECS}s ceiling=${CEILING}C dry_run=${DRY_RUN})"
  trap 'log "exit — restoring Dell auto fan control"; restore_auto' EXIT

  # current: last percent written (-1 = nothing written / firmware in control)
  # cool_since: epoch when temp first dropped below RESUME_BELOW during fallback (0 = not cooling)
  local current=-1 fails=0 in_fallback=0 cool_since=0
  local temp ha_mode ha_ok manual_pct presence eff pct rpm fan_w

  while true; do
    temp="$(read_cpu_temp)"
    if [[ -z "$temp" ]]; then
      fails=$((fails + 1)); log "WARN cannot read CPU temp ($fails/$MAX_IPMI_FAILS)"
      if (( fails >= MAX_IPMI_FAILS )); then log "ERR temp unreadable — Dell auto"; restore_auto; current=-1; fi
      (( RUN_ONCE == 1 )) && break
      sleep "$LOOP_INTERVAL"; continue
    fi
    fails=0

    if (( temp >= CEILING )); then
      if (( in_fallback == 0 )); then
        log "CEILING temp=${temp}≥${CEILING} — Dell auto"; restore_auto; current=-1; in_fallback=1
      fi
      # Back at the ceiling: any cool-down streak is void, so the RESUME_STABLE
      # timer must start over once the temperature drops again.
      cool_since=0
      push_metrics "$temp" 0 fallback 1 1
      (( RUN_ONCE == 1 )) && break
      sleep "$LOOP_INTERVAL"; continue
    fi

    if (( in_fallback == 1 )); then
      if (( temp < RESUME_BELOW )); then
        (( cool_since == 0 )) && cool_since="$(date +%s)"
        if (( $(date +%s) - cool_since >= RESUME_STABLE )); then
          log "recovered (temp<${RESUME_BELOW}C ${RESUME_STABLE}s) — resuming manual"; in_fallback=0; cool_since=0
        else
          push_metrics "$temp" 0 fallback 1 1
          (( RUN_ONCE == 1 )) && break
          sleep "$LOOP_INTERVAL"; continue
        fi
      else
        # Between RESUME_BELOW and CEILING: not cool enough, restart the timer.
        cool_since=0; push_metrics "$temp" 0 fallback 1 1
        (( RUN_ONCE == 1 )) && break
        sleep "$LOOP_INTERVAL"; continue
      fi
    fi

    # HA-desired mode (auto/cool/quiet/manual); unreachable/unset => auto.
    # ha_ok feeds the ha_reachable gauge: 0 when HA is disabled (no token) or
    # the mode lookup came back empty (request failed / no state parsed).
    ha_ok=1
    ha_mode="$(ha_entity_state "$MODE_ENTITY")"
    if [[ -z "$HA_TOKEN" || -z "$ha_mode" ]]; then ha_ok=0; fi
    case "$ha_mode" in auto|cool|quiet|manual) ;; *) ha_mode="auto" ;; esac

    manual_pct=0
    if [[ "$ha_mode" == "manual" ]]; then
      # HA numbers may arrive as "45.0": drop the fraction, reject non-numeric.
      manual_pct="$(ha_entity_state "$MANUAL_ENTITY")"; manual_pct="${manual_pct%%.*}"
      [[ "$manual_pct" =~ ^[0-9]+$ ]] || manual_pct=0
    fi

    presence="cool"
    if [[ "$ha_mode" == "auto" ]]; then
      # Call in THIS shell and read the result from presence_cache. Using
      # $(get_presence) would run it in a subshell, discarding its
      # presence_ts/presence_cache updates — the PRESENCE_INTERVAL throttle
      # would never engage and a failed poll would fall back to the initial
      # "cool" instead of the last known state. get_presence always prints
      # presence_cache, so reading the variable is equivalent to its output.
      get_presence >/dev/null
      presence="$presence_cache"
    fi

    # eff = the mode label reported in logs/metrics.
    if [[ "$ha_mode" == "manual" ]]; then eff="manual"
    elif [[ "$ha_mode" == "auto" ]]; then eff="$presence"
    else eff="$ha_mode"; fi

    pct="$(fc_resolve "$ha_mode" "$temp" "$manual_pct" "$presence" "$current" "$DEADBAND")"

    # Only write when first-run or the change clears MIN_STEP (kills 1-2% jitter
    # on the continuous curve; fc_decide already gives asymmetric hysteresis).
    if (( current < 0 || pct - current >= MIN_STEP || current - pct >= MIN_STEP )); then
      if set_manual "$pct"; then
        log "temp=${temp}C ha_mode=${ha_mode} eff=${eff} fan=${pct}% (was ${current}%)"; current="$pct"
      else
        log "WARN set_manual ${pct}% failed"
      fi
    fi

    rpm="$(read_fan_rpm)"; rpm="${rpm:-0}"; fan_w="$(fc_fan_watts "$rpm")"
    push_metrics "$temp" "$current" "$eff" "$ha_ok" 0 "$rpm" "$fan_w"
    (( RUN_ONCE == 1 )) && break
    sleep "$LOOP_INTERVAL"
  done
}
|
||||
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@"; fi
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# One-shot migration of every private image on registry.viktorbarzin.me to
|
||||
# Forgejo. Used as a stop-gap when the dual-push CI pipelines aren't
|
||||
# producing Forgejo images on their own (Forgejo-Woodpecker forge driver
|
||||
# context-deadline-exceeded issue, see bd code-d3y / 2026-05-07).
|
||||
#
|
||||
# Pulls each image from registry.viktorbarzin.me, retags, pushes to
|
||||
# forgejo.viktorbarzin.me/viktor/<name>:<tag> — preserving the blob bytes
|
||||
# verbatim so the cluster can flip image= without a rebuild.
|
||||
#
|
||||
# Run from any host with docker + network reach to BOTH registries. Auth
|
||||
# from `docker login` (~/.docker/config.json) — make sure both registries
|
||||
# are logged in:
|
||||
# docker login registry.viktorbarzin.me -u viktorbarzin
|
||||
# docker login forgejo.viktorbarzin.me -u viktor # use viktor PAT, not ci-pusher
|
||||
#
|
||||
# (ci-pusher CANNOT push to viktor/<image> — Forgejo container packages
|
||||
# are scoped to the pushing user. Only viktor's PAT can write to viktor/*.)
|
||||
#
|
||||
# After the script, the new image lives at
|
||||
# forgejo.viktorbarzin.me/viktor/<name>:<tag>
|
||||
# Phase 3 of the consolidation flips infra/stacks/<svc>/main.tf image=
|
||||
# to that path.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
OLD_REG=registry.viktorbarzin.me
|
||||
NEW_REG=forgejo.viktorbarzin.me/viktor
|
||||
|
||||
# Image list: <name>:<tag>. Generated 2026-05-07 from `grep -rEn 'image\s*=\s*
|
||||
# "registry\.viktorbarzin\.me'` across infra/stacks/.
|
||||
#
|
||||
# Excluded:
|
||||
# - wealthfolio-sync: registry repo exists but has 0 tags (CronJob has been
|
||||
# broken for 36+ days, separate decision needed). User to triage before
|
||||
# migration.
|
||||
# - fire-planner: registry repo exists but has 0 tags. Dockerfile + CI added
|
||||
# in this session (commit 8b53d99e); rebuild via Woodpecker before flipping.
|
||||
IMAGES=(
|
||||
"chrome-service-novnc:v4"
|
||||
"chrome-service-novnc:latest"
|
||||
"payslip-ingest:latest"
|
||||
"job-hunter:latest"
|
||||
"claude-agent-service:latest"
|
||||
"freedify:latest"
|
||||
"beadboard:latest"
|
||||
"infra-ci:latest"
|
||||
)
|
||||
|
||||
# Scratch file that receives the output of the most recent `docker pull`.
# Created with mktemp (not a predictable /tmp name) and removed on any exit.
pull_log="$(mktemp)"
trap 'rm -f -- "$pull_log"' EXIT

#######################################
# Copy one image from the old registry to the new one: pull, retag, push,
# then drop both local tags.
# Globals:   OLD_REG, NEW_REG (read); pull_log (file is overwritten)
# Arguments: $1 - image reference in <name>:<tag> form
# Outputs:   progress lines on stdout; pull diagnostics on stderr on failure
# Returns:   0 when the image was migrated, or skipped because the source
#            registry does not have it; non-zero when pull/tag/push fails
#######################################
migrate_image() {
  local img=$1
  local src="$OLD_REG/$img"
  local dst="$NEW_REG/$img"

  echo "=== $img ==="

  # Decide on docker's exit status rather than scraping for 'Status: '.
  # A failed pull must never fall through to tag/push: a stale local copy
  # of $src would otherwise be pushed as if it were the registry's image.
  if ! docker pull "$src" >"$pull_log" 2>&1; then
    if grep -q 'not found' "$pull_log"; then
      echo " SKIP — image not present in source registry"
      return 0
    fi
    echo " ERROR — pull failed for $src:" >&2
    cat "$pull_log" >&2
    return 1
  fi

  echo " tag → $dst"
  docker tag "$src" "$dst" || return 1

  echo " push $dst"
  # Relies on `set -o pipefail` so a failed push is not masked by tail.
  docker push "$dst" 2>&1 | tail -2 || return 1

  echo " cleanup local copy"
  # Best effort: a leftover local image is harmless.
  docker rmi "$src" "$dst" 2>&1 | tail -1 || true
}

for img in "${IMAGES[@]}"; do
  migrate_image "$img"
done
|
||||
|
||||
echo ""
|
||||
echo "Done. Verify in Forgejo Web UI: https://forgejo.viktorbarzin.me/viktor/-/packages?type=container"
|
||||
echo "Phase 3 of the plan flips infra/stacks/{wealthfolio,fire-planner}/main.tf image= references."
|
||||
|
|
@ -1,698 +0,0 @@
|
|||
// Frigate Bulk Classification Labeler
|
||||
// Paste this into the browser console on the Frigate /classification page
|
||||
// while viewing a model's training images.
|
||||
//
|
||||
// Image URL pattern: /clips/{modelName}/train/{filename}
|
||||
// Categorize API: POST /api/classification/{modelName}/dataset/categorize
|
||||
// body: { category: "...", training_file: "..." }
|
||||
// Delete API: POST /api/classification/{modelName}/train/delete
|
||||
// body: { ids: ["..."] }
|
||||
// Dataset API: GET /api/classification/{modelName}/dataset
|
||||
// returns: { categories: { catName: [files...] }, training_metadata: {...} }
|
||||
|
||||
(async () => {
|
||||
"use strict";
|
||||
|
||||
// --- Configuration ---
|
||||
const API_BASE = window.location.origin + "/api";
|
||||
const TOOLBAR_ID = "bulk-classify-toolbar";
|
||||
// Frigate's axios instance sends these headers on every request.
|
||||
// X-CSRF-TOKEN is required for state-modifying (POST/PUT/DELETE) requests.
|
||||
const API_HEADERS = {
|
||||
"Content-Type": "application/json",
|
||||
"X-CSRF-TOKEN": "1",
|
||||
"X-CACHE-BYPASS": "1",
|
||||
};
|
||||
|
||||
// Abort if already injected
|
||||
if (document.getElementById(TOOLBAR_ID)) {
|
||||
console.log("Bulk classifier already active. Refresh page to re-inject.");
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Extract model name from page ---
|
||||
// Training images use src="/clips/{modelName}/train/{filename}"
|
||||
let modelName = null;
|
||||
|
||||
// Method 1: Extract from training image src on the page
|
||||
for (const img of document.querySelectorAll("img")) {
|
||||
const src = img.getAttribute("src") || "";
|
||||
const m = src.match(/\/clips\/([^/]+)\/train\//);
|
||||
if (m) { modelName = decodeURIComponent(m[1]); break; }
|
||||
}
|
||||
|
||||
// Method 2: List all custom models from config and let the user pick
|
||||
if (!modelName) {
|
||||
try {
|
||||
const resp = await fetch(`${API_BASE}/config`);
|
||||
const config = await resp.json();
|
||||
// Custom classification models are under config.classification.custom
|
||||
const models = Object.keys(config.classification?.custom || {});
|
||||
if (models.length === 1) {
|
||||
modelName = models[0];
|
||||
} else if (models.length > 1) {
|
||||
modelName = prompt(
|
||||
`Multiple classification models found. Enter the model name:\n\n${models.join(", ")}`,
|
||||
);
|
||||
}
|
||||
} catch (_) {}
|
||||
}
|
||||
|
||||
if (!modelName) {
|
||||
alert(
|
||||
"Could not detect model name.\nMake sure you are on the /classification page with training images visible.",
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`[bulk-classify] Detected model: "${modelName}"`);
|
||||
|
||||
// --- Fetch categories from the dataset API ---
|
||||
let categories = [];
|
||||
// Load the existing category names for this model. On any failure the list
// stays empty and the user can still type a new category by hand.
try {
  const resp = await fetch(`${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset`);
  // Without this check the keys of an error payload would be offered as
  // categories in the dropdown.
  if (!resp.ok) {
    throw new Error(`HTTP ${resp.status} from dataset endpoint`);
  }
  const data = await resp.json();
  // Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
  categories = Object.keys(data.categories || data);
} catch (e) {
  console.error("Failed to fetch categories:", e);
}
|
||||
|
||||
// Deduplicate
|
||||
categories = [...new Set(categories)];
|
||||
console.log("[bulk-classify] Categories:", categories);
|
||||
|
||||
// --- Fetch all training filenames and build event groups ---
|
||||
// Frigate groups training images by eventId (first two segments of the filename).
|
||||
// Filename format: {timestamp}-{randomId}-{timestamp2}-{label}-{score}.webp
|
||||
// EventId = "{timestamp}-{randomId}"
|
||||
let allTrainFiles = [];
|
||||
const eventGroups = {}; // eventId -> [filename, ...]
|
||||
|
||||
// Derive the event id ("{timestamp}-{randomId}") from a training filename.
// A name without any "-" separator is returned untouched so that it forms
// a group of its own.
function parseEventId(filename) {
  const [head, second] = filename.replace(/\.webp$/, "").split("-");
  return second === undefined ? filename : head + "-" + second;
}
|
||||
|
||||
try {
|
||||
const resp = await fetch(
|
||||
`${API_BASE}/classification/${encodeURIComponent(modelName)}/train`,
|
||||
{ headers: API_HEADERS },
|
||||
);
|
||||
allTrainFiles = await resp.json();
|
||||
for (const f of allTrainFiles) {
|
||||
const eid = parseEventId(f);
|
||||
if (!eventGroups[eid]) eventGroups[eid] = [];
|
||||
eventGroups[eid].push(f);
|
||||
}
|
||||
console.log(
|
||||
`[bulk-classify] Loaded ${allTrainFiles.length} training files in ${Object.keys(eventGroups).length} event groups.`,
|
||||
);
|
||||
} catch (e) {
|
||||
console.error("[bulk-classify] Failed to fetch training files:", e);
|
||||
}
|
||||
|
||||
// Get all filenames in the same event group as the given filename
|
||||
// Return every known training filename sharing an event id with `filename`;
// when the event was never indexed, the file stands alone.
function getGroupFiles(filename) {
  const siblings = eventGroups[parseEventId(filename)];
  return siblings ? siblings : [filename];
}
|
||||
|
||||
// --- State ---
|
||||
const selected = new Set();
|
||||
|
||||
// --- Inject styles ---
|
||||
const style = document.createElement("style");
|
||||
style.textContent = `
|
||||
#${TOOLBAR_ID} {
|
||||
position: fixed;
|
||||
bottom: 20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
z-index: 99999;
|
||||
background: #1e1e2e;
|
||||
border: 1px solid #444;
|
||||
border-radius: 12px;
|
||||
padding: 12px 20px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
box-shadow: 0 8px 32px rgba(0,0,0,0.5);
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||
font-size: 14px;
|
||||
color: #cdd6f4;
|
||||
}
|
||||
#${TOOLBAR_ID} button {
|
||||
padding: 6px 14px;
|
||||
border: 1px solid #555;
|
||||
border-radius: 6px;
|
||||
background: #313244;
|
||||
color: #cdd6f4;
|
||||
cursor: pointer;
|
||||
font-size: 13px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
#${TOOLBAR_ID} button:hover {
|
||||
background: #45475a;
|
||||
}
|
||||
#${TOOLBAR_ID} button.primary {
|
||||
background: #89b4fa;
|
||||
color: #1e1e2e;
|
||||
border-color: #89b4fa;
|
||||
font-weight: 600;
|
||||
}
|
||||
#${TOOLBAR_ID} button.primary:hover {
|
||||
background: #74c7ec;
|
||||
}
|
||||
#${TOOLBAR_ID} button.primary:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
#${TOOLBAR_ID} button.danger {
|
||||
background: #f38ba8;
|
||||
color: #1e1e2e;
|
||||
border-color: #f38ba8;
|
||||
font-weight: 600;
|
||||
}
|
||||
#${TOOLBAR_ID} button.danger:hover {
|
||||
background: #eba0ac;
|
||||
}
|
||||
.bulk-classify-dropdown {
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
}
|
||||
.bulk-classify-dropdown-btn {
|
||||
padding: 6px 14px;
|
||||
border: 1px solid #555;
|
||||
border-radius: 6px;
|
||||
background: #313244;
|
||||
color: #cdd6f4;
|
||||
cursor: pointer;
|
||||
font-size: 13px;
|
||||
white-space: nowrap;
|
||||
min-width: 140px;
|
||||
text-align: left;
|
||||
}
|
||||
.bulk-classify-dropdown-btn::after {
|
||||
content: " ▾";
|
||||
float: right;
|
||||
margin-left: 8px;
|
||||
}
|
||||
.bulk-classify-dropdown-menu {
|
||||
display: none;
|
||||
position: absolute;
|
||||
bottom: 100%;
|
||||
left: 0;
|
||||
margin-bottom: 4px;
|
||||
background: #313244;
|
||||
border: 1px solid #555;
|
||||
border-radius: 6px;
|
||||
max-height: 250px;
|
||||
overflow-y: auto;
|
||||
min-width: 180px;
|
||||
box-shadow: 0 -4px 16px rgba(0,0,0,0.4);
|
||||
z-index: 100000;
|
||||
}
|
||||
.bulk-classify-dropdown-menu.open {
|
||||
display: block;
|
||||
}
|
||||
.bulk-classify-dropdown-item {
|
||||
padding: 8px 14px;
|
||||
cursor: pointer;
|
||||
font-size: 13px;
|
||||
color: #cdd6f4;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.bulk-classify-dropdown-item:hover {
|
||||
background: #45475a;
|
||||
}
|
||||
.bulk-classify-dropdown-item.active {
|
||||
background: #89b4fa;
|
||||
color: #1e1e2e;
|
||||
}
|
||||
#${TOOLBAR_ID} .count {
|
||||
font-weight: 600;
|
||||
min-width: 30px;
|
||||
text-align: center;
|
||||
}
|
||||
#${TOOLBAR_ID} .separator {
|
||||
width: 1px;
|
||||
height: 24px;
|
||||
background: #555;
|
||||
}
|
||||
#${TOOLBAR_ID} .progress {
|
||||
font-size: 12px;
|
||||
color: #a6adc8;
|
||||
}
|
||||
.bulk-classify-checkbox {
|
||||
position: absolute;
|
||||
top: 6px;
|
||||
left: 6px;
|
||||
z-index: 9999;
|
||||
width: 22px;
|
||||
height: 22px;
|
||||
cursor: pointer;
|
||||
accent-color: #89b4fa;
|
||||
pointer-events: auto;
|
||||
}
|
||||
.bulk-classify-selected {
|
||||
outline: 3px solid #89b4fa !important;
|
||||
outline-offset: -3px;
|
||||
}
|
||||
.bulk-classify-overlay {
|
||||
position: fixed;
|
||||
inset: 0;
|
||||
z-index: 99998;
|
||||
background: rgba(0,0,0,0.6);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
}
|
||||
.bulk-classify-dialog {
|
||||
background: #1e1e2e;
|
||||
border: 1px solid #444;
|
||||
border-radius: 12px;
|
||||
padding: 24px;
|
||||
min-width: 350px;
|
||||
color: #cdd6f4;
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||
}
|
||||
.bulk-classify-dialog h3 {
|
||||
margin: 0 0 16px;
|
||||
font-size: 16px;
|
||||
}
|
||||
.bulk-classify-dialog .progress-bar {
|
||||
width: 100%;
|
||||
height: 8px;
|
||||
background: #313244;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
margin: 12px 0;
|
||||
}
|
||||
.bulk-classify-dialog .progress-fill {
|
||||
height: 100%;
|
||||
background: #89b4fa;
|
||||
transition: width 0.2s;
|
||||
}
|
||||
.bulk-classify-dialog .status {
|
||||
font-size: 13px;
|
||||
color: #a6adc8;
|
||||
}
|
||||
`;
|
||||
document.head.appendChild(style);
|
||||
|
||||
// --- Helper: find all training image cards ---
|
||||
// Collect one descriptor per distinct training image currently in the DOM.
// Each descriptor is { element, filename, img, groupFiles }, where `element`
// is the container the checkbox is attached to.
function getImageCards() {
  // Training images use src="/clips/{modelName}/train/{filename}"
  // e.g. 1770573871.602803-in4y00-1770573889.027752-none-1.0.webp
  const trainSrc = /\/clips\/[^/]+\/train\/([^/?#]+)/;
  const visited = new Set();
  const found = [];

  document.querySelectorAll("img").forEach((img) => {
    const hit = (img.getAttribute("src") || "").match(trainSrc);
    if (!hit) return;

    const filename = hit[1];
    if (visited.has(filename)) return; // same file rendered twice
    visited.add(filename);

    // Climb to the card container (Frigate uses aspect-square divs),
    // falling back to nearby ancestors when no class matches.
    const element =
      img.closest("[class*='aspect-']") ||
      img.closest("[class*='card']") ||
      img.parentElement?.parentElement ||
      img.parentElement;

    found.push({ element, filename, img, groupFiles: getGroupFiles(filename) });
  });

  return found;
}
|
||||
|
||||
// --- Debug: log what images we found ---
|
||||
const debugImgs = document.querySelectorAll("img");
|
||||
const debugSrcs = Array.from(debugImgs)
|
||||
.map((i) => i.getAttribute("src"))
|
||||
.filter(Boolean);
|
||||
console.log(
|
||||
`[bulk-classify] Found ${debugSrcs.length} <img> elements. Sample srcs:`,
|
||||
debugSrcs.slice(0, 5),
|
||||
);
|
||||
const initialCards = getImageCards();
|
||||
console.log(
|
||||
`[bulk-classify] Matched ${initialCards.length} training image cards.`,
|
||||
);
|
||||
|
||||
// --- Add checkboxes to all cards ---
|
||||
// Attach a selection checkbox (and a group-size badge when the event has
// more than one image) to every training card that does not have one yet.
// Safe to call repeatedly; cards that already carry a checkbox are skipped.
function injectCheckboxes() {
  const cards = getImageCards();
  for (const { element, filename, groupFiles } of cards) {
    if (element.querySelector(".bulk-classify-checkbox")) continue;

    // Ensure relative positioning for the absolutely positioned checkbox
    element.style.position = "relative";

    const cb = document.createElement("input");
    cb.type = "checkbox";
    cb.className = "bulk-classify-checkbox";
    cb.dataset.filename = filename;
    cb.checked = selected.has(filename);
    // Keep the highlight in step with the restored checkbox state; before,
    // a re-injected card could be checked yet show no selection outline.
    element.classList.toggle("bulk-classify-selected", cb.checked);

    // Show group count badge next to checkbox if group has >1 image
    let badge = null;
    if (groupFiles.length > 1) {
      badge = document.createElement("span");
      badge.className = "bulk-classify-badge";
      badge.textContent = groupFiles.length;
      badge.style.cssText =
        "position:absolute;top:6px;left:32px;z-index:9999;background:#89b4fa;color:#1e1e2e;" +
        "font-size:11px;font-weight:700;padding:1px 5px;border-radius:8px;pointer-events:none;";
    }

    cb.addEventListener("change", (e) => {
      e.stopPropagation();
      if (cb.checked) {
        // Select ALL files in this event group
        for (const f of groupFiles) selected.add(f);
        element.classList.add("bulk-classify-selected");
      } else {
        for (const f of groupFiles) selected.delete(f);
        element.classList.remove("bulk-classify-selected");
      }
      updateCount();
    });

    // Also allow clicking the image to toggle
    element.addEventListener("click", (e) => {
      // Don't intercept if clicking the checkbox itself, a button or a link
      if (
        e.target === cb ||
        e.target.closest("button") ||
        e.target.closest("a")
      )
        return;
      e.preventDefault();
      e.stopPropagation();
      cb.checked = !cb.checked;
      // Setting .checked from script fires no event, so raise it by hand.
      cb.dispatchEvent(new Event("change"));
    });

    element.prepend(cb);
    if (badge) element.appendChild(badge);
  }
}
|
||||
|
||||
// --- Toolbar ---
|
||||
const toolbar = document.createElement("div");
|
||||
toolbar.id = TOOLBAR_ID;
|
||||
|
||||
const countLabel = document.createElement("span");
|
||||
countLabel.className = "count";
|
||||
countLabel.textContent = "0";
|
||||
|
||||
const countText = document.createElement("span");
|
||||
countText.textContent = "selected";
|
||||
|
||||
const sep1 = document.createElement("div");
|
||||
sep1.className = "separator";
|
||||
|
||||
const selectAllBtn = document.createElement("button");
|
||||
selectAllBtn.textContent = "Select All";
|
||||
selectAllBtn.addEventListener("click", () => {
|
||||
const cards = getImageCards();
|
||||
for (const { element, groupFiles } of cards) {
|
||||
for (const f of groupFiles) selected.add(f);
|
||||
element.classList.add("bulk-classify-selected");
|
||||
const cb = element.querySelector(".bulk-classify-checkbox");
|
||||
if (cb) cb.checked = true;
|
||||
}
|
||||
updateCount();
|
||||
});
|
||||
|
||||
const deselectBtn = document.createElement("button");
|
||||
deselectBtn.textContent = "Deselect All";
|
||||
deselectBtn.addEventListener("click", () => {
|
||||
const cards = getImageCards();
|
||||
for (const { element, groupFiles } of cards) {
|
||||
for (const f of groupFiles) selected.delete(f);
|
||||
element.classList.remove("bulk-classify-selected");
|
||||
const cb = element.querySelector(".bulk-classify-checkbox");
|
||||
if (cb) cb.checked = false;
|
||||
}
|
||||
updateCount();
|
||||
});
|
||||
|
||||
const sep2 = document.createElement("div");
|
||||
sep2.className = "separator";
|
||||
|
||||
// --- Custom dropdown (replaces native <select> which React intercepts) ---
|
||||
let selectedCategory = "";
|
||||
const dropdown = document.createElement("div");
|
||||
dropdown.className = "bulk-classify-dropdown";
|
||||
|
||||
const dropdownBtn = document.createElement("div");
|
||||
dropdownBtn.className = "bulk-classify-dropdown-btn";
|
||||
dropdownBtn.textContent = "-- pick category --";
|
||||
|
||||
const dropdownMenu = document.createElement("div");
|
||||
dropdownMenu.className = "bulk-classify-dropdown-menu";
|
||||
|
||||
// Rebuild the dropdown entries from `categories`, marking the currently
// chosen one as active. Picking an entry stores it, updates the button
// label, closes the menu and rebuilds the list.
function buildMenuItems() {
  dropdownMenu.innerHTML = "";

  categories.forEach((cat) => {
    const entry = document.createElement("div");
    entry.className =
      cat === selectedCategory
        ? "bulk-classify-dropdown-item active"
        : "bulk-classify-dropdown-item";
    entry.textContent = cat;

    entry.addEventListener("mousedown", (evt) => {
      evt.preventDefault();
      evt.stopPropagation();
      selectedCategory = cat;
      dropdownBtn.textContent = cat;
      dropdownMenu.classList.remove("open");
      buildMenuItems(); // refresh which entry is highlighted
    });

    dropdownMenu.appendChild(entry);
  });
}
|
||||
buildMenuItems();
|
||||
|
||||
dropdownBtn.addEventListener("mousedown", (e) => {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
dropdownMenu.classList.toggle("open");
|
||||
});
|
||||
|
||||
// Close dropdown when clicking outside
|
||||
document.addEventListener("mousedown", (e) => {
|
||||
if (!dropdown.contains(e.target)) {
|
||||
dropdownMenu.classList.remove("open");
|
||||
}
|
||||
});
|
||||
|
||||
dropdown.appendChild(dropdownBtn);
|
||||
dropdown.appendChild(dropdownMenu);
|
||||
|
||||
// Allow typing a new category
|
||||
const newCatInput = document.createElement("input");
|
||||
newCatInput.type = "text";
|
||||
newCatInput.placeholder = "or type new...";
|
||||
newCatInput.style.cssText =
|
||||
"padding:6px 10px;border:1px solid #555;border-radius:6px;background:#313244;color:#cdd6f4;font-size:13px;width:120px;";
|
||||
|
||||
const categorizeBtn = document.createElement("button");
|
||||
categorizeBtn.className = "primary";
|
||||
categorizeBtn.textContent = "Categorize Selected";
|
||||
|
||||
const deleteBtn = document.createElement("button");
|
||||
deleteBtn.className = "danger";
|
||||
deleteBtn.textContent = "Delete Selected";
|
||||
|
||||
toolbar.append(
|
||||
countLabel,
|
||||
countText,
|
||||
sep1,
|
||||
selectAllBtn,
|
||||
deselectBtn,
|
||||
sep2,
|
||||
dropdown,
|
||||
newCatInput,
|
||||
categorizeBtn,
|
||||
deleteBtn,
|
||||
);
|
||||
|
||||
// Prevent events from bubbling out of toolbar to React's root handler
|
||||
for (const evt of ["click", "mousedown", "mouseup", "pointerdown", "pointerup", "focus", "blur"]) {
|
||||
toolbar.addEventListener(evt, (e) => e.stopPropagation());
|
||||
}
|
||||
|
||||
document.body.appendChild(toolbar);
|
||||
|
||||
// Mirror the size of the selection into the toolbar counter and grey out
// the categorize button while nothing is selected.
function updateCount() {
  const picked = selected.size;
  categorizeBtn.disabled = picked === 0;
  countLabel.textContent = picked;
}
|
||||
|
||||
// --- Progress dialog ---
|
||||
// Show a modal progress dialog.
//   title - heading text (rendered as plain text, never as markup)
//   total - number of steps the caller will report
// Returns { update(current, errorMsg?), close() }.
function showProgress(title, total) {
  const overlay = document.createElement("div");
  overlay.className = "bulk-classify-overlay";
  const dialog = document.createElement("div");
  dialog.className = "bulk-classify-dialog";

  // Built node by node with textContent: the title carries the category
  // name typed by the user, which must not be parsed as HTML.
  const heading = document.createElement("h3");
  heading.textContent = title;

  const status = document.createElement("div");
  status.className = "status";
  status.textContent = `0 / ${total}`;

  const bar = document.createElement("div");
  bar.className = "progress-bar";
  const fill = document.createElement("div");
  fill.className = "progress-fill";
  fill.style.width = "0%";
  bar.appendChild(fill);

  const errors = document.createElement("div");
  errors.className = "errors";
  // pre-line makes the "\n" appended after each message an actual line break.
  errors.style.cssText =
    "color:#f38ba8;font-size:12px;margin-top:8px;white-space:pre-line";

  dialog.append(heading, status, bar, errors);
  overlay.appendChild(dialog);
  document.body.appendChild(overlay);

  return {
    update(current, errorMsg) {
      // Guard the division so a zero total shows a full bar, not "NaN%".
      const pct = total > 0 ? Math.round((current / total) * 100) : 100;
      status.textContent = `${current} / ${total}`;
      fill.style.width = pct + "%";
      if (errorMsg) {
        errors.textContent += errorMsg + "\n";
      }
    },
    close() {
      overlay.remove();
    },
  };
}
|
||||
|
||||
// --- Categorize handler ---
|
||||
// POST /api/classification/{modelName}/dataset/categorize
|
||||
// body: { category: "...", training_file: "..." }
|
||||
// Categorize every selected training file, one request per file, under the
// typed category (if any) or else the one picked in the dropdown.
categorizeBtn.addEventListener("click", async () => {
  const category = newCatInput.value.trim() || selectedCategory;
  if (!category) {
    alert("Select a category or type a new one.");
    return;
  }
  if (selected.size === 0) {
    alert("No images selected.");
    return;
  }

  const files = Array.from(selected);
  if (!confirm(`Categorize ${files.length} image(s) as "${category}"?`)) return;

  const progress = showProgress(`Categorizing as "${category}"`, files.length);
  const endpoint = `${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset/categorize`;
  let errors = 0;

  for (let i = 0; i < files.length; i++) {
    const file = files[i];
    let failure = null;
    try {
      const resp = await fetch(endpoint, {
        method: "POST",
        headers: API_HEADERS,
        body: JSON.stringify({
          category: category,
          training_file: file,
        }),
      });
      if (resp.ok) {
        // Drop finished files right away so that a retry after a partial
        // failure only re-sends the ones that did not go through.
        selected.delete(file);
      } else {
        failure = `Failed: ${file} - ${await resp.text()}`;
      }
    } catch (e) {
      failure = `Error: ${file} - ${e.message}`;
    }

    if (failure) {
      errors++;
      // The dialog closes shortly after the loop; the console keeps the
      // details that the final alert points the user to.
      console.error(`[bulk-classify] ${failure}`);
      progress.update(i + 1, failure);
    } else {
      progress.update(i + 1);
    }
  }

  // Short delay so the completed progress bar is visible before it closes.
  setTimeout(() => {
    progress.close();
    if (errors === 0) {
      selected.clear();
      updateCount();
      alert(
        `Done! ${files.length} image(s) categorized as "${category}".\nRefreshing the training view...`,
      );
      window.location.reload();
    } else {
      updateCount();
      alert(
        `Completed with ${errors} error(s). Check console for details.`,
      );
    }
  }, 500);
});
|
||||
|
||||
// --- Delete handler ---
|
||||
// POST /api/classification/{modelName}/train/delete
|
||||
// body: { ids: ["filename1", "filename2", ...] }
|
||||
// Delete every selected training file with a single bulk request.
deleteBtn.addEventListener("click", async () => {
  if (selected.size === 0) {
    alert("No images selected.");
    return;
  }

  const files = Array.from(selected);
  if (!confirm(`DELETE ${files.length} training image(s)? This cannot be undone.`)) return;

  const progress = showProgress("Deleting training images", 1);
  let failure = null;

  try {
    const resp = await fetch(
      `${API_BASE}/classification/${encodeURIComponent(modelName)}/train/delete`,
      {
        method: "POST",
        headers: API_HEADERS,
        body: JSON.stringify({ ids: files }),
      },
    );
    if (!resp.ok) {
      failure = `Failed: ${await resp.text()}`;
    }
  } catch (e) {
    failure = `Error: ${e.message}`;
  }

  if (failure) {
    console.error(`[bulk-classify] Delete request: ${failure}`);
    progress.update(1, failure);
  } else {
    progress.update(1);
  }

  // Short delay so the progress dialog is visible before it closes.
  setTimeout(() => {
    progress.close();
    if (failure) {
      // Do not claim success, and keep the selection so the user can retry.
      alert(`Delete request did not succeed.\n${failure}`);
      return;
    }
    selected.clear();
    updateCount();
    alert(`Deleted ${files.length} training image(s).\nRefreshing...`);
    window.location.reload();
  }, 500);
});
|
||||
|
||||
// --- Initial injection + MutationObserver for dynamic loading ---
|
||||
injectCheckboxes();
|
||||
|
||||
const observer = new MutationObserver(() => {
|
||||
injectCheckboxes();
|
||||
});
|
||||
observer.observe(document.body, { childList: true, subtree: true });
|
||||
|
||||
updateCount();
|
||||
console.log(
|
||||
`Bulk classifier active for model "${modelName}". ${categories.length} categories found: [${categories.join(", ")}]`,
|
||||
);
|
||||
})();
|
||||
|
|
@ -1,305 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
// Frigate Classification Page Inspector
|
||||
// Phase 1: Fetch API data via HTTP to understand the data model
|
||||
// Phase 2: Fetch the classification page HTML and parse its DOM structure
|
||||
// No browser needed — uses plain HTTP requests.
|
||||
|
||||
import { spawn } from "child_process";
|
||||
import http from "http";
|
||||
|
||||
const KUBE_CONFIG = `${process.cwd()}/config`;
|
||||
const LOCAL_PORT = 15000;
|
||||
const FRIGATE_NS = "frigate";
|
||||
const FRIGATE_SVC = "svc/frigate";
|
||||
const FRIGATE_PORT = 80;
|
||||
const BASE_URL = `http://localhost:${LOCAL_PORT}`;
|
||||
|
||||
// Spawn `kubectl port-forward` to the Frigate service and wait until it
// reports that it is forwarding.
// Resolves with the child process (the caller is responsible for killing it).
// Rejects if kubectl cannot be spawned, exits or is killed first, or does
// not become ready within 15 seconds; in those cases the child is
// terminated before the error is propagated so it is not left running.
async function startPortForward() {
  console.log(
    `[port-forward] Starting: kubectl port-forward ${FRIGATE_SVC} ${LOCAL_PORT}:${FRIGATE_PORT} -n ${FRIGATE_NS}`,
  );
  const proc = spawn(
    "kubectl",
    [
      "--kubeconfig",
      KUBE_CONFIG,
      "port-forward",
      FRIGATE_SVC,
      `${LOCAL_PORT}:${FRIGATE_PORT}`,
      "-n",
      FRIGATE_NS,
    ],
    { stdio: ["ignore", "pipe", "pipe"] },
  );

  try {
    await new Promise((resolve, reject) => {
      const timer = setTimeout(
        () => reject(new Error("Port-forward timed out")),
        15000,
      );
      proc.stdout.on("data", (data) => {
        if (data.toString().includes("Forwarding from")) {
          clearTimeout(timer);
          resolve();
        }
      });
      proc.stderr.on("data", (data) => {
        console.error(`[port-forward stderr] ${data.toString().trim()}`);
      });
      proc.on("error", (err) => {
        clearTimeout(timer);
        reject(err);
      });
      proc.on("exit", (code, signal) => {
        if (code === 0) return;
        clearTimeout(timer);
        // code is null when the child was terminated by a signal; report
        // that at once instead of waiting for the timeout. Once the promise
        // has settled (e.g. normal shutdown later on) this is a no-op.
        reject(
          new Error(
            code === null
              ? `port-forward terminated by signal ${signal}`
              : `port-forward exited with code ${code}`,
          ),
        );
      });
    });
  } catch (err) {
    // The caller never receives `proc` on failure, so clean it up here.
    proc.kill();
    throw err;
  }

  console.log("[port-forward] Ready");
  return proc;
}
|
||||
|
||||
// GET `path` from the port-forwarded Frigate instance.
// Resolves with { status, body, headers } where body is the response decoded
// as UTF-8 text; rejects on request or response-stream errors.
function httpGet(path) {
  return new Promise((resolve, reject) => {
    const url = `${BASE_URL}${path}`;
    http
      .get(url, (res) => {
        // Decode in the stream: concatenating raw Buffer chunks converts each
        // chunk on its own and mangles characters split across two chunks.
        res.setEncoding("utf8");
        let body = "";
        res.on("data", (chunk) => (body += chunk));
        res.on("end", () =>
          resolve({ status: res.statusCode, body, headers: res.headers }),
        );
        // A connection dropped mid-body surfaces here, not on the request.
        res.on("error", (err) => reject(err));
      })
      .on("error", (err) => reject(err));
  });
}
|
||||
|
||||
async function main() {
|
||||
let portForwardProc = null;
|
||||
|
||||
try {
|
||||
portForwardProc = await startPortForward();
|
||||
|
||||
// ================================================================
|
||||
// API INSPECTION
|
||||
// ================================================================
|
||||
console.log("\n" + "=".repeat(80));
|
||||
console.log("API INSPECTION");
|
||||
console.log("=".repeat(80));
|
||||
|
||||
// Get config to find model names
|
||||
const configResp = await httpGet("/api/config");
|
||||
let modelNames = [];
|
||||
if (configResp.status === 200) {
|
||||
try {
|
||||
const config = JSON.parse(configResp.body);
|
||||
// Custom classification models are under config.classification.custom
|
||||
const classificationModels = config.classification?.custom || {};
|
||||
modelNames = Object.keys(classificationModels);
|
||||
console.log(
|
||||
`\n[API] /api/config - Classification models: ${JSON.stringify(modelNames)}`,
|
||||
);
|
||||
console.log(
|
||||
`[API] Classification config:\n${JSON.stringify(config.classification, null, 2)}`,
|
||||
);
|
||||
} catch (e) {
|
||||
console.log(`[API] /api/config - Failed to parse: ${e.message}`);
|
||||
console.log(
|
||||
`[API] Raw (first 500): ${configResp.body.slice(0, 500)}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
console.log(`[API] /api/config - HTTP ${configResp.status}`);
|
||||
}
|
||||
|
||||
for (const model of modelNames) {
|
||||
console.log(`\n--- Model: ${model} ---`);
|
||||
const encodedModel = encodeURIComponent(model);
|
||||
|
||||
// Dataset endpoint
|
||||
const datasetResp = await httpGet(
|
||||
`/api/classification/${encodedModel}/dataset`,
|
||||
);
|
||||
if (datasetResp.status === 200) {
|
||||
try {
|
||||
const dataset = JSON.parse(datasetResp.body);
|
||||
// Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
|
||||
const cats = dataset.categories || dataset;
|
||||
const categories = Object.keys(cats);
|
||||
console.log(`[API] /api/classification/${model}/dataset`);
|
||||
console.log(` Categories: ${JSON.stringify(categories)}`);
|
||||
for (const cat of categories) {
|
||||
const items = Array.isArray(cats[cat]) ? cats[cat] : [];
|
||||
console.log(
|
||||
` "${cat}": ${items.length} items, first 3: ${JSON.stringify(items.slice(0, 3))}`,
|
||||
);
|
||||
}
|
||||
if (dataset.training_metadata) {
|
||||
console.log(` Training metadata: ${JSON.stringify(dataset.training_metadata, null, 2)}`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(` Failed to parse dataset: ${e.message}`);
|
||||
}
|
||||
} else {
|
||||
console.log(
|
||||
`[API] /api/classification/${model}/dataset - HTTP ${datasetResp.status}: ${datasetResp.body.slice(0, 200)}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Train endpoint
|
||||
const trainResp = await httpGet(
|
||||
`/api/classification/${encodedModel}/train`,
|
||||
);
|
||||
if (trainResp.status === 200) {
|
||||
try {
|
||||
const train = JSON.parse(trainResp.body);
|
||||
const entries = Array.isArray(train) ? train : Object.entries(train);
|
||||
console.log(`[API] /api/classification/${model}/train`);
|
||||
console.log(
|
||||
` Type: ${Array.isArray(train) ? "array" : typeof train}, length/keys: ${Array.isArray(train) ? train.length : Object.keys(train).length}`,
|
||||
);
|
||||
console.log(
|
||||
` First 5 entries:\n${JSON.stringify(entries.slice(0, 5), null, 2)}`,
|
||||
);
|
||||
} catch (e) {
|
||||
console.log(` Failed to parse train: ${e.message}`);
|
||||
}
|
||||
} else {
|
||||
console.log(
|
||||
`[API] /api/classification/${model}/train - HTTP ${trainResp.status}: ${trainResp.body.slice(0, 200)}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Try to get a thumbnail URL to understand the image src pattern
|
||||
if (trainResp.status === 200) {
|
||||
try {
|
||||
const train = JSON.parse(trainResp.body);
|
||||
const firstFile = Array.isArray(train) ? train[0] : null;
|
||||
if (firstFile) {
|
||||
// Try various thumbnail URL patterns
|
||||
const patterns = [
|
||||
`/api/classification/${encodedModel}/train/${firstFile}/thumbnail.jpg`,
|
||||
`/api/classification/${encodedModel}/train/${firstFile}`,
|
||||
`/clips/${encodedModel}/train/${firstFile}`,
|
||||
];
|
||||
for (const p of patterns) {
|
||||
const resp = await httpGet(p);
|
||||
console.log(
|
||||
` Thumbnail URL test: ${p} -> HTTP ${resp.status} (content-type: ${resp.headers["content-type"]}, size: ${resp.body.length})`,
|
||||
);
|
||||
}
|
||||
}
|
||||
} catch (_) {}
|
||||
}
|
||||
}
|
||||
|
||||
// ================================================================
|
||||
// HTML/DOM INSPECTION
|
||||
// ================================================================
|
||||
console.log("\n" + "=".repeat(80));
|
||||
console.log("HTML / DOM INSPECTION");
|
||||
console.log("=".repeat(80));
|
||||
|
||||
// Fetch the main classification page HTML
|
||||
const classifPageResp = await httpGet("/classification");
|
||||
console.log(
|
||||
`\n[HTML] /classification - HTTP ${classifPageResp.status} (${classifPageResp.body.length} bytes)`,
|
||||
);
|
||||
|
||||
// This is likely a React SPA, so the HTML will be minimal. Let's check.
|
||||
const html = classifPageResp.body;
|
||||
console.log(`[HTML] First 2000 chars:\n${html.slice(0, 2000)}`);
|
||||
|
||||
// Check for any JS bundle references (to find source maps or component names)
|
||||
const scriptMatches = html.match(/<script[^>]*src="([^"]+)"[^>]*>/g) || [];
|
||||
console.log(`\n[HTML] Script tags: ${scriptMatches.length}`);
|
||||
for (const s of scriptMatches) {
|
||||
console.log(` ${s}`);
|
||||
}
|
||||
|
||||
// Fetch the main JS bundle to look for classification component code
|
||||
const jsMatch = html.match(/src="(\/assets\/[^"]+\.js)"/);
|
||||
if (jsMatch) {
|
||||
console.log(`\n[JS] Fetching main bundle: ${jsMatch[1]}`);
|
||||
const jsResp = await httpGet(jsMatch[1]);
|
||||
if (jsResp.status === 200) {
|
||||
const js = jsResp.body;
|
||||
console.log(`[JS] Bundle size: ${js.length} bytes`);
|
||||
|
||||
// Search for classification-related code patterns
|
||||
const searchTerms = [
|
||||
"classify image as",
|
||||
"Classify image as",
|
||||
"categorize",
|
||||
"/classification/",
|
||||
"dataset/categorize",
|
||||
"training_file",
|
||||
"train/delete",
|
||||
"ModelTraining",
|
||||
"classification",
|
||||
];
|
||||
for (const term of searchTerms) {
|
||||
const idx = js.indexOf(term);
|
||||
if (idx !== -1) {
|
||||
const context = js.slice(Math.max(0, idx - 200), idx + 200);
|
||||
console.log(`\n[JS] Found "${term}" at offset ${idx}:`);
|
||||
console.log(` ...${context}...`);
|
||||
}
|
||||
}
|
||||
|
||||
// Look for the dropdown/select implementation
|
||||
const selectTerms = [
|
||||
"combobox",
|
||||
"listbox",
|
||||
"SelectTrigger",
|
||||
"SelectContent",
|
||||
"SelectItem",
|
||||
"Select>",
|
||||
"DropdownMenu",
|
||||
];
|
||||
for (const term of selectTerms) {
|
||||
const idx = js.indexOf(term);
|
||||
if (idx !== -1) {
|
||||
const context = js.slice(Math.max(0, idx - 150), idx + 150);
|
||||
console.log(`\n[JS] Found "${term}" at offset ${idx}:`);
|
||||
console.log(` ...${context}...`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also check if there are multiple JS chunks
|
||||
const allJsMatches =
|
||||
html.match(/src="(\/assets\/[^"]+\.js)"/g) || [];
|
||||
console.log(`\n[JS] All JS assets: ${allJsMatches.length}`);
|
||||
for (const m of allJsMatches) {
|
||||
const path = m.match(/src="([^"]+)"/)?.[1];
|
||||
if (path) console.log(` ${path}`);
|
||||
}
|
||||
|
||||
// Try to fetch the Frigate source for classification view from GitHub
|
||||
console.log("\n" + "=".repeat(80));
|
||||
console.log("FRIGATE VERSION");
|
||||
console.log("=".repeat(80));
|
||||
|
||||
const versionResp = await httpGet("/api/version");
|
||||
if (versionResp.status === 200) {
|
||||
console.log(`[API] Frigate version: ${versionResp.body}`);
|
||||
}
|
||||
|
||||
console.log("\n" + "=".repeat(80));
|
||||
console.log("INSPECTION COMPLETE");
|
||||
console.log("=".repeat(80));
|
||||
} catch (err) {
|
||||
console.error(`\n[ERROR] ${err.message}`);
|
||||
console.error(err.stack);
|
||||
} finally {
|
||||
if (portForwardProc) {
|
||||
console.log("\n[cleanup] Killing port-forward...");
|
||||
portForwardProc.kill("SIGTERM");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point. If main() rejects, log the error and mark the process as
// failed so callers (shells, CI) do not see exit code 0 for a broken run.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
|
|
@ -1,511 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Generate Terragrunt service stack files for all app-level services."""
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Each service: (module_name, source_dir, [(arg_name, var_expr), ...])
|
||||
# var_expr is what goes on the right side of = in the module call.
|
||||
# If var_expr starts with "var.", it's a variable passthrough and we declare the variable.
|
||||
# If it's a literal string, we inline it.
|
||||
# Special: "LOCAL_TIER:<tier>" means we use local.tiers.<tier>
|
||||
SERVICES = [
|
||||
("blog", "blog", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("descheduler", "descheduler", []),
|
||||
("f1-stream", "f1-stream", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("turn_secret", "var.coturn_turn_secret"),
|
||||
("public_ip", "var.public_ip"),
|
||||
]),
|
||||
("coturn", "coturn", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
("turn_secret", "var.coturn_turn_secret"),
|
||||
("public_ip", "var.public_ip"),
|
||||
]),
|
||||
("hackmd", "hackmd", [
|
||||
("hackmd_db_password", "var.hackmd_db_password"),
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("kms", "kms", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("k8s-dashboard", "k8s-dashboard", [
|
||||
("tier", "LOCAL_TIER:cluster"),
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("client_certificate_secret_name", "var.client_certificate_secret_name"),
|
||||
]),
|
||||
("privatebin", "privatebin", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("reloader", "reloader", [
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("shadowsocks", "shadowsocks", [
|
||||
("password", "var.shadowsocks_password"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("city-guesser", "city-guesser", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("echo", "echo", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("url", "url-shortener", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("geolite_license_key", "var.url_shortener_geolite_license_key"),
|
||||
("api_key", "var.url_shortener_api_key"),
|
||||
("mysql_password", "var.url_shortener_mysql_password"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("webhook_handler", "webhook_handler", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("webhook_secret", "var.webhook_handler_secret"),
|
||||
("fb_verify_token", "var.webhook_handler_fb_verify_token"),
|
||||
("fb_page_token", "var.webhook_handler_fb_page_token"),
|
||||
("fb_app_secret", "var.webhook_handler_fb_app_secret"),
|
||||
("git_user", "var.webhook_handler_git_user"),
|
||||
("git_token", "var.webhook_handler_git_token"),
|
||||
("ssh_key", "var.webhook_handler_ssh_key"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("excalidraw", "excalidraw", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("travel_blog", "travel_blog", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("dashy", "dashy", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("send", "send", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("ytdlp", "youtube_dl", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("openrouter_api_key", "var.openrouter_api_key"),
|
||||
("slack_bot_token", "var.slack_bot_token"),
|
||||
("slack_channel", "var.slack_channel"),
|
||||
]),
|
||||
("immich", "immich", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("postgresql_password", "var.immich_postgresql_password"),
|
||||
("frame_api_key", "var.immich_frame_api_key"),
|
||||
("homepage_token", 'var.homepage_credentials["immich"]["token"]'),
|
||||
("tier", "LOCAL_TIER:gpu"),
|
||||
]),
|
||||
("resume", "resume", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("database_url", "var.resume_database_url"),
|
||||
("auth_secret", "var.resume_auth_secret"),
|
||||
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
|
||||
]),
|
||||
("frigate", "frigate", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:gpu"),
|
||||
]),
|
||||
("paperless-ngx", "paperless-ngx", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("db_password", "var.paperless_db_password"),
|
||||
("homepage_username", 'var.homepage_credentials["paperless-ngx"]["username"]'),
|
||||
("homepage_password", 'var.homepage_credentials["paperless-ngx"]["password"]'),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("jsoncrack", "jsoncrack", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("servarr", "servarr", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("aiostreams_database_connection_string", "var.aiostreams_database_connection_string"),
|
||||
]),
|
||||
("ollama", "ollama", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:gpu"),
|
||||
("ollama_api_credentials", "var.ollama_api_credentials"),
|
||||
]),
|
||||
("ntfy", "ntfy", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("cyberchef", "cyberchef", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("diun", "diun", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("diun_nfty_token", "var.diun_nfty_token"),
|
||||
("diun_slack_url", "var.diun_slack_url"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("meshcentral", "meshcentral", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("netbox", "netbox", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("nextcloud", "nextcloud", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("db_password", "var.nextcloud_db_password"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("homepage", "homepage", [
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
]),
|
||||
("matrix", "matrix", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("linkwarden", "linkwarden", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("postgresql_password", "var.linkwarden_postgresql_password"),
|
||||
("authentik_client_id", "var.linkwarden_authentik_client_id"),
|
||||
("authentik_client_secret", "var.linkwarden_authentik_client_secret"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("actualbudget", "actualbudget", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
("credentials", "var.actualbudget_credentials"),
|
||||
]),
|
||||
("owntracks", "owntracks", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("owntracks_credentials", "var.owntracks_credentials"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("dawarich", "dawarich", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("database_password", "var.dawarich_database_password"),
|
||||
("geoapify_api_key", "var.geoapify_api_key"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("changedetection", "changedetection", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("tandoor", "tandoor", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tandoor_database_password", "var.tandoor_database_password"),
|
||||
("tandoor_email_password", "var.tandoor_email_password"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("n8n", "n8n", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("postgresql_password", "var.n8n_postgresql_password"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("real-estate-crawler", "real-estate-crawler", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("db_password", "var.realestate_crawler_db_password"),
|
||||
("notification_settings", "var.realestate_crawler_notification_settings"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("osm_routing", "osm-routing", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("tor-proxy", "tor-proxy", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("onlyoffice", "onlyoffice", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("db_password", "var.onlyoffice_db_password"),
|
||||
("jwt_token", "var.onlyoffice_jwt_token"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("forgejo", "forgejo", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("freshrss", "freshrss", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("navidrome", "navidrome", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("networking-toolbox", "networking-toolbox", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("tuya-bridge", "tuya-bridge", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:cluster"),
|
||||
("tiny_tuya_api_key", "var.tiny_tuya_api_key"),
|
||||
("tiny_tuya_api_secret", "var.tiny_tuya_api_secret"),
|
||||
("tiny_tuya_service_secret", "var.tiny_tuya_service_secret"),
|
||||
("slack_url", "var.tiny_tuya_slack_url"),
|
||||
]),
|
||||
("stirling-pdf", "stirling-pdf", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("isponsorblocktv", "isponsorblocktv", [
|
||||
("tier", "LOCAL_TIER:edge"),
|
||||
]),
|
||||
("ebook2audiobook", "ebook2audiobook", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:gpu"),
|
||||
]),
|
||||
("rybbit", "rybbit", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("clickhouse_password", "var.clickhouse_password"),
|
||||
("postgres_password", "var.clickhouse_postgres_password"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("wealthfolio", "wealthfolio", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("wealthfolio_password_hash", "var.wealthfolio_password_hash"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("speedtest", "speedtest", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("db_password", "var.speedtest_db_password"),
|
||||
]),
|
||||
("freedify", "freedify", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
("additional_credentials", "var.freedify_credentials"),
|
||||
]),
|
||||
("affine", "affine", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("postgresql_password", "var.affine_postgresql_password"),
|
||||
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("plotting-book", "plotting-book", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("health", "health", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("postgresql_password", "var.health_postgresql_password"),
|
||||
("secret_key", "var.health_secret_key"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("whisper", "whisper", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("tier", "LOCAL_TIER:gpu"),
|
||||
]),
|
||||
("grampsweb", "grampsweb", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
("openclaw", "openclaw", [
|
||||
("tls_secret_name", "var.tls_secret_name"),
|
||||
("ssh_key", "var.openclaw_ssh_key"),
|
||||
("skill_secrets", "var.openclaw_skill_secrets"),
|
||||
("gemini_api_key", "var.gemini_api_key"),
|
||||
("llama_api_key", "var.llama_api_key"),
|
||||
("brave_api_key", "var.brave_api_key"),
|
||||
("modal_api_key", "var.modal_api_key"),
|
||||
("tier", "LOCAL_TIER:aux"),
|
||||
]),
|
||||
]
|
||||
|
||||
# Variable type overrides (var_name -> type declaration)
|
||||
VAR_TYPES = {
|
||||
"tls_secret_name": "string",
|
||||
"client_certificate_secret_name": "string",
|
||||
"public_ip": "string",
|
||||
"hackmd_db_password": "string",
|
||||
"shadowsocks_password": "string",
|
||||
"openrouter_api_key": "string",
|
||||
"slack_bot_token": "string",
|
||||
"slack_channel": "string",
|
||||
"ollama_api_credentials": "string",
|
||||
"clickhouse_password": "string",
|
||||
"clickhouse_postgres_password": "string",
|
||||
"wealthfolio_password_hash": "string",
|
||||
"speedtest_db_password": "string",
|
||||
"affine_postgresql_password": "string",
|
||||
"health_postgresql_password": "string",
|
||||
"health_secret_key": "string",
|
||||
"gemini_api_key": "string",
|
||||
"llama_api_key": "string",
|
||||
"brave_api_key": "string",
|
||||
"modal_api_key": "string",
|
||||
"coturn_turn_secret": "string",
|
||||
"onlyoffice_db_password": "string",
|
||||
"onlyoffice_jwt_token": "string",
|
||||
"resume_database_url": "string",
|
||||
"resume_auth_secret": "string",
|
||||
"nextcloud_db_password": "string",
|
||||
"paperless_db_password": "string",
|
||||
"diun_nfty_token": "string",
|
||||
"diun_slack_url": "string",
|
||||
"dawarich_database_password": "string",
|
||||
"geoapify_api_key": "string",
|
||||
"tandoor_database_password": "string",
|
||||
"tandoor_email_password": "string",
|
||||
"n8n_postgresql_password": "string",
|
||||
"realestate_crawler_db_password": "string",
|
||||
"immich_postgresql_password": "string",
|
||||
"immich_frame_api_key": "string",
|
||||
"linkwarden_postgresql_password": "string",
|
||||
"linkwarden_authentik_client_id": "string",
|
||||
"linkwarden_authentik_client_secret": "string",
|
||||
"aiostreams_database_connection_string": "string",
|
||||
"tiny_tuya_api_key": "string",
|
||||
"tiny_tuya_api_secret": "string",
|
||||
"tiny_tuya_service_secret": "string",
|
||||
"tiny_tuya_slack_url": "string",
|
||||
"url_shortener_geolite_license_key": "string",
|
||||
"url_shortener_api_key": "string",
|
||||
"url_shortener_mysql_password": "string",
|
||||
"webhook_handler_secret": "string",
|
||||
"webhook_handler_fb_verify_token": "string",
|
||||
"webhook_handler_fb_page_token": "string",
|
||||
"webhook_handler_fb_app_secret": "string",
|
||||
"webhook_handler_git_user": "string",
|
||||
"webhook_handler_git_token": "string",
|
||||
"webhook_handler_ssh_key": "string",
|
||||
"openclaw_ssh_key": "string",
|
||||
"openclaw_skill_secrets": "map(string)",
|
||||
"actualbudget_credentials": "map(any)",
|
||||
"freedify_credentials": "map(any)",
|
||||
"realestate_crawler_notification_settings": "map(string)",
|
||||
"homepage_credentials": "map(any)",
|
||||
"mailserver_accounts": "map(any)",
|
||||
"owntracks_credentials": "string",
|
||||
}
|
||||
|
||||
TERRAGRUNT_HCL = """\
|
||||
include "root" {
|
||||
path = find_in_parent_folders()
|
||||
}
|
||||
|
||||
dependency "platform" {
|
||||
config_path = "../platform"
|
||||
skip_outputs = true
|
||||
}
|
||||
"""
|
||||
|
||||
TIERS_BLOCK = """\
|
||||
locals {
|
||||
tiers = {
|
||||
core = "0-core"
|
||||
cluster = "1-cluster"
|
||||
gpu = "2-gpu"
|
||||
edge = "3-edge"
|
||||
aux = "4-aux"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def extract_var_name(expr):
    """Return the base Terraform variable name referenced by ``expr``.

    Args:
        expr: Right-hand side of a module argument, e.g. ``var.foo``,
            ``var.foo["a"]["b"]`` or ``var.foo.a.b``.

    Returns:
        The name following ``var.`` with any index (``[...]``) or attribute
        (``.attr``) access removed, or ``None`` when ``expr`` does not start
        with ``var.``.
    """
    if not expr.startswith("var."):
        return None
    name = expr[len("var."):]
    # Strip index access first, then attribute access. Without the "." case an
    # attribute-style reference such as var.creds.token would produce the
    # invalid variable name "creds.token". Dots inside a ["..."] key are
    # already gone by the time "." is searched for.
    for accessor in ("[", "."):
        cut = name.find(accessor)
        if cut != -1:
            name = name[:cut]
    return name
|
||||
|
||||
|
||||
def gen_main_tf(mod_name, source_dir, args):
    """Render the main.tf text for one service stack.

    Args:
        mod_name: Name used for the ``module`` block.
        source_dir: Directory under ``modules/kubernetes`` the module is sourced from.
        args: ``(arg_name, var_expr)`` pairs. A ``var_expr`` beginning with
            ``LOCAL_TIER:`` is rendered as ``local.tiers.<tier>``; anything else
            is emitted verbatim.

    Returns:
        The file content: one ``variable`` declaration per distinct ``var.*``
        reference (typed via VAR_TYPES when known), the tiers ``locals`` block
        if any argument uses a tier marker, then the ``module`` call.
    """
    tier_marker = "LOCAL_TIER:"

    # First-seen order of the distinct variables the module call refers to.
    declared = {}
    for _, expr in args:
        if expr.startswith(tier_marker):
            continue
        base = extract_var_name(expr)
        if base:
            declared.setdefault(base, VAR_TYPES.get(base))
    uses_tiers = any(expr.startswith(tier_marker) for _, expr in args)

    out = [
        f'variable "{base}" {{ type = {hcl_type} }}' if hcl_type else f'variable "{base}" {{}}'
        for base, hcl_type in declared.items()
    ]
    if declared:
        out.append("")
    if uses_tiers:
        out.append(TIERS_BLOCK)

    out.append(f'module "{mod_name}" {{')
    out.append(f' source = "../../modules/kubernetes/{source_dir}"')
    for key, expr in args:
        if expr.startswith(tier_marker):
            rhs = f"local.tiers.{expr.split(':')[1]}"
        else:
            rhs = expr
        # Left-align the argument name in a 30-character column.
        out.append(f" {key:30s} = {rhs}")
    out += ["}", ""]

    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """Write one Terragrunt stack directory per entry in SERVICES.

    For each service this creates ``<REPO_ROOT>/stacks/<module_name>/`` holding
    ``terragrunt.hcl`` (the shared TERRAGRUNT_HCL text), ``main.tf`` (rendered
    by gen_main_tf) and a ``secrets`` symlink to ``../../secrets``. The two
    files are overwritten on every run; an existing ``secrets`` entry is kept.
    """
    stacks_dir = os.path.join(REPO_ROOT, "stacks")

    for mod_name, source_dir, args in SERVICES:
        # The stack directory is named after the module, which may differ from
        # the module's source directory (e.g. "url" vs "url-shortener").
        stack_dir = os.path.join(stacks_dir, mod_name)
        os.makedirs(stack_dir, exist_ok=True)

        # terragrunt.hcl
        with open(os.path.join(stack_dir, "terragrunt.hcl"), "w") as f:
            f.write(TERRAGRUNT_HCL)

        # main.tf
        with open(os.path.join(stack_dir, "main.tf"), "w") as f:
            f.write(gen_main_tf(mod_name, source_dir, args))

        # secrets symlink. lexists() is also true for a dangling symlink, so a
        # re-run does not die with FileExistsError when the link target is
        # missing (exists() follows the link and reports False in that case).
        secrets_link = os.path.join(stack_dir, "secrets")
        if not os.path.lexists(secrets_link):
            os.symlink("../../secrets", secrets_link)

        print(f" Created stacks/{mod_name}/")

    print(f"\nGenerated {len(SERVICES)} service stacks")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# graceful-db-maintenance.sh — Scale down/up dependents of a service
|
||||
# based on the dependency.kyverno.io/wait-for pod annotation.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/graceful-db-maintenance.sh shutdown mysql.dbaas
|
||||
# # ... perform maintenance ...
|
||||
# ./scripts/graceful-db-maintenance.sh startup mysql.dbaas
|
||||
#
|
||||
# The shutdown action saves original replica counts to a state file
|
||||
# so startup can restore them exactly.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
ACTION="${1:-}"
|
||||
SERVICE="${2:-}"
|
||||
STATE_DIR="/tmp"
|
||||
|
||||
# Print the command synopsis and example invocations on stdout, then exit 1.
usage() {
  cat <<EOF
Usage: $0 <shutdown|startup> <service>

Examples:
 $0 shutdown mysql.dbaas # Scale down all MySQL dependents
 $0 startup mysql.dbaas # Restore all MySQL dependents
 $0 shutdown postgresql.dbaas # Scale down all PostgreSQL dependents
 $0 shutdown redis.redis # Scale down all Redis dependents
EOF
  exit 1
}
|
||||
|
||||
[[ -z "$ACTION" || -z "$SERVICE" ]] && usage
|
||||
[[ "$ACTION" != "shutdown" && "$ACTION" != "startup" ]] && usage
|
||||
|
||||
STATE_FILE="${STATE_DIR}/dep-maintenance-$(echo "$SERVICE" | tr '.' '-').json"
|
||||
KUBECONFIG="${KUBECONFIG:-$(dirname "$0")/../config}"
|
||||
export KUBECONFIG
|
||||
|
||||
# List the workloads whose pods declare a dependency on a service.
#
# Arguments: $1 - service identifier as it appears in the pods'
#                 dependency.kyverno.io/wait-for annotation (e.g. mysql.dbaas)
# Outputs:   one "<Kind>/<name>/<namespace>" line per workload on stdout,
#            sorted and de-duplicated. Kind is "Deployment" (for pods owned by
#            a ReplicaSet; its first owner is assumed to be a Deployment) or
#            "StatefulSet". Pods with any other owner are ignored.
# Returns:   0 when nothing matches, so a caller running under
#            `set -e -o pipefail` reaches its own "no dependents" handling.
find_dependent_owners() {
  local service="$1"
  local ns annotation owner_kind owner_name deploy_name

  # awk rather than grep, for three reasons:
  #  - the service is matched as a literal substring (grep treated the "."
  #    in e.g. mysql.dbaas as "any character");
  #  - only the annotation column is inspected (grep also matched namespace
  #    and owner names);
  #  - awk exits 0 when no row matches, whereas grep exits 1 and, with
  #    pipefail, made the whole function fail.
  kubectl get pods --all-namespaces \
    -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.annotations.dependency\.kyverno\.io/wait-for}{"\t"}{.metadata.ownerReferences[0].kind}{"\t"}{.metadata.ownerReferences[0].name}{"\n"}{end}' \
    2>/dev/null |
    awk -F '\t' -v svc="$service" 'index($2, svc) > 0' |
    while IFS=$'\t' read -r ns annotation owner_kind owner_name; do
      [[ -z "$owner_kind" || -z "$owner_name" ]] && continue
      case "$owner_kind" in
        ReplicaSet)
          # Resolve ReplicaSet -> Deployment; a failed lookup skips the pod.
          deploy_name=$(kubectl get replicaset "$owner_name" -n "$ns" \
            -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)
          if [[ -n "$deploy_name" ]]; then
            echo "Deployment/${deploy_name}/${ns}"
          fi
          ;;
        StatefulSet)
          echo "StatefulSet/${owner_name}/${ns}"
          ;;
      esac
    done |
    sort -u
}
|
||||
|
||||
# Scale every dependent of $SERVICE to 0 replicas, after recording the current
# replica counts in $STATE_FILE so that do_startup can restore them.
#
# Globals: SERVICE, STATE_FILE (read)
# Exits:   0 when there are no dependents; 1 when a state file from an earlier
#          shutdown still exists.
do_shutdown() {
  local owners kind name ns replicas
  local state="[]"

  # A second shutdown before startup would record the already-zeroed replica
  # counts and overwrite the only copy of the original values.
  if [[ -e "$STATE_FILE" ]]; then
    echo "Error: state file $STATE_FILE already exists." >&2
    echo "Run '$0 startup $SERVICE' first, or remove the file if it is stale." >&2
    exit 1
  fi

  echo "Finding dependents of $SERVICE..."
  owners=$(find_dependent_owners "$SERVICE")

  if [[ -z "$owners" ]]; then
    echo "No dependents found for $SERVICE"
    exit 0
  fi

  echo "Dependents found:"
  while IFS='/' read -r kind name ns; do
    echo " $ns/$kind/$name"
  done <<< "$owners"

  # Save current replica counts. A failed lookup is recorded as 1 replica.
  while IFS='/' read -r kind name ns; do
    replicas=$(kubectl get "$kind" "$name" -n "$ns" \
      -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
    state=$(echo "$state" | jq --arg kind "$kind" --arg name "$name" \
      --arg ns "$ns" --argjson replicas "${replicas:-1}" \
      '. + [{"kind": $kind, "name": $name, "namespace": $ns, "replicas": $replicas}]')
  done <<< "$owners"

  echo "$state" > "$STATE_FILE"
  echo "Saved replica state to $STATE_FILE"

  # Scale down
  while IFS='/' read -r kind name ns; do
    echo "Scaling $ns/$kind/$name to 0..."
    kubectl scale "$kind" "$name" -n "$ns" --replicas=0
  done <<< "$owners"

  echo ""
  echo "Waiting for pods to terminate..."
  while IFS='/' read -r kind name ns; do
    # Best effort: a rollout-status timeout or error must not abort the run.
    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=120s 2>/dev/null || true
  done <<< "$owners"

  echo ""
  echo "All dependents of $SERVICE scaled to 0."
  echo "Run '$0 startup $SERVICE' after maintenance to restore."
}
|
||||
|
||||
# Restore the replica counts recorded in $STATE_FILE by do_shutdown.
#
# Globals: SERVICE, STATE_FILE (read)
# Exits:   1 when the state file is missing, or when at least one workload
#          could not be scaled; in the latter case the state file is kept so
#          the restore can be retried. On full success the file is removed.
do_startup() {
  local entries kind name ns replicas
  local failed=0

  if [[ ! -f "$STATE_FILE" ]]; then
    echo "Error: No state file found at $STATE_FILE"
    echo "Did you run '$0 shutdown $SERVICE' first?"
    exit 1
  fi

  echo "Restoring dependents of $SERVICE from $STATE_FILE..."

  # Read the state once, as "<kind>/<name>/<namespace>/<replicas>" lines.
  entries=$(jq -r '.[] | "\(.kind)/\(.name)/\(.namespace)/\(.replicas)"' "$STATE_FILE")

  while IFS='/' read -r kind name ns replicas; do
    # An empty state array still yields one empty line from the here-string.
    [[ -n "$kind" ]] || continue
    echo "Scaling $ns/$kind/$name to $replicas..."
    # Keep going on failure: aborting here (as `set -e` would) leaves every
    # remaining dependent at 0 replicas.
    if ! kubectl scale "$kind" "$name" -n "$ns" --replicas="$replicas"; then
      echo "WARN: failed to scale $ns/$kind/$name; continuing with the rest" >&2
      failed=$((failed + 1))
    fi
  done <<< "$entries"

  echo ""
  echo "Waiting for rollouts..."
  while IFS='/' read -r kind name ns replicas; do
    [[ -n "$kind" ]] || continue
    # Best effort: a rollout-status timeout or error must not abort the run.
    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=300s 2>/dev/null || true
  done <<< "$entries"

  if ((failed > 0)); then
    echo "" >&2
    echo "Error: $failed dependent(s) of $SERVICE could not be restored; keeping $STATE_FILE for a retry." >&2
    exit 1
  fi

  rm -f "$STATE_FILE"
  echo ""
  echo "All dependents of $SERVICE restored."
}
|
||||
|
||||
case "$ACTION" in
|
||||
shutdown) do_shutdown ;;
|
||||
startup) do_startup ;;
|
||||
esac
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# For every node whose `kubectl get nodes -o wide` row contains "node":
# drain it, run image_pull_remote.sh on it over SSH, print the kubelet's
# resulting image-pull settings, then uncordon it.
remote_script="image_pull_remote.sh"

# Stop before touching any node: without the script each node would be
# drained and uncordoned for nothing.
if [[ ! -r "$remote_script" ]]; then
  echo "Cannot read $remote_script in the current directory" >&2
  exit 1
fi

# Collect the node names up front so they are never word-split or globbed.
nodes=()
while IFS= read -r n; do
  nodes+=("$n")
done < <(kubectl get nodes -o wide | grep node | awk '{print $1}')

for n in "${nodes[@]}"; do
  echo "$n"
  # Only run the remote script when the drain succeeded.
  kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data &&
    ssh "wizard@$n" < "$remote_script"
  # Check result
  kubectl get --raw "/api/v1/nodes/$n/proxy/configz" |
    jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
  # Always uncordon, including after a failed drain that left the node cordoned.
  kubectl uncordon "$n"
done
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Containerd: set max_concurrent_downloads to 5 on every line mentioning it,
# then restart the service to pick the change up.
containerd_cfg=/etc/containerd/config.toml
sudo sed -i 's/.*max_concurrent_downloads.*/max_concurrent_downloads = 5/g' "$containerd_cfg"
sudo systemctl restart containerd

# Kubelet: allow parallel image pulls (serializeImagePulls: false), limited to
# 5 at a time (maxParallelImagePulls: 5). Any existing lines for the two keys
# are deleted first; the new values are appended only if that deletion
# succeeded. The kubelet is restarted either way.
kubelet_cfg=/var/lib/kubelet/config.yaml
sudo sed -i -e '/serializeImagePulls:/d' -e '/maxParallelImagePulls:/d' "$kubelet_cfg" &&
  printf 'serializeImagePulls: false\nmaxParallelImagePulls: 5\n' | sudo tee -a "$kubelet_cfg"
sudo systemctl restart kubelet
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
# kube-apiserver audit policy -- k8s-master (10.0.20.100), single control-plane.
|
||||
#
|
||||
# Goal: a durable "who/when/what" trail for MUTATIONS (create/update/patch/
|
||||
# delete) so resource deletions can be attributed even though direct
|
||||
# kubectl-to-apiserver calls otherwise leave no trace (see the 2026-06-06
|
||||
# novelapp incident: a dashboard delete was attributable, a direct-kubectl
|
||||
# recreate was not). Deployed OUTSIDE Terraform (the k8s VMs are not TF-managed,
|
||||
# see memory id=1575); this file is the source of truth, scp'd to
|
||||
# /etc/kubernetes/audit-policy.yaml and wired into the apiserver static-pod
|
||||
# manifest + the kubeadm-config ConfigMap (so "kubeadm upgrade" preserves it).
|
||||
#
|
||||
# Tuned for LOW WRITE VOLUME (the cluster's sdc HDD is write-sensitive, see
|
||||
# memory id=559): reads are dropped entirely, high-churn resources and probe
|
||||
# endpoints are dropped, and the verbose RequestReceived stage is omitted, so
|
||||
# only one Metadata-level line is written per mutating request.
|
||||
apiVersion: audit.k8s.io/v1
|
||||
kind: Policy
|
||||
# Only emit the post-execution stage -- halves volume vs logging both stages.
|
||||
omitStages:
|
||||
- RequestReceived
|
||||
rules:
|
||||
# 1. Never log read-only verbs -- the overwhelming majority of traffic and
|
||||
# irrelevant to "who changed/deleted X".
|
||||
- level: None
|
||||
verbs: ["get", "list", "watch"]
|
||||
|
||||
# 2. Drop high-churn / low-value resources even on writes.
|
||||
- level: None
|
||||
resources:
|
||||
- group: ""
|
||||
resources: ["events", "endpoints", "nodes/status", "pods/status"]
|
||||
- group: "coordination.k8s.io"
|
||||
resources: ["leases"]
|
||||
- group: "discovery.k8s.io"
|
||||
resources: ["endpointslices"]
|
||||
- group: "metrics.k8s.io"
|
||||
- group: "authentication.k8s.io"
|
||||
resources: ["tokenreviews"]
|
||||
- group: "authorization.k8s.io"
|
||||
resources: ["subjectaccessreviews", "selfsubjectaccessreviews"]
|
||||
|
||||
# 3. Drop noisy non-resource probe / discovery URLs.
|
||||
- level: None
|
||||
nonResourceURLs:
|
||||
- "/healthz*"
|
||||
- "/readyz*"
|
||||
- "/livez*"
|
||||
- "/version"
|
||||
- "/metrics"
|
||||
- "/openapi/*"
|
||||
- "/swagger*"
|
||||
|
||||
# 4. Everything else (every create/update/patch/delete on real resources):
|
||||
# record WHO (user + sourceIP + userAgent), WHAT (resource/namespace/name),
|
||||
# WHEN, and the verb -- at Metadata level (no request/response bodies, so
|
||||
# each entry stays small).
|
||||
- level: Metadata
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
#!/usr/bin/env bash
# Force-finish a namespace that is stuck in Terminating by clearing
# spec.finalizers through the namespace's /finalize subresource.
#
# Usage:    <script> <namespace>
# Requires: kubectl (pointed at the target cluster) and jq.
set -euo pipefail

NAMESPACE=${1:-}
if [ -z "$NAMESPACE" ]; then
  echo "Pass in parameter namespace"
  exit 1
fi

# Private temp file (not a fixed /tmp name); removed on every exit path.
payload=$(mktemp)
trap 'rm -f -- "$payload"' EXIT

# With pipefail a failed "kubectl get" aborts here instead of sending an
# empty document to the API server.
kubectl get namespace "$NAMESPACE" -o json \
  | jq '.spec = {"finalizers":[]}' > "$payload"

# PUT through kubectl itself: no background "kubectl proxy" to race against
# or to leak when a step fails, and no unauthenticated localhost listener.
kubectl replace --raw "/api/v1/namespaces/${NAMESPACE}/finalize" -f "$payload"
|
||||
|
|
@ -1,469 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# lvm-pvc-snapshot — LVM thin snapshot management for Proxmox CSI PVCs
|
||||
# Deploy to PVE host at /usr/local/bin/lvm-pvc-snapshot
|
||||
set -euo pipefail
|
||||
|
||||
# --- Configuration ---
|
||||
VG="pve"
|
||||
THINPOOL="data"
|
||||
SNAP_SUFFIX_FORMAT="%Y%m%d_%H%M"
|
||||
RETENTION_DAYS=7
|
||||
MIN_FREE_PCT=10
|
||||
PUSHGATEWAY="${LVM_SNAP_PUSHGATEWAY:-http://10.0.20.100:30091}"
|
||||
PUSHGATEWAY_JOB="lvm-pvc-snapshot"
|
||||
LOCKFILE="/run/lvm-pvc-snapshot.lock"
|
||||
KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
|
||||
export KUBECONFIG
|
||||
|
||||
# Namespaces to exclude from snapshots (high-churn, have app-level dumps)
|
||||
# These PVCs cause significant CoW write amplification (~36% overhead)
|
||||
EXCLUDE_NAMESPACES="${LVM_SNAP_EXCLUDE_NS:-dbaas,monitoring}"
|
||||
|
||||
# --- Logging ---
|
||||
# log:  print a message to stdout, prefixed with a local "[YYYY-MM-DD HH:MM:SS]" stamp.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# warn: same as log, tagged "WARN:", written to stderr.
warn() {
  log "WARN: $*" >&2
}

# die:  same as log, tagged "FATAL:", written to stderr; then exit 1.
die() {
  log "FATAL: $*" >&2
  exit 1
}
|
||||
|
||||
# --- Helpers ---
|
||||
|
||||
get_thinpool_free_pct() {
  # Print the thin pool's free percentage: 100 minus the data_percent that
  # lvs reports for ${VG}/${THINPOOL}. lvs errors are silenced, so a failed
  # query leaves the subtraction without an operand.
  local used_pct
  used_pct=$(lvs --noheadings --nosuffix -o data_percent "${VG}/${THINPOOL}" 2>/dev/null | tr -d ' ')
  bc <<< "scale=2; 100 - ${used_pct}"
}
|
||||
|
||||
build_exclude_lv_list() {
  # Print, one per line, the last "/"-separated segment of the volumeHandle of
  # every csi.proxmox.sinextra.dev PV whose claim lives in one of the
  # comma-separated EXCLUDE_NAMESPACES. Prints nothing when the list is empty
  # or kubectl is not installed; kubectl/jq failures are deliberately ignored.
  [[ -n "${EXCLUDE_NAMESPACES}" ]] || return 0
  command -v kubectl &>/dev/null || return 0

  local pv_filter='
    ($ns | split(",")) as $excl |
    .items[] |
    select(.spec.csi.driver == "csi.proxmox.sinextra.dev") |
    select(.spec.claimRef.namespace as $n | $excl | index($n)) |
    .spec.csi.volumeHandle | split("/") | last
  '

  kubectl get pv -o json 2>/dev/null \
    | jq -r --arg ns "${EXCLUDE_NAMESPACES}" "${pv_filter}" 2>/dev/null \
    || true
}
|
||||
|
||||
discover_pvc_lvs() {
  # Print the names of thin LVs in ${VG}/${THINPOOL} that look like PVC
  # volumes (vm-<id>-pvc-*), leaving out snapshots, pre-restore backups and
  # any LV reported by build_exclude_lv_list.
  local candidates skip_lvs skip_regex

  candidates=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
    | grep -E '^vm-[0-9]+-pvc-' \
    | grep -vE '_snap_|_pre_restore_')

  skip_lvs=$(build_exclude_lv_list)

  if [[ -z "${skip_lvs}" ]]; then
    printf '%s\n' "${candidates}"
    return
  fi

  # Join the excluded names into one alternation: name1|name2|...
  skip_regex=${skip_lvs//$'\n'/|}
  grep -vE "(${skip_regex})" <<< "${candidates}" || true
}
|
||||
|
||||
list_snapshots() {
  # Print the names of LVs in ${VG}/${THINPOOL} whose name contains "_snap_".
  # Always succeeds, even when nothing matches.
  lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
    | awk -v pool="${THINPOOL}" '$2 == pool && index($1, "_snap_") { print $1 }' \
    || true
}
|
||||
|
||||
parse_snap_timestamp() {
  # Convert the trailing YYYYMMDD_HHMM of an LV name ($1) to epoch seconds
  # (interpreted by date in the local timezone). Prints "0" when the name has
  # no such suffix or date rejects the value.
  local snap_name="$1"
  local stamp_re='([0-9]{4})([0-9]{2})([0-9]{2})_([0-9]{2})([0-9]{2})$'

  if [[ ! "${snap_name}" =~ ${stamp_re} ]]; then
    echo "0"
    return
  fi

  local when="${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-${BASH_REMATCH[3]} ${BASH_REMATCH[4]}:${BASH_REMATCH[5]}"
  date -d "${when}" +%s 2>/dev/null || echo "0"
}
|
||||
|
||||
get_original_lv_from_snap() {
  # Strip a trailing "_snap_YYYYMMDD_HHMM" from $1 and print the result,
  # e.g. vm-200-pvc-abc_snap_20260403_1200 -> vm-200-pvc-abc.
  # Names without that suffix are printed unchanged.
  local lv_name="$1"
  if [[ "${lv_name}" =~ ^(.*)_snap_[0-9]{8}_[0-9]{4}$ ]]; then
    lv_name="${BASH_REMATCH[1]}"
  fi
  printf '%s\n' "${lv_name}"
}
|
||||
|
||||
push_metrics() {
  # POST the outcome of a run to the Pushgateway job ${PUSHGATEWAY_JOB}.
  #   $1 status   $2 snapshots created   $3 snapshots failed   $4 snapshots pruned
  # The current thin pool free percentage is sampled and sent as well.
  # A failed push only produces a warning.
  local status="$1" created="$2" failed="$3" pruned="$4"
  local free_pct
  free_pct=$(get_thinpool_free_pct)

  local endpoint="${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}"

  if ! curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
    "${endpoint}" 2>/dev/null <<METRICS
# HELP lvm_snapshot_last_run_timestamp Unix timestamp of last snapshot run
# TYPE lvm_snapshot_last_run_timestamp gauge
lvm_snapshot_last_run_timestamp $(date +%s)
# HELP lvm_snapshot_last_status Exit status (0=success, 1=partial failure, 2=aborted)
# TYPE lvm_snapshot_last_status gauge
lvm_snapshot_last_status ${status}
# HELP lvm_snapshot_created_total Number of snapshots created in last run
# TYPE lvm_snapshot_created_total gauge
lvm_snapshot_created_total ${created}
# HELP lvm_snapshot_failed_total Number of snapshot failures in last run
# TYPE lvm_snapshot_failed_total gauge
lvm_snapshot_failed_total ${failed}
# HELP lvm_snapshot_pruned_total Number of snapshots pruned in last run
# TYPE lvm_snapshot_pruned_total gauge
lvm_snapshot_pruned_total ${pruned}
# HELP lvm_snapshot_thinpool_free_pct Thin pool free percentage
# TYPE lvm_snapshot_thinpool_free_pct gauge
lvm_snapshot_thinpool_free_pct ${free_pct}
METRICS
  then
    warn "Failed to push metrics to Pushgateway"
  fi
}
|
||||
|
||||
# --- Subcommands ---
|
||||
|
||||
cmd_snapshot() {
  # Create one thin snapshot per discovered PVC LV, prune expired snapshots,
  # then push run metrics.
  #
  # Reads:  VG, MIN_FREE_PCT, SNAP_SUFFIX_FORMAT
  # Calls:  get_thinpool_free_pct, discover_pvc_lvs, cmd_prune_count, push_metrics
  # Exits 1 (after pushing status 2) when the pool's free space is unknown or
  # below MIN_FREE_PCT, or when no PVC LVs are found.
  # Pushed status: 0 = all created, 1 = some failed, 2 = all failed.
  log "Starting PVC LVM thin snapshot run"

  # Check thin pool free space. Fail closed: if the value is not a number
  # (lvs/bc failed) the comparison below would evaluate to "not low" and
  # snapshots would be taken blind.
  local free_pct
  local numeric_re='^-?[0-9]*\.?[0-9]+$'
  free_pct=$(get_thinpool_free_pct)
  if [[ ! "${free_pct}" =~ ${numeric_re} ]]; then
    warn "Could not determine thin pool free space (got '${free_pct}'). Aborting."
    push_metrics 2 0 0 0
    exit 1
  fi
  log "Thin pool free space: ${free_pct}%"
  if (( $(echo "${free_pct} < ${MIN_FREE_PCT}" | bc -l) )); then
    warn "Thin pool has only ${free_pct}% free (minimum: ${MIN_FREE_PCT}%). Aborting."
    push_metrics 2 0 0 0
    exit 1
  fi

  # Discover PVC LVs
  local lvs_list
  lvs_list=$(discover_pvc_lvs)
  if [[ -z "${lvs_list}" ]]; then
    warn "No PVC LVs found matching pattern"
    push_metrics 2 0 0 0
    exit 1
  fi

  local count=0 failed=0 total
  total=$(echo "${lvs_list}" | wc -l | tr -d ' ')
  local snap_ts
  snap_ts=$(date +"${SNAP_SUFFIX_FORMAT}")

  log "Found ${total} PVC LVs to snapshot"

  local lv snap_name err
  while IFS= read -r lv; do
    snap_name="${lv}_snap_${snap_ts}"
    # Keep lvcreate's stderr so a failure can be diagnosed from the log.
    if err=$(lvcreate -s -kn -n "${snap_name}" "${VG}/${lv}" 2>&1 >/dev/null); then
      log " Created: ${snap_name}"
      count=$((count + 1))
    else
      warn " Failed to create snapshot for ${lv}: ${err}"
      failed=$((failed + 1))
    fi
  done <<< "${lvs_list}"

  log "Snapshot run complete: ${count} created, ${failed} failed out of ${total}"

  # Auto-prune
  log "Running auto-prune..."
  local pruned
  pruned=$(cmd_prune_count)

  # Determine status
  local status=0
  if (( failed > 0 && count > 0 )); then
    status=1 # partial
  elif (( failed > 0 && count == 0 )); then
    status=2 # all failed
  fi

  push_metrics "${status}" "${count}" "${failed}" "${pruned}"
  log "Done"
}
|
||||
|
||||
cmd_list() {
  # Print a table of every LV in ${VG} whose name contains "_snap_" or
  # "_pre_restore_": the LV it was taken from, its own name, its age derived
  # from the name's timestamp ("unknown" if it has none) and its data%.
  printf "%-45s %-50s %8s %8s\n" "ORIGINAL LV" "SNAPSHOT" "AGE" "DATA%"
  printf "%-45s %-50s %8s %8s\n" "-----------" "--------" "---" "-----"

  local now
  now=$(date +%s)

  local snap_lines
  snap_lines=$(lvs --noheadings --nosuffix -o lv_name,lv_size,data_percent "${VG}" 2>/dev/null \
    | grep -E '_snap_|_pre_restore_' || true)

  if [[ -z "${snap_lines}" ]]; then
    echo "(no snapshots found)"
    return
  fi

  # The previous version also extracted the timestamp text into an unused
  # variable via grep; under "set -e -o pipefail" that assignment failed for
  # any name without a trailing timestamp and aborted the listing. The age is
  # taken from parse_snap_timestamp alone.
  local name _size data_pct original age_str epoch age_s
  while read -r name _size data_pct; do
    if [[ "${name}" == *"_pre_restore_"* ]]; then
      original="${name}"
      if [[ "${name}" =~ ^(.*)_pre_restore_[0-9]{8}_[0-9]{4}$ ]]; then
        original="${BASH_REMATCH[1]}"
      fi
    else
      original=$(get_original_lv_from_snap "${name}")
    fi

    epoch=$(parse_snap_timestamp "${name}")
    if (( epoch > 0 )); then
      age_s=$(( now - epoch ))
      age_str="$(( age_s / 86400 ))d$(( (age_s % 86400) / 3600 ))h"
    else
      age_str="unknown"
    fi
    printf "%-45s %-50s %8s %7s%%\n" "${original}" "${name}" "${age_str}" "${data_pct}"
  done <<< "${snap_lines}"
}
|
||||
|
||||
cmd_prune() {
  # Run the prune pass and log how many expired snapshots it removed.
  local removed
  removed=$(cmd_prune_count)
  log "Pruned ${removed} expired snapshots"
}
|
||||
|
||||
cmd_prune_count() {
  # Remove every "_snap_" / "_pre_restore_" LV in ${VG}/${THINPOOL} whose
  # name timestamp is older than RETENTION_DAYS and print how many were removed.
  # NOTE: callers capture stdout (`pruned=$(cmd_prune_count)`), so the count is
  # the only thing written there; all log/warn output goes to stderr.
  local cutoff removed=0 candidates snap epoch
  cutoff=$(( $(date +%s) - RETENTION_DAYS * 86400 ))

  candidates=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
    | grep -E '_snap_|_pre_restore_' || true)

  if [[ -n "${candidates}" ]]; then
    while IFS= read -r snap; do
      epoch=$(parse_snap_timestamp "${snap}")
      # Names without a parseable timestamp (epoch 0) are never pruned.
      (( epoch > 0 && epoch < cutoff )) || continue
      if lvremove -f "${VG}/${snap}" >/dev/null 2>&1; then
        log " Pruned: ${snap}" >&2
        removed=$((removed + 1))
      else
        warn " Failed to prune: ${snap}"
      fi
    done <<< "${candidates}"
  fi

  echo "${removed}"
}
|
||||
|
||||
cmd_restore() {
  # Interactively restore a PVC's LV from one of its snapshots by swapping LVs.
  #
  #   $1  name of the PVC LV to restore (e.g. vm-200-pvc-abc)
  #   $2  name of the snapshot LV to restore from
  #
  # Steps: validate both LVs, find the PV/PVC and the workload that uses the
  # PVC, show the plan and ask for confirmation, scale the workload to 0,
  # rename <pvc-lv> to <pvc-lv>_pre_restore_<timestamp>, rename the snapshot
  # to <pvc-lv>, then scale the workload back. If the second rename fails the
  # first one is undone so the PVC LV name never stays missing.
  # Exits via die on invalid arguments, missing LVs/PV, or a declined prompt.
  local pvc_lv="${1:-}" snapshot_lv="${2:-}"
  local confirm=""

  if [[ -z "${pvc_lv}" || -z "${snapshot_lv}" ]]; then
    die "Usage: $0 restore <pvc-lv-name> <snapshot-lv-name>"
  fi

  # Validate LVs exist
  if ! lvs "${VG}/${pvc_lv}" >/dev/null 2>&1; then
    die "PVC LV '${pvc_lv}' not found in VG '${VG}'"
  fi
  if ! lvs "${VG}/${snapshot_lv}" >/dev/null 2>&1; then
    die "Snapshot LV '${snapshot_lv}' not found in VG '${VG}'"
  fi

  # Discover K8s context
  log "Discovering Kubernetes context for LV '${pvc_lv}'..."

  # Accept both handle shapes used in this script: the literal
  # "local-lvm:<lv>" and a "/"-separated handle whose last segment is the LV
  # name (the form build_exclude_lv_list parses).
  local volume_handle="local-lvm:${pvc_lv}"
  local pv_info
  pv_info=$(kubectl get pv -o json 2>/dev/null | jq -r \
    --arg vh "${volume_handle}" \
    --arg lv "${pvc_lv}" \
    '.items[]
     | select((.spec.csi.volumeHandle // "") as $h
              | $h == $vh or ($h | split("/") | last) == $lv)
     | "\(.metadata.name) \(.spec.claimRef.namespace) \(.spec.claimRef.name)"' \
  ) || die "Failed to query PVs (is kubectl configured?)"

  if [[ -z "${pv_info}" ]]; then
    die "No PV found with volumeHandle '${volume_handle}' or ending in '/${pvc_lv}'"
  fi

  local pv_name pvc_ns pvc_name
  read -r pv_name pvc_ns pvc_name <<< "${pv_info}"
  if [[ -z "${pvc_ns}" || "${pvc_ns}" == "null" || -z "${pvc_name}" || "${pvc_name}" == "null" ]]; then
    die "PV '${pv_name}' is not bound to a PVC"
  fi
  log "Found: PV=${pv_name}, PVC=${pvc_ns}/${pvc_name}"

  # Find the workload (Deployment or StatefulSet) that uses this PVC
  local workload_type="" workload_name="" original_replicas=""

  # Check StatefulSets first (databases use these). A StatefulSet matches when
  # it mounts the claim directly, or when the claim is named
  # <volumeClaimTemplate>-<statefulset>-<ordinal>.
  local sts_info
  sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
    --arg pvc "${pvc_name}" \
    '.items[]
     | .metadata.name as $sts
     | select(
         any(.spec.template.spec.volumes[]?; .persistentVolumeClaim.claimName == $pvc)
         or any(.spec.volumeClaimTemplates[]?;
                "\(.metadata.name)-\($sts)-" as $prefix
                | ($pvc | startswith($prefix))
                  and ($pvc | ltrimstr($prefix) | test("^[0-9]+$")))
       )
     | "\($sts) \(.spec.replicas)"' 2>/dev/null \
  ) || true

  if [[ -n "${sts_info}" ]]; then
    read -r workload_name original_replicas <<< "${sts_info}"
    workload_type="statefulset"
  else
    # Check Deployments
    local deploy_info
    deploy_info=$(kubectl get deployment -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
      --arg pvc "${pvc_name}" \
      '.items[]
       | select(any(.spec.template.spec.volumes[]?; .persistentVolumeClaim.claimName == $pvc))
       | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
    ) || true

    if [[ -n "${deploy_info}" ]]; then
      read -r workload_name original_replicas <<< "${deploy_info}"
      workload_type="deployment"
    fi
  fi

  if [[ -z "${workload_type}" ]]; then
    warn "Could not auto-discover workload for PVC '${pvc_name}' in namespace '${pvc_ns}'."
    warn "You may need to scale down the pod manually."
    echo ""
    read -rp "Continue with LV swap anyway? (yes/no): " confirm
    [[ "${confirm}" == "yes" ]] || die "Aborted by user"
    workload_type="manual"
  fi

  # Dry-run output
  local backup_name
  backup_name="${pvc_lv}_pre_restore_$(date +"${SNAP_SUFFIX_FORMAT}")"
  echo ""
  echo "╔══════════════════════════════════════════════════════════════╗"
  echo "║ RESTORE DRY-RUN ║"
  echo "╠══════════════════════════════════════════════════════════════╣"
  echo "║ PVC: ${pvc_ns}/${pvc_name}"
  echo "║ PV: ${pv_name}"
  if [[ "${workload_type}" != "manual" ]]; then
    echo "║ Workload: ${workload_type}/${workload_name} (replicas: ${original_replicas}→0→${original_replicas})"
  fi
  echo "║"
  echo "║ Actions:"
  if [[ "${workload_type}" != "manual" ]]; then
    echo "║ 1. Scale ${workload_type}/${workload_name} to 0 replicas"
    echo "║ 2. Wait for pod termination"
  fi
  echo "║ 3. Rename ${pvc_lv} → ${backup_name}"
  echo "║ 4. Rename ${snapshot_lv} → ${pvc_lv}"
  if [[ "${workload_type}" != "manual" ]]; then
    echo "║ 5. Scale ${workload_type}/${workload_name} back to ${original_replicas} replicas"
  fi
  echo "╚══════════════════════════════════════════════════════════════╝"
  echo ""

  # Interactive confirmation
  read -rp "Type 'yes' to proceed with restore: " confirm
  if [[ "${confirm}" != "yes" ]]; then
    die "Aborted by user"
  fi

  # Scale down
  if [[ "${workload_type}" != "manual" ]]; then
    log "Scaling ${workload_type}/${workload_name} to 0 replicas..."
    kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas=0

    log "Waiting for pod termination (timeout: 120s)..."
    kubectl wait --for=delete pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
      kubectl wait --for=delete pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
      warn "Timeout waiting for pods — continuing anyway (LV may still be in use)"
    sleep 5 # extra grace period for device detach
  fi

  # Verify LV is not active
  local lv_active
  lv_active=$(lvs --noheadings -o lv_active "${VG}/${pvc_lv}" 2>/dev/null | tr -d ' ')
  if [[ "${lv_active}" == "active" ]]; then
    warn "LV ${pvc_lv} is still active. Attempting to deactivate..."
    # Close any LUKS mapper on the LV before deactivation
    if dmsetup ls 2>/dev/null | grep -q "${pvc_lv}"; then
      log "Closing LUKS mapper for ${pvc_lv}..."
      cryptsetup luksClose "${pvc_lv}" 2>/dev/null || true
    fi
    lvchange -an "${VG}/${pvc_lv}" 2>/dev/null || warn "Could not deactivate — proceeding with caution"
  fi

  # LV swap
  log "Renaming ${pvc_lv} → ${backup_name}"
  lvrename "${VG}" "${pvc_lv}" "${backup_name}" || die "Failed to rename original LV"

  log "Renaming ${snapshot_lv} → ${pvc_lv}"
  if ! lvrename "${VG}" "${snapshot_lv}" "${pvc_lv}"; then
    # Undo the first rename so the PV keeps pointing at an existing LV.
    warn "Failed to rename snapshot LV — rolling back ${backup_name} → ${pvc_lv}"
    lvrename "${VG}" "${backup_name}" "${pvc_lv}" \
      || die "Rollback failed: original data is in ${VG}/${backup_name}; rename it back to ${pvc_lv} manually"
    die "Failed to rename snapshot LV (original LV restored; workload was left scaled down)"
  fi

  # Scale back up
  if [[ "${workload_type}" != "manual" ]]; then
    log "Scaling ${workload_type}/${workload_name} back to ${original_replicas} replicas..."
    kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas="${original_replicas}"

    log "Waiting for pod to become Ready (timeout: 300s)..."
    kubectl wait --for=condition=Ready pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
      kubectl wait --for=condition=Ready pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
      warn "Timeout waiting for pod Ready — check manually"
  fi

  echo ""
  log "Restore complete!"
  log "Old data preserved as: ${backup_name}"
  log "To delete old data after verification: lvremove -f ${VG}/${backup_name}"
}
|
||||
|
||||
# --- Main ---
|
||||
|
||||
usage() {
  # Print the command overview, including the current retention period and
  # Pushgateway URL, to stdout.
  local prog
  prog=$(basename "$0")
  cat <<EOF
Usage: ${prog} <command> [args]

Commands:
snapshot Create thin snapshots of all PVC LVs
list List existing snapshots with age and data%
prune Remove snapshots older than ${RETENTION_DAYS} days
restore <lv> <snap> Restore a PVC from a snapshot (interactive)

Environment:
LVM_SNAP_PUSHGATEWAY Pushgateway URL (default: ${PUSHGATEWAY})
KUBECONFIG Kubeconfig path (default: /root/.kube/config)
EOF
}
|
||||
|
||||
main() {
  # Entry point: take the lock for every command except list/help/empty,
  # then dispatch $1 to its cmd_* handler, passing the remaining arguments
  # through to restore.
  local cmd="${1:-}"
  shift || true

  # Acquire lock (except for list which is read-only, and the help variants)
  case "${cmd}" in
    list|""|help|--help|-h)
      ;;
    *)
      exec 200>"${LOCKFILE}"
      flock -n 200 || die "Another instance is already running (lockfile: ${LOCKFILE})"
      ;;
  esac

  case "${cmd}" in
    snapshot)
      cmd_snapshot
      ;;
    list)
      cmd_list
      ;;
    prune)
      cmd_prune
      ;;
    restore)
      cmd_restore "$@"
      ;;
    help|--help|-h|"")
      usage
      ;;
    *)
      die "Unknown command: ${cmd}. Run '$0 help' for usage."
      ;;
  esac
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily LVM thin snapshots of Proxmox CSI PVCs
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:00:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=300
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# scripts/migrate-state-to-pg — One-shot migration from local SOPS state to PG backend.
|
||||
# Prerequisites: vault login -method=oidc, PG terraform_state DB exists, Vault static role created.
|
||||
# Usage: scripts/migrate-state-to-pg [--dry-run]
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
SYNC="$REPO_ROOT/scripts/state-sync"
|
||||
STACKS_DIR="$REPO_ROOT/stacks"
|
||||
STATE_DIR="$REPO_ROOT/state/stacks"
|
||||
|
||||
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"
|
||||
# is_tier0 NAME: succeed when NAME is one of the space-separated entries of
# TIER0_STACKS. The comparison is a literal string match, so a stack name
# containing regex metacharacters or a leading "-" cannot match by accident.
is_tier0() {
  local entry
  # shellcheck disable=SC2086 # word-splitting the list is intended
  for entry in $TIER0_STACKS; do
    if [ "$entry" = "$1" ]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
DRY_RUN=false
|
||||
[ "${1:-}" = "--dry-run" ] && DRY_RUN=true
|
||||
|
||||
# Fetch PG credentials from Vault
|
||||
echo "==> Fetching PG credentials from Vault..."
|
||||
PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state) || {
|
||||
echo "ERROR: Cannot read PG credentials. Run: vault login -method=oidc" >&2
|
||||
exit 1
|
||||
}
|
||||
PG_USER=$(echo "$PG_CREDS" | jq -r .data.username)
|
||||
PG_PASS=$(echo "$PG_CREDS" | jq -r .data.password)
|
||||
export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
|
||||
echo " PG_CONN_STR set (user: $PG_USER)"
|
||||
|
||||
# Enable provider cache
|
||||
export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
|
||||
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
|
||||
mkdir -p "$TF_PLUGIN_CACHE_DIR"
|
||||
|
||||
migrated=0
|
||||
failed=0
|
||||
skipped=0
|
||||
failed_stacks=""
|
||||
|
||||
# Increment helpers (avoid arithmetic exit code 1 when value is 0)
|
||||
inc_migrated() { migrated=$((migrated + 1)); }
|
||||
inc_failed() { failed=$((failed + 1)); }
|
||||
inc_skipped() { skipped=$((skipped + 1)); }
|
||||
|
||||
# Iterate over all stack directories that have state
|
||||
# For every directory under $STATE_DIR: skip Tier 0 stacks, stacks without a
# state file and stacks without a matching $STACKS_DIR entry; otherwise
# decrypt the state if only the encrypted copy exists, run
# "terragrunt init -migrate-state", and verify with "terragrunt plan".
# Counters are updated through inc_migrated / inc_failed / inc_skipped.
for state_dir in "$STATE_DIR"/*/; do
  stack="$(basename "$state_dir")"
  enc_state="$state_dir/terraform.tfstate.enc"
  plain_state="$state_dir/terraform.tfstate"

  # Skip Tier 0
  if is_tier0 "$stack"; then
    echo "--- SKIP (Tier 0): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no state file
  if [ ! -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    echo "--- SKIP (no state): $stack"
    inc_skipped
    continue
  fi

  # Skip stacks with no corresponding stack directory
  if [ ! -d "$STACKS_DIR/$stack" ]; then
    echo "--- SKIP (no stack dir): $stack"
    inc_skipped
    continue
  fi

  echo "==> Migrating: $stack"

  if $DRY_RUN; then
    echo " [dry-run] Would migrate $stack"
    inc_skipped
    continue
  fi

  # Decrypt state if needed (sops is invoked directly here).
  # Decrypt into a scratch file and move it into place only on success:
  # redirecting straight to terraform.tfstate would leave an empty or partial
  # file after a failed decrypt, and the next run would then treat that file
  # as the real state and migrate it.
  if [ -f "$enc_state" ] && [ ! -f "$plain_state" ]; then
    tmp_state="${plain_state}.decrypting.$$"
    if sops -d --input-type json --output-type json "$enc_state" > "$tmp_state"; then
      mv -f "$tmp_state" "$plain_state"
    else
      rm -f "$tmp_state"
      echo " WARNING: decrypt failed, skipping"
      inc_skipped
      continue
    fi
  fi

  # Migrate state
  cd "$STACKS_DIR/$stack"
  if terragrunt init -upgrade -migrate-state -force-copy -input=false 2>&1 | tee "/tmp/tg-migrate-$stack.log"; then
    echo " init OK"

    # Verify — plan should show no changes
    if terragrunt plan -detailed-exitcode -input=false 2>&1 | tail -5 | grep -q "No changes"; then
      echo " plan OK — no drift"
      inc_migrated
    else
      # Drift alone does not mean the migration failed, so it still counts.
      echo " WARNING: plan shows changes (may be normal drift, not migration issue)"
      inc_migrated
    fi
  else
    echo " FAILED: init error (see /tmp/tg-migrate-$stack.log)"
    inc_failed
    failed_stacks="$failed_stacks $stack"
  fi
done
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "Migration complete"
|
||||
echo " Migrated: $migrated"
|
||||
echo " Failed: $failed"
|
||||
echo " Skipped: $skipped"
|
||||
if [ -n "$failed_stacks" ]; then
|
||||
echo " Failed stacks:$failed_stacks"
|
||||
fi
|
||||
echo "========================================"
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Phase 3: Migrate all service module state from root to individual stacks
|
||||
# Each module in root state is at: module.kubernetes_cluster.module.<name>["<name>"]
|
||||
# Target: state/stacks/<name>/terraform.tfstate as module.<name>
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
ROOT_STATE="$(pwd)/terraform.tfstate"
|
||||
STATE_DIR="$(pwd)/state/stacks"
|
||||
|
||||
# All service modules currently in root state (61 entries; TOTAL is computed from the array)
|
||||
MODULES=(
|
||||
actualbudget
|
||||
affine
|
||||
blog
|
||||
changedetection
|
||||
city-guesser
|
||||
coturn
|
||||
cyberchef
|
||||
dashy
|
||||
dawarich
|
||||
descheduler
|
||||
diun
|
||||
ebook2audiobook
|
||||
echo
|
||||
excalidraw
|
||||
f1-stream
|
||||
forgejo
|
||||
freedify
|
||||
freshrss
|
||||
frigate
|
||||
hackmd
|
||||
health
|
||||
homepage
|
||||
immich
|
||||
isponsorblocktv
|
||||
jsoncrack
|
||||
kms
|
||||
linkwarden
|
||||
matrix
|
||||
meshcentral
|
||||
n8n
|
||||
navidrome
|
||||
netbox
|
||||
networking-toolbox
|
||||
nextcloud
|
||||
ntfy
|
||||
ollama
|
||||
onlyoffice
|
||||
openclaw
|
||||
osm_routing
|
||||
owntracks
|
||||
paperless-ngx
|
||||
plotting-book
|
||||
privatebin
|
||||
real-estate-crawler
|
||||
reloader
|
||||
resume
|
||||
rybbit
|
||||
send
|
||||
servarr
|
||||
shadowsocks
|
||||
speedtest
|
||||
stirling-pdf
|
||||
tandoor
|
||||
tor-proxy
|
||||
travel_blog
|
||||
tuya-bridge
|
||||
url
|
||||
wealthfolio
|
||||
webhook_handler
|
||||
whisper
|
||||
ytdlp
|
||||
)
|
||||
|
||||
TOTAL=${#MODULES[@]}
|
||||
SUCCESS=0
|
||||
FAIL=0
|
||||
|
||||
echo "=== Phase 3: Service State Migration ==="
|
||||
echo "Migrating $TOTAL modules from root state to individual stacks"
|
||||
echo ""
|
||||
|
||||
# Move each module out of the root state into its own per-stack state file,
# counting successes and failures, then print a summary.
for mod in "${MODULES[@]}"; do
  idx=$((SUCCESS + FAIL + 1))
  echo "[$idx/$TOTAL] Migrating: $mod"

  # Create state directory
  mkdir -p "$STATE_DIR/$mod"

  # Source address (with for_each key)
  SRC="module.kubernetes_cluster.module.${mod}[\"${mod}\"]"
  DST="module.${mod}"
  DST_STATE="$STATE_DIR/$mod/terraform.tfstate"

  if terraform state mv \
    -state="$ROOT_STATE" \
    -state-out="$DST_STATE" \
    "$SRC" "$DST" 2>&1; then
    echo " ✓ $mod migrated successfully"
    SUCCESS=$((SUCCESS + 1))
  else
    echo " ✗ $mod FAILED"
    FAIL=$((FAIL + 1))
  fi
  echo ""
done

echo "=== Migration Summary ==="
echo "Total: $TOTAL"
echo "Success: $SUCCESS"
echo "Failed: $FAIL"

# Report failure to the caller instead of always exiting 0, so wrappers and
# "&&" chains stop when some modules were not moved.
if (( FAIL > 0 )); then
  exit 1
fi
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
[Unit]
|
||||
Description=Track NFS filesystem changes for incremental offsite backup
|
||||
After=local-fs.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/bin/inotifywait -m -r \
|
||||
--format '%%w%%f' \
|
||||
-e create -e modify -e moved_to -e delete \
|
||||
--exclude '(/\..*swp$|/\.nfs|/\.Trash|\.db-shm$|\.db-wal$|\.db-journal$|/stats/.*\.stat$|^/srv/nfs/anca-elements/)' \
|
||||
/srv/nfs \
|
||||
/srv/nfs-ssd
|
||||
StandardOutput=append:/mnt/backup/.nfs-changes.log
|
||||
StandardError=journal
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
[Unit]
|
||||
Description=Mirror /srv/nfs (selective) to /mnt/backup (local 2nd copy of critical NFS)
|
||||
After=network-online.target local-fs.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/nfs-mirror
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=nfs-mirror
|
||||
# Heavy sustained IO — don't compete with foreground services.
|
||||
Nice=10
|
||||
IOSchedulingClass=idle
|
||||
TimeoutStartSec=18000
|
||||
|
|
@ -1,179 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# nfs-mirror — local 2nd copy of /srv/nfs (selective) → /mnt/backup
|
||||
#
|
||||
# Deploy to PVE host at /usr/local/bin/nfs-mirror.
|
||||
# Schedule: daily 02:00 (plus up to 15 min randomized delay) via nfs-mirror.timer.
|
||||
#
|
||||
# ROLE in the 3-2-1 strategy:
|
||||
# Copy 1 (sdc): /srv/nfs/* (live PVE NFS)
|
||||
# Copy 2 (sda, this): /mnt/backup/<svc>/ ← this script
|
||||
# Copy 3 (Synology): /Backup/Viki/nfs/ (via offsite-sync-backup + inotify)
|
||||
#
|
||||
# Replaces the dedicated anca-elements-mirror script; same disk, same
|
||||
# destination layout (anca-elements lives at /mnt/backup/anca-elements/),
|
||||
# but now covers every other critical NFS subtree in one pass.
|
||||
#
|
||||
# SKIP-LIST rationale (2026-05-26 simplification; REGENERABLE-SERVICE
|
||||
# CARVE-OUT added 2026-06-01 — see below):
|
||||
# immich — 1.5T, doesn't fit on sda; offsite-sync ships it direct to Synology
|
||||
# frigate — camera ring buffer; intentionally NOT backed up anywhere
|
||||
# temp — scratch; intentionally NOT backed up
|
||||
#
|
||||
# 2026-06-01 carve-out: the offsite Synology (5.3T) hit 97% and the
|
||||
# `Backup` share had grown +670G in a week — traced to the 2026-05-26
|
||||
# change that started mirroring large *regenerable* services to sda and
|
||||
# thence to Synology pve-backup/. These are now re-excluded because they
|
||||
# cost offsite capacity for data we can rebuild on demand:
|
||||
# ollama (20G) — LLM model blobs, re-pullable
|
||||
# prometheus-backup (64G) — metrics TSDB snapshots; was offsite-excluded
|
||||
# pre-2026-05-26 by original intent
|
||||
# audiblez (24G) — generated audiobooks, re-derivable from ebooks
|
||||
# ebook2audiobook (11G) — same, generation output
|
||||
# Their live copy stays on sdc (/srv/nfs); only the sda + Synology copies
|
||||
# are dropped. `*-backup` DB dumps (sqlite-backup et al.) are intentionally
|
||||
# KEPT — they are real database safety copies, not regenerable.
|
||||
#
|
||||
# Note: /srv/nfs-ssd is intentionally NOT mirrored — its dirs (immich,
|
||||
# ollama, llamacpp) go direct to Synology nfs-ssd/ via offsite-sync
|
||||
# Step 2, which (also 2026-06-01) was narrowed to immich-only so ollama
|
||||
# + llamacpp on the SSD stop reaching Synology too.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SRC=/srv/nfs/
|
||||
DST=/mnt/backup/
|
||||
LOG=/var/log/nfs-mirror.log
|
||||
LOCKFILE=/run/nfs-mirror.lock
|
||||
# Manifest of files changed under /mnt/backup since the last offsite-sync.
|
||||
# offsite-sync-backup Step 1 reads this and rsyncs the listed files to Synology
|
||||
# pve-backup/ on its next daily run. Without populating it, nfs-mirror's writes
|
||||
# would only reach Synology via the monthly full sync (1st-7th of month), and
|
||||
# the monthly --delete pass would also wipe any pre-positioned data.
|
||||
MANIFEST=/mnt/backup/.changed-files
|
||||
PUSHGATEWAY="${NFS_MIRROR_PUSHGATEWAY:-http://10.0.20.100:30091}"
|
||||
PUSHGATEWAY_JOB=nfs-mirror
|
||||
|
||||
EXCLUDES=(
|
||||
# ---- /mnt/backup subtrees owned by daily-backup — leave alone ----
|
||||
--exclude='/pvc-data/'
|
||||
--exclude='/sqlite-backup/'
|
||||
--exclude='/pfsense/'
|
||||
--exclude='/pve-config/'
|
||||
--exclude='/lost+found/'
|
||||
|
||||
# ---- state files used by other backup jobs ----
|
||||
--exclude='/.changed-files'
|
||||
--exclude='/.last-offsite-sync'
|
||||
--exclude='/.lv-pvc-mapping.json'
|
||||
--exclude='/.nfs-changes.log'
|
||||
|
||||
# ---- anca-elements: now in Immich (canonical), /mnt/backup copy deleted
|
||||
# 2026-05-26. Kept in excludes so nfs-mirror doesn't re-populate from sdc
|
||||
# if /srv/nfs/anca-elements is ever re-attached.
|
||||
--exclude='/anca-elements/'
|
||||
|
||||
# ---- NFS paths intentionally NOT backed up ----
|
||||
--exclude='/immich/' # 1.5T — ships sdc → Synology direct (Step 2)
|
||||
--exclude='/frigate/' # ring buffer — no backup anywhere
|
||||
--exclude='/temp/' # scratch — no backup anywhere
|
||||
|
||||
# ---- regenerable services: live-only on sdc, no offsite (2026-06-01) ----
|
||||
# See header carve-out. rsync protects --exclude'd paths on the receiver, so
|
||||
# --delete will NOT reap existing sda copies (that needs --delete-excluded or a manual rm); a one-off direct delete already cleared them from Synology.
|
||||
--exclude='/ollama/' # LLM models — re-pullable
|
||||
--exclude='/prometheus-backup/' # metrics TSDB snapshots
|
||||
--exclude='/audiblez/' # generated audiobooks
|
||||
--exclude='/ebook2audiobook/' # generated audiobooks
|
||||
|
||||
# ---- Synology / Windows / macOS cruft ----
|
||||
--exclude='/@eaDir/'
|
||||
--exclude='*@synoeastream'
|
||||
--exclude='/.DS_Store'
|
||||
--exclude='/Thumbs.db'
|
||||
)
|
||||
|
||||
# log:  print a UTC ISO-8601 stamped message to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*" | tee -a "$LOG"
}

# warn: log the message with a "WARN:" tag.
warn() {
  log "WARN: $*"
}
|
||||
|
||||
# Locked manifest append (shared with daily-backup) — see daily-backup.sh
|
||||
# for the rationale. flock prevents interleaved appends when nfs-mirror
|
||||
# (daily 02:00, plus up to 15 min randomized delay) overruns into daily-backup (05:00).
|
||||
MANIFEST_LOCK="${MANIFEST}.lock"
|
||||
# manifest_append: append stdin to ${MANIFEST} while holding an exclusive
# flock on ${MANIFEST_LOCK}. File descriptor 200 is open only for the
# duration of the call.
manifest_append() {
  {
    flock -x 200
    cat >> "${MANIFEST}"
  } 200>"${MANIFEST_LOCK}"
}
|
||||
|
||||
# push_metrics [STATUS] [BYTES]: POST the run timestamp, status and byte count
# (both default to 0) to the Pushgateway job ${PUSHGATEWAY_JOB}.
# Push failures are ignored.
push_metrics() {
  local status="${1:-0}" bytes="${2:-0}"
  printf 'nfs_mirror_last_run_timestamp %s\nnfs_mirror_last_status %s\nnfs_mirror_bytes %s\n' \
    "$(date +%s)" "${status}" "${bytes}" \
    | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
      "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null \
    || true
}
|
||||
|
||||
KILLED=""
STAMP=""
# Set once this process has created $LOCKFILE. cleanup must not remove a
# lockfile that belongs to another, still-running instance.
LOCK_HELD=""

# cleanup: EXIT handler. Releases the lock (only if we hold it), removes the
# rsync marker file, and reports status 2 (aborted) when the run was
# interrupted by TERM/INT.
cleanup() {
  if [ -n "$LOCK_HELD" ]; then
    rm -f "$LOCKFILE"
  fi
  if [ -n "$STAMP" ]; then
    rm -f "$STAMP"
  fi
  if [ -n "$KILLED" ]; then
    push_metrics 2 0 # status=2 = aborted
  fi
}
trap cleanup EXIT
trap 'KILLED=1; exit 143' TERM INT

# noclobber makes the redirect fail when the lockfile already exists, so
# creating it doubles as the "is another instance running?" test.
if ! ( set -o noclobber; echo $$ > "$LOCKFILE" ) 2>/dev/null; then
  log "FATAL: another instance running (pid $(cat "$LOCKFILE" 2>/dev/null || echo unknown))"
  exit 1
fi
LOCK_HELD=1
|
||||
|
||||
mountpoint -q /mnt/backup || { log "FATAL: /mnt/backup not mounted"; push_metrics 1 0; exit 1; }
|
||||
[ -d "$SRC" ] || { log "FATAL: source $SRC missing"; push_metrics 1 0; exit 1; }
|
||||
|
||||
log "=== mirror starting: $SRC → $DST ==="
|
||||
log "skip: immich (Synology direct), frigate/temp (no backup), anca-elements, ollama/prometheus-backup/audiblez/ebook2audiobook (regenerable, live-only)"
|
||||
|
||||
# Marker file used to identify files written by this rsync run, so we can append
|
||||
# their paths to the offsite-sync manifest. Touch BEFORE rsync; `find -newer` AFTER.
|
||||
STAMP=$(mktemp)
|
||||
|
||||
RSYNC_RC=0
|
||||
rsync \
|
||||
-rlt --delete -H \
|
||||
--no-perms --no-owner --no-group \
|
||||
--info=stats2 \
|
||||
"${EXCLUDES[@]}" \
|
||||
"$SRC" "$DST" 2>&1 | tee -a "$LOG" || RSYNC_RC=${PIPESTATUS[0]}
|
||||
|
||||
DST_BYTES=$(df -B1 --output=used /mnt/backup | tail -1)
|
||||
|
||||
if [ "$RSYNC_RC" -eq 0 ]; then
|
||||
# Capture files that rsync created/modified and feed them to the offsite-sync
|
||||
# manifest so daily Step 1 incremental picks them up tomorrow morning.
|
||||
# Use -cnewer (ctime), not -newer (mtime): rsync -t preserves SOURCE mtime
|
||||
# on the dest, so freshly-written files with old source mtime look "older"
|
||||
# than $STAMP and -newer misses them. ctime is set when the inode is written,
|
||||
# regardless of -t, so it correctly identifies what this run created.
|
||||
# (Bug hit 2026-05-26 full bypass-list mirror: 800k files copied, manifest
|
||||
# captured only 2 entries → forced a .force-full-sync to recover.)
|
||||
NEW_COUNT=$(find /mnt/backup -cnewer "$STAMP" -type f \
|
||||
! -path '/mnt/backup/.changed-files' \
|
||||
! -path '/mnt/backup/.changed-files.lock' \
|
||||
! -path '/mnt/backup/.lv-pvc-mapping.json' \
|
||||
! -path '/mnt/backup/.nfs-changes.log' \
|
||||
! -path '/mnt/backup/.last-offsite-sync' \
|
||||
! -path '/mnt/backup/.force-full-sync' \
|
||||
-printf '%P\n' 2>/dev/null | tee >(manifest_append) | wc -l)
|
||||
log "=== mirror complete; ${NEW_COUNT} files added to offsite manifest ==="
|
||||
log "/mnt/backup used: $(df -h --output=used /mnt/backup | tail -1 | tr -d ' ')"
|
||||
push_metrics 0 "$DST_BYTES"
|
||||
else
|
||||
log "=== mirror failed: rsync exited $RSYNC_RC ==="
|
||||
push_metrics 1 "$DST_BYTES"
|
||||
exit "$RSYNC_RC"
|
||||
fi
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily local NFS mirror to /mnt/backup
|
||||
|
||||
[Timer]
|
||||
# Daily 02:00 — runs 3h before daily-backup (05:00) so the .changed-files
|
||||
# manifest is populated and offsite-sync (06:00) ships both legs' deltas.
|
||||
# Switched from weekly Mon 04:00 → daily 2026-05-26: steady-state delta is
|
||||
# 10-20 min of mostly-metadata rsync, so the IO cost is negligible and it
|
||||
# cuts non-CronJob app-data RPO from 7d to ~24h (matters for nextcloud
|
||||
# shared files, audiobookshelf library, mailserver Maildir, etc.).
|
||||
OnCalendar=*-*-* 02:00:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=15min
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Simple and reliable containerd registry mirror manager
|
||||
# Usage: ./registry-mirror.sh [--add|--remove] [mirror_url]
|
||||
# Docs - https://github.com/containerd/containerd/blob/main/docs/cri/registry.md
|
||||
# To apply on all worker nodes (`tail -n +3` skips the header row and the first listed node, i.e. the master):
|
||||
# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --add http://10.0.20.10:5000"; done
|
||||
# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --remove http://10.0.20.10:5000"; done
|
||||
|
||||
# Strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail

# containerd configuration that is edited in place, and the copy taken
# before every modification.
CONFIG_FILE="/etc/containerd/config.toml"
BACKUP_FILE="/etc/containerd/config.toml.bak"

# Preconditions: the config must exist and we must be root to edit it and
# restart containerd.
if [ ! -f "$CONFIG_FILE" ]; then
  echo "Error: $CONFIG_FILE not found" >&2
  exit 1
fi

if [ "$(id -u)" -ne 0 ]; then
  echo "Error: Requires root privileges" >&2
  exit 1
fi
|
||||
|
||||
#######################################
# Add a docker.io registry mirror endpoint to the containerd config.
#
# The config is backed up first. Nothing is changed when an endpoint line
# already lists the URL. Otherwise the endpoint is inserted:
#   1. directly below an existing docker.io mirrors section header, else
#   2. as a new docker.io section directly below the registry.mirrors
#      header, else
#   3. as a new docker.io section appended to the end of the file.
#
# Headers and the URL are matched as literal text (awk index()), so the
# dots, quotes and slashes they contain need no regex or shell escaping.
# The previous sed/grep patterns closed the shell's double quotes at every
# inner '"', so they never matched the real headers.
#
# NOTE(review): when the docker.io section already has an endpoint line,
# this adds a second `endpoint` key (as the original intended). Confirm that
# containerd accepts that, or merge into the existing array instead.
#
# Globals:   CONFIG_FILE (read, rewritten), BACKUP_FILE (written)
# Arguments: $1 - mirror URL, e.g. http://10.0.20.10:5000
# Outputs:   one status line on stdout
# Returns:   0 on success or if already present, 1 on failure
#######################################
add_mirror() {
  local mirror_url="$1"
  local mirrors_hdr='[plugins."io.containerd.grpc.v1.cri".registry.mirrors]'
  local docker_hdr='[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]'
  local updated

  # Create backup
  cp -p "$CONFIG_FILE" "$BACKUP_FILE" || return 1

  # Check if mirror already exists (literal match on the quoted URL).
  if MIRROR_URL="$mirror_url" awk '
    /endpoint = \[/ && index($0, "\"" ENVIRON["MIRROR_URL"] "\"") { found = 1 }
    END { exit !found }
  ' "$CONFIG_FILE"; then
    echo "Mirror already exists: $mirror_url"
    return 0
  fi

  # The file is read twice: pass 1 picks the anchor header, pass 2 prints
  # the file and inserts the new lines after the first anchor occurrence.
  updated=$(
    MIRROR_URL="$mirror_url" MIRRORS_HDR="$mirrors_hdr" DOCKER_HDR="$docker_hdr" awk '
      BEGIN {
        docker_hdr = ENVIRON["DOCKER_HDR"]
        mirrors_hdr = ENVIRON["MIRRORS_HDR"]
        endpoint = "  endpoint = [\"" ENVIRON["MIRROR_URL"] "\"]"
      }
      NR == FNR {
        # An existing docker.io section always wins over the parent header.
        if (index($0, docker_hdr) == 1) anchor = docker_hdr
        else if (anchor == "" && index($0, mirrors_hdr) == 1) anchor = mirrors_hdr
        next
      }
      { print }
      !done && anchor != "" && index($0, anchor) == 1 {
        if (anchor == mirrors_hdr) { print ""; print docker_hdr }
        print endpoint
        done = 1
      }
      END {
        # No usable header at all: append a complete new section.
        if (!done) { print ""; print docker_hdr; print endpoint }
      }
    ' "$CONFIG_FILE" "$CONFIG_FILE"
  ) || return 1

  printf '%s\n' "$updated" > "$CONFIG_FILE" || return 1

  echo "Added mirror: $mirror_url"
}
|
||||
|
||||
#######################################
# Remove a registry mirror endpoint from the containerd config.
#
# The config is backed up first. Then, in a single awk pass:
#   - every `endpoint = [...]` line that lists the quoted URL is dropped;
#   - the docker.io mirrors section header is dropped only if the section
#     has no non-blank lines left (other settings in it are preserved);
#   - runs of blank lines are squeezed to a single blank line.
#
# The URL is matched as literal text. The previous implementation spliced
# it into a sed `/.../d` address, where the slashes of "http://" terminated
# the address and made sed abort. Its cleanup step also deleted every line
# of the docker.io section, not just an empty one.
#
# Globals:   CONFIG_FILE (read, rewritten), BACKUP_FILE (written)
# Arguments: $1 - mirror URL to remove
# Outputs:   one status line on stdout
# Returns:   0 on success, 1 on failure
#######################################
remove_mirror() {
  local mirror_url="$1"
  local docker_hdr='[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]'
  local updated

  # Create backup
  cp -p "$CONFIG_FILE" "$BACKUP_FILE" || return 1

  updated=$(
    MIRROR_URL="$mirror_url" DOCKER_HDR="$docker_hdr" awk '
      # Print a line, collapsing consecutive empty lines into one.
      function emit(line) {
        if (line == "" && prev_blank) return
        print line
        prev_blank = (line == "")
      }
      # End of a buffered docker.io section: keep it only if content remains.
      function flush_section(    i) {
        if (kept > 0) {
          emit(docker_hdr)
          for (i = 1; i <= n; i++) emit(buf[i])
        }
        in_docker = 0; n = 0; kept = 0
      }
      BEGIN {
        docker_hdr = ENVIRON["DOCKER_HDR"]
        needle = "\"" ENVIRON["MIRROR_URL"] "\""
      }
      # Remove the specific mirror URL wherever its endpoint line appears.
      /endpoint = \[/ && index($0, needle) { next }
      # Any new table header closes the docker.io section being buffered.
      in_docker && /^\[/ { flush_section() }
      $0 == docker_hdr { in_docker = 1; next }
      in_docker {
        buf[++n] = $0
        if ($0 !~ /^[ \t]*$/) kept++
        next
      }
      { emit($0) }
      END { if (in_docker) flush_section() }
    ' "$CONFIG_FILE"
  ) || return 1

  printf '%s\n' "$updated" > "$CONFIG_FILE" || return 1

  echo "Removed mirror: $mirror_url"
}
|
||||
|
||||
# Restart the containerd systemd unit so a changed config takes effect.
# Outputs: progress on stdout, failure message on stderr.
# Returns: 0 if `systemctl restart containerd` succeeded, 1 otherwise.
restart_containerd() {
  echo "Restarting containerd..."

  if ! systemctl restart containerd; then
    echo "Error: Failed to restart containerd" >&2
    return 1
  fi

  echo "Successfully restarted containerd"
  return 0
}
|
||||
|
||||
# Entry point: dispatch on the requested action.
# "${1:-}" / "${2:-}" are used because the script runs under `set -u`:
# a bare "$1" / "$2" would abort with "unbound variable" before the usage
# text or the "Mirror URL required" error could be printed.
case "${1:-}" in
  --add)
    if [ -z "${2:-}" ]; then
      echo "Error: Mirror URL required" >&2
      exit 1
    fi
    add_mirror "$2"
    restart_containerd || exit 1
    ;;
  --remove)
    if [ -z "${2:-}" ]; then
      echo "Error: Mirror URL required" >&2
      exit 1
    fi
    remove_mirror "$2"
    restart_containerd || exit 1
    ;;
  *)
    echo "Usage: $0 [--add|--remove] [mirror_url]" >&2
    echo "Examples:" >&2
    echo " Add mirror: $0 --add https://registry.example.com" >&2
    echo " Remove mirror: $0 --remove https://registry.example.com" >&2
    exit 1
    ;;
esac

exit 0
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily offsite sync: sda + NFS changes to Synology
|
||||
After=network-online.target daily-backup.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/offsite-sync-backup
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=offsite-sync-backup
|
||||
TimeoutStartSec=7200
|
||||
|
|
@ -1,187 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# offsite-sync-backup — Sync backups to Synology NAS
|
||||
# Deploy to PVE host at /usr/local/bin/offsite-sync-backup
|
||||
# Schedule: Daily 06:00 via systemd timer (After=daily-backup.service)
|
||||
#
|
||||
# Two sync paths:
|
||||
# Step 1: sda (/mnt/backup) → Synology pve-backup/ (PVC snapshots, pfsense, pve-config, sqlite)
|
||||
# Step 2: NFS (/srv/nfs, /srv/nfs-ssd) → Synology nfs/, nfs-ssd/ (inotify change-tracked)
|
||||
set -euo pipefail
|
||||
|
||||
# --- Configuration ---
|
||||
BACKUP_ROOT="/mnt/backup"
|
||||
SYNOLOGY="Administrator@192.168.1.13"
|
||||
PVE_BACKUP_DEST="${SYNOLOGY}:/volume1/Backup/Viki/pve-backup"
|
||||
NFS_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs"
|
||||
NFS_SSD_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs-ssd"
|
||||
MANIFEST="${BACKUP_ROOT}/.changed-files"
|
||||
NFS_CHANGE_LOG="${BACKUP_ROOT}/.nfs-changes.log"
|
||||
PUSHGATEWAY="${OFFSITE_SYNC_PUSHGATEWAY:-http://10.0.20.100:30091}"
|
||||
PUSHGATEWAY_JOB="offsite-backup-sync"
|
||||
LOCKFILE="/run/offsite-sync-backup.lock"
|
||||
|
||||
# --- Logging ---
|
||||
# log MESSAGE... — write a "[YYYY-MM-DD HH:MM:SS] message" line to stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# warn MESSAGE... — same as log, prefixed with "WARN: " and sent to stderr.
warn() {
  log "WARN: $*" >&2
}
|
||||
|
||||
# --- Locking ---
|
||||
# Remove our lockfile; installed as the EXIT handler once the lock is held.
cleanup() { rm -f "${LOCKFILE}"; }

# Take the lock atomically: with noclobber the redirect fails if the file
# already exists. The subshell keeps noclobber from leaking into the script.
if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
  log "FATAL: Another instance running"; exit 1
fi

# Register the cleanup trap only AFTER the lock is ours. If it were set
# before the attempt, an instance that lost the race would delete the
# running instance's lockfile on its way out.
trap cleanup EXIT
|
||||
|
||||
# --- Main ---
|
||||
log "=== Offsite sync starting ==="
# Overall result of the run; set to 1 by any failed transfer below.
STATUS=0

# The local backup disk must be mounted, otherwise there is nothing to ship.
mountpoint -q "${BACKUP_ROOT}" || {
  log "FATAL: ${BACKUP_ROOT} is not mounted"
  exit 1
}

# Probe the Synology over SSH (non-interactive, short timeouts). When it is
# unreachable, report a zero success timestamp to the Pushgateway
# (best effort) and give up.
if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 "${SYNOLOGY}" true 2>/dev/null; then
  :
else
  log "FATAL: Cannot SSH to Synology"
  curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
    "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null \
    <<< "backup_last_success_timestamp 0" || true
  exit 1
fi

# Day of month (zero-padded), used to select the monthly full-sync window.
DAY_OF_MONTH=$(date +%d)
|
||||
|
||||
# ============================================================
|
||||
# STEP 1: sda → Synology pve-backup/ (PVC snapshots, pfsense, pve-config)
|
||||
# ============================================================
|
||||
log "--- Step 1: sda → Synology pve-backup/ ---"

# Trigger: monthly cleanup window OR daily-backup signalled the manifest grew
# past its cap (Synology was unreachable too long for incremental to keep up).
# NOTE(review): the window test is "day of month <= 7", so with a daily timer
# the full sync runs on each of days 1-7, not only on the "1st Sunday" the
# log line mentions — confirm which is intended.
FORCE_FULL_FLAG="${BACKUP_ROOT}/.force-full-sync"
FORCE_FULL=""
if [ -f "${FORCE_FULL_FLAG}" ]; then
  FORCE_FULL=1
fi

if [ "${DAY_OF_MONTH}" -le 7 ] || [ -n "${FORCE_FULL}" ]; then
  if [ -n "${FORCE_FULL}" ]; then
    log "Forced full sync (manifest size cap tripped)..."
  else
    log "Monthly full sync (1st Sunday)..."
  fi
  # No -z on LAN: gigabit hop to 192.168.1.13 doesn't benefit from compression
  # and burns CPU on the PVE host that's already busy with cluster IO.
  if rsync -rlt --delete --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r \
    --exclude='.changed-files' \
    --exclude='.changed-files.lock' \
    --exclude='.last-offsite-sync' \
    --exclude='.lv-pvc-mapping.json' \
    --exclude='.nfs-changes.log' \
    --exclude='.force-full-sync' \
    --exclude='/anca-elements/' \
    "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1; then
    # Clear the force flag only once the full sync really succeeded, so a
    # failed forced sync is retried as a full sync on the next run instead
    # of silently degrading to an incremental.
    rm -f "${FORCE_FULL_FLAG}"
  else
    STATUS=1
  fi
elif [ -s "${MANIFEST}" ]; then
  MANIFEST_LINES=$(wc -l < "${MANIFEST}")
  log "Incremental sync (${MANIFEST_LINES} files from manifest)..."
  # anca-elements: now in Immich (canonical); /mnt/backup copy deleted
  # 2026-05-26. Exclude retained as a safety belt in case it re-appears.
  rsync -rlt --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r --files-from="${MANIFEST}" \
    --exclude='anca-elements/' \
    "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1 || STATUS=1
else
  log "No changed files in manifest, nothing to sync"
fi
|
||||
|
||||
# ============================================================
|
||||
# STEP 2: NFS → Synology nfs/ + nfs-ssd/ (inotify change-tracked, FILTERED)
|
||||
# ============================================================
|
||||
#
|
||||
# DESIGN: Step 2 only carries paths that BYPASS the sda mirror. As of
|
||||
# 2026-05-26 that's just /srv/nfs/immich/ (1.5T, doesn't fit on sda).
|
||||
# Everything else under /srv/nfs/ now flows through sda via nfs-mirror,
|
||||
# reaching Synology via Step 1 (sda → pve-backup/). frigate and temp are
|
||||
# excluded from both legs — intentionally NOT backed up.
|
||||
#
|
||||
# nfs-ssd: as of 2026-06-01 this leg is ALSO immich-only. ollama (59G) and
|
||||
# llamacpp (26G) on the SSD were filling the offsite Synology (5.3T hit 97%)
|
||||
# for re-pullable model blobs, so they're dropped — live copy stays on the
|
||||
# SSD, no offsite. The monthly --delete pass below reaps them from Synology
|
||||
# nfs-ssd/; a one-off direct delete cleared the bulk on 2026-06-01.
|
||||
#
|
||||
# Keep this aligned with /usr/local/bin/nfs-mirror's EXCLUDES. Both legs now
|
||||
# carry immich only; everything else is either curated through sda (Step 1)
|
||||
# or intentionally live-only (frigate, temp, ollama, llamacpp, audiblez,
|
||||
# ebook2audiobook, prometheus-backup).
|
||||
log "--- Step 2: NFS → Synology (immich-only on both nfs/ and nfs-ssd/) ---"

# Regex matching paths NOT on sda (must reach Synology directly).
NFS_SDA_BYPASS_RE='^/srv/nfs/immich/'

# rsync include/exclude args for the monthly full sync (HDD).
NFS_FULL_INCLUDES=(
  --include='/immich/' --include='/immich/***'
  --exclude='*'
)

# 1 while every Step 2 transfer has succeeded. The change log is only
# truncated when this is still 1, so events from a failed run are retried
# next time instead of being lost (mirrors how the Step 1 manifest is kept
# on error).
NFS_STEP_OK=1

if [ "${DAY_OF_MONTH}" -le 7 ]; then
  # Monthly: full sync with --delete for cleanup, restricted to bypass-list.
  # --delete here will reap legacy dirs on Synology (frigate, ollama,
  # audiblez, ebook2audiobook, *-backup, prometheus, loki, temp,
  # alertmanager) since they're no longer in NFS_FULL_INCLUDES.
  log "Monthly full NFS sync (immich-only — reaps legacy bypass dirs)..."
  if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
    log " OK: nfs/ full sync (immich-only)"
  else
    warn "nfs/ full sync failed"
    STATUS=1
    NFS_STEP_OK=0
  fi
  # nfs-ssd: immich-only (2026-06-01) — --delete reaps legacy ollama/llamacpp.
  if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
    log " OK: nfs-ssd/ full sync (immich-only)"
  else
    warn "nfs-ssd/ full sync failed"
    STATUS=1
    NFS_STEP_OK=0
  fi
  if [ "${NFS_STEP_OK}" -eq 1 ]; then
    : > "${NFS_CHANGE_LOG}"
  else
    warn "NFS change log preserved for retry"
  fi
elif [ -s "${NFS_CHANGE_LOG}" ]; then
  # Incremental: only sync changed files matching the bypass leg (immich).
  # Work files live in a private mktemp directory rather than fixed names
  # in /tmp, which another local user could pre-create or symlink.
  NFS_TMP=$(mktemp -d)
  NFS_DEDUPED="${NFS_TMP}/nfs-changes-deduped"
  NFS_LIST="${NFS_TMP}/sync-nfs.list"
  NFS_SSD_LIST="${NFS_TMP}/sync-nfs-ssd.list"

  sort -u "${NFS_CHANGE_LOG}" > "${NFS_DEDUPED}"

  # HDD NFS — include only /srv/nfs/immich/ paths.
  # `|| true` is REQUIRED: if the last iteration's `[ -f "$f" ]` is false
  # (file was deleted between inotify capture and now — e.g., immich
  # encoded-video temp file that got cleaned up), the while loop returns
  # 1, pipefail propagates, and `set -e` kills the script silently before
  # reaching the rsync. Matches the SSD section's pattern below.
  grep -E "${NFS_SDA_BYPASS_RE}" "${NFS_DEDUPED}" | \
    while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs/}"; done \
    > "${NFS_LIST}" 2>/dev/null || true
  NFS_COUNT=$(wc -l < "${NFS_LIST}" 2>/dev/null || echo 0)
  if [ "${NFS_COUNT:-0}" -gt 0 ]; then
    if rsync -rlt --files-from="${NFS_LIST}" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
      log " OK: nfs/ (${NFS_COUNT} immich files)"
    else
      warn "nfs/ incremental failed"
      STATUS=1
      NFS_STEP_OK=0
    fi
  fi

  # SSD NFS — immich-only (2026-06-01); ollama/llamacpp are live-only, no offsite.
  grep '^/srv/nfs-ssd/immich/' "${NFS_DEDUPED}" | \
    while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs-ssd/}"; done \
    > "${NFS_SSD_LIST}" 2>/dev/null || true
  SSD_COUNT=$(wc -l < "${NFS_SSD_LIST}" 2>/dev/null || echo 0)
  if [ "${SSD_COUNT:-0}" -gt 0 ]; then
    if rsync -rlt --files-from="${NFS_SSD_LIST}" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
      log " OK: nfs-ssd/ (${SSD_COUNT} files)"
    else
      warn "nfs-ssd/ incremental failed"
      STATUS=1
      NFS_STEP_OK=0
    fi
  fi

  TOTAL=$(wc -l < "${NFS_DEDUPED}")
  log " Processed ${TOTAL} change events (${NFS_COUNT} nfs/immich + ${SSD_COUNT} nfs-ssd files synced)"
  if [ "${NFS_STEP_OK}" -eq 1 ]; then
    : > "${NFS_CHANGE_LOG}"
  else
    warn "NFS change log preserved for retry"
  fi
  rm -rf -- "${NFS_TMP:?}"
else
  log " No NFS changes to sync"
fi
|
||||
|
||||
# ============================================================
|
||||
# Finish
|
||||
# ============================================================
|
||||
if [ "${STATUS}" -eq 0 ]; then
  # Success: stamp the run and reset the Step 1 manifest.
  touch "${BACKUP_ROOT}/.last-offsite-sync"
  : > "${MANIFEST}"
  log "=== Offsite sync complete (success) ==="
else
  warn "Offsite sync had errors — manifest preserved for retry"
  log "=== Offsite sync complete (with errors) ==="
fi

# Report to the Pushgateway (best effort; a metrics failure never fails the
# run). backup_last_success_timestamp is only advanced when the run actually
# succeeded. On failure only the status is pushed; curl --data-binary sends
# a POST, which per the Pushgateway docs replaces only the metric names it
# contains, so the previous success timestamp stays in place and
# staleness alerts can fire.
{
  if [ "${STATUS}" -eq 0 ]; then
    echo "backup_last_success_timestamp $(date +%s)"
  fi
  echo "offsite_sync_last_status ${STATUS}"
} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true

exit "${STATUS}"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily offsite sync: sda + NFS changes to Synology
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 06:00:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=300
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,89 +0,0 @@
|
|||
#!/bin/sh
|
||||
# parse-postmortem-todos.sh — Extract auto-implementable TODOs from a post-mortem markdown file
|
||||
# Usage: bash scripts/parse-postmortem-todos.sh docs/post-mortems/2026-04-14-foo.md
|
||||
# Output: JSON with file path and list of TODOs
|
||||
#
|
||||
# Supports two table formats:
|
||||
# New: | Priority | Action | Type | Details | Status |
|
||||
# Old: | Action | Status | Details | (infers type from action text)
|
||||
set -eu

# First argument: path to the post-mortem markdown file (required).
PM_FILE="${1:?Usage: $0 <post-mortem.md>}"

if [ ! -f "$PM_FILE" ]; then
  echo '{"file": "", "todos": [], "error": "File not found"}' >&2
  exit 1
fi

# The parser is fed through a quoted heredoc so the shell never expands
# anything inside the Python source (the former double-quoted `python3 -c`
# string would break on any `$`, backtick or double quote added later).
# With `python3 -`, sys.argv[1] is still the post-mortem path.
python3 - "$PM_FILE" <<'PY'
import re, json, sys

pm_file = sys.argv[1]
with open(pm_file) as f:
    content = f.read()

# Only TODOs of these types are reported under 'todos'; the rest go to 'skipped'.
safe_types = {'Alert', 'Config', 'Monitor'}

# Ordered (type, keyword regexes) table used to infer a type from the action
# text of old-format rows; first match wins. Keywords are regular
# expressions, so 'add.*option' matches "add ... option" (it was previously
# tested as a literal substring and could never match).
TYPE_KEYWORDS = [
    ('Alert', ['prometheusrule', 'alert', 'alerting']),
    ('Monitor', ['uptime kuma', 'monitor', 'ping', 'tcp check']),
    ('Config', ['config', 'manage', r'add.*option', 'document', r'nfs\.conf']),
    ('Migration', ['migrate', 'move']),
    ('Investigation', ['review', 'investigate', 'verify']),
]


def infer_type(action_lower):
    """Return the first type whose keywords occur in the lower-cased action."""
    for todo_type, keywords in TYPE_KEYWORDS:
        if any(re.search(kw, action_lower) for kw in keywords):
            return todo_type
    return 'Config'  # default to Config for ambiguous items


todos = []

# Format 1 (new template): | Priority | Action | Type | Details | Status |
pattern_new = r'\|\s*(P[0-3])\s*\|\s*(.+?)\s*\|\s*(\w+)\s*\|\s*(.+?)\s*\|\s*TODO\s*\|'
for priority, action, todo_type, details in re.findall(pattern_new, content):
    todos.append({
        'priority': priority.strip(),
        'action': action.strip(),
        'type': todo_type.strip(),
        'details': details.strip(),
        'safe': todo_type.strip() in safe_types
    })

# Format 2 (old): | Action | TODO/Done | Details |  or  | Action | Owner | Status |
# Only tried when the new format produced nothing.
if not todos:
    pattern_old = r'\|\s*(.+?)\s*\|\s*TODO\s*\|\s*(.+?)\s*\|'
    for action, details in re.findall(pattern_old, content):
        action = action.strip()
        details = details.strip()
        # Skip header rows and clean up leading pipes
        if action.startswith('--') or action.lower() == 'action':
            continue
        action = action.lstrip('| ').strip()
        todo_type = infer_type(action.lower())

        # Old-format rows carry no priority column; use a fixed default.
        priority = 'P2'
        todos.append({
            'priority': priority,
            'action': action,
            'type': todo_type,
            'details': details,
            'safe': todo_type in safe_types
        })

safe_todos = [t for t in todos if t['safe']]
unsafe_todos = [t for t in todos if not t['safe']]

result = {
    'file': pm_file,
    'todos': safe_todos,
    'skipped': unsafe_todos,
    'total_todos_in_doc': len(todos),
    'safe_todos': len(safe_todos),
    'skipped_todos': len(unsafe_todos)
}

print(json.dumps(result, indent=2))
PY
|
||||
|
|
@ -1,236 +0,0 @@
|
|||
<?php
|
||||
// pfSense HAProxy bootstrap — configures the mailserver PROXY-v2 path
|
||||
// (bd code-yiu, Phases 2/3 + 5).
|
||||
//
|
||||
// WHY THIS EXISTS
|
||||
// pfSense HAProxy config is stored XML-in-`/cf/conf/config.xml` under
|
||||
// `<installedpackages><haproxy>`. That file IS picked up by the nightly
|
||||
// `daily-backup` on the PVE host (see `scripts/daily-backup.sh` → `scp
|
||||
// root@10.0.20.1:/cf/conf/config.xml`) and synced to Synology. This script
|
||||
// is the canonical reproducer: run it to rebuild the pfSense HAProxy config
|
||||
// from scratch (DR restore, fresh pfSense install, etc.).
|
||||
//
|
||||
// WHAT IT BUILDS
|
||||
// 4 backend pools — one per mail port:
|
||||
// mailserver_nodes_smtp → k8s-node1..4:30125 (container :2525 postscreen)
|
||||
// mailserver_nodes_smtps → k8s-node1..4:30126 (container :4465 smtps)
|
||||
// mailserver_nodes_sub → k8s-node1..4:30127 (container :5587 submission)
|
||||
// mailserver_nodes_imaps → k8s-node1..4:30128 (container :10993 IMAPS)
|
||||
// Each server uses `send-proxy-v2` and is health-checked every 5s (checkinter 5000).
|
||||
// 4 frontends on pfSense 10.0.20.1:{25,465,587,993} TCP mode.
|
||||
// + 1 legacy test frontend on :2525 (kept for validation; safe to remove later).
|
||||
//
|
||||
// USAGE (on pfSense host, via SSH as admin)
|
||||
// scp infra/scripts/pfsense-haproxy-bootstrap.php admin@10.0.20.1:/tmp/
|
||||
// ssh admin@10.0.20.1 'php /tmp/pfsense-haproxy-bootstrap.php'
|
||||
//
|
||||
// IDEMPOTENCY
|
||||
// Removes any existing entries named mailserver_* before re-adding, so
|
||||
// repeat runs are safe and behave as reset-to-declared.
|
||||
|
||||
// pfSense core config API plus the HAProxy package helpers used below.
require_once('/etc/inc/config.inc');
require_once('/usr/local/pkg/haproxy/haproxy.inc');
require_once('/usr/local/pkg/haproxy/haproxy_utils.inc');

global $config;
parse_config(true);

// Make sure the HAProxy package subtree exists, then edit it through a
// reference so every change below lands in $config and is persisted by the
// later write_config() call.
if (!is_array($config['installedpackages']['haproxy'])) {
    $config['installedpackages']['haproxy'] = [];
}
$h = &$config['installedpackages']['haproxy'];

// Global HAProxy package settings.
$h['enable'] = 'yes';
$h['maxconn'] = '1000';

// Names of the objects this script owns. Existing pools/frontends with
// these names are removed before being re-added (reset-to-declared).
$POOL_NAMES = [
    'mailserver_nodes',           // legacy (Phase 2/3 test)
    'mailserver_nodes_smtp',
    'mailserver_nodes_smtps',
    'mailserver_nodes_sub',
    'mailserver_nodes_imaps',
];
$FRONTEND_NAMES = [
    'mailserver_proxy_test',      // legacy (Phase 2/3 test, :2525)
    'mailserver_proxy_25',
    'mailserver_proxy_465',
    'mailserver_proxy_587',
    'mailserver_proxy_993',
];

// k8s workers as [name, address] pairs. Not listed: master (control-plane)
// and node5 (doesn't exist in this topology).
$NODES = [
    ['k8s-node1', '10.0.20.101'],
    ['k8s-node2', '10.0.20.102'],
    ['k8s-node3', '10.0.20.103'],
    ['k8s-node4', '10.0.20.104'],
];
|
||||
|
||||
// Build a pool with optional split healthcheck path.
|
||||
//
|
||||
// $check_port: if non-null, HAProxy sends health probes to that NodePort
|
||||
// (which Service `mailserver-proxy` maps to the pod's stock no-PROXY
|
||||
// listener — see infra/stacks/mailserver/.../mailserver_proxy ports
|
||||
// 30145/30146/30147). Real client traffic still goes to $nodeport with
|
||||
// PROXY v2 framing.
|
||||
// $check_type: 'TCP' for plain accept-on-port checks, 'ESMTP' for
|
||||
// `option smtpchk EHLO <monitor_domain>` (real SMTP banner+EHLO+250).
|
||||
//
|
||||
// Why split: smtpd-proxy587/4465 fatal on every PROXY-v2-aware health
|
||||
// probe with `smtpd_peer_hostaddr_to_sockaddr: ... Servname not supported`
|
||||
// — the daemon respawns get throttled by Postfix master and real clients
|
||||
// land mid-respawn → 6s TCP timeout. Routing health probes to the stock
|
||||
// no-PROXY port sidesteps the bug entirely while data path still gets
|
||||
// PROXY v2 for CrowdSec/Postfix client-IP visibility. The HAProxy package
|
||||
// has no `checkport` field, so `port N` is appended via the server's
|
||||
// `advanced` string (HAProxy parses server keywords in any order).
|
||||
/**
 * Build one HAProxy backend pool entry in the pfSense package's array shape.
 *
 * @param string      $name           Pool name.
 * @param string      $nodeport       Port traffic is sent to on every node.
 * @param array       $nodes          List of [server name, address] pairs.
 * @param string      $check_type     Stored as the pool's 'check_type'.
 * @param string|null $check_port     When non-null, " port <N>" is appended
 *                                    to each server's 'advanced' string.
 * @param string      $monitor_domain Stored as the pool's 'monitor_domain'.
 * @return array Pool definition with its servers under ['ha_servers']['item'].
 */
function build_pool(
    string $name,
    string $nodeport,
    array $nodes,
    string $check_type = 'TCP',
    ?string $check_port = null,
    string $monitor_domain = ''
): array {
    // Every server sends PROXY v2; the optional check port rides along in
    // the same free-form keyword string.
    $server_keywords = 'send-proxy-v2';
    if ($check_port !== null) {
        $server_keywords .= " port {$check_port}";
    }

    $to_server = fn($node) => [
        'name'       => $node[0],
        'address'    => $node[1],
        'port'       => $nodeport,
        'weight'     => '10',
        'ssl'        => '',
        // 5s = sub-block-window failover when a NodePort goes sour.
        // Safe to be aggressive once health probes don't fatal smtpd.
        'checkinter' => '5000',
        'advanced'   => $server_keywords,
        'status'     => 'active',
    ];

    return [
        'name'                   => $name,
        'balance'                => 'roundrobin',
        'check_type'             => $check_type,
        'monitor_domain'         => $monitor_domain,
        'checkinter'             => '5000',
        'retries'                => '3',
        'ha_servers'             => ['item' => array_map($to_server, array_values($nodes))],
        'advanced_bind'          => '',
        'persist_cookie_enabled' => '',
        'transparent_clientip'   => '',
        'advanced'               => '',
    ];
}
|
||||
|
||||
/**
 * Build one TCP-mode HAProxy frontend entry in the pfSense package's array
 * shape, with a single listen address and a default backend pool.
 *
 * @param string $name    Frontend name.
 * @param string $descr   Description text.
 * @param string $extaddr Listen address.
 * @param string $port    Listen port.
 * @param string $pool    Name of the backend pool to forward to.
 * @return array Frontend definition.
 */
function build_frontend(string $name, string $descr, string $extaddr, string $port, string $pool): array {
    $listener = [
        'extaddr'          => $extaddr,
        'extaddr_port'     => $port,
        'extaddr_ssl'      => '',
        'extaddr_advanced' => '',
    ];

    $frontend = [
        'name'      => $name,
        'descr'     => $descr,
        'status'    => 'active',
        'secondary' => '',
        'type'      => 'tcp',
    ];
    $frontend['a_extaddr'] = ['item' => [$listener]];
    $frontend['backend_serverpool'] = $pool;
    $frontend['ha_acls'] = '';
    $frontend['dontlognull'] = '';
    $frontend['httpclose'] = '';
    $frontend['forwardfor'] = '';
    $frontend['advanced'] = '';

    return $frontend;
}
|
||||
|
||||
// ── Backend pools ───────────────────────────────────────────────────────
|
||||
// Ensure the pool list exists, then drop every pool we own so the
// declarations below start from a clean slate.
if (!is_array($h['ha_pools'])) {
    $h['ha_pools'] = ['item' => []];
}
if (!is_array($h['ha_pools']['item'])) {
    $h['ha_pools']['item'] = [];
}
$foreign_pools = [];
foreach ($h['ha_pools']['item'] as $existing_pool) {
    if (!in_array($existing_pool['name'] ?? '', $POOL_NAMES, true)) {
        $foreign_pools[] = $existing_pool;
    }
}
$h['ha_pools']['item'] = $foreign_pools;

// Declared pools as [name, traffic NodePort, check type, check NodePort].
//
// First row: legacy test pool (still used by the :2525 test frontend for
// manual SMTP roundtrip).
//
// Remaining rows: production pools — one per mail port.
//
// All SMTP/SMTPS/Submission backends use plain TCP checks against
// dedicated non-PROXY healthcheck NodePorts (30145/30146/30147 → pod
// stock 25/465/587) so probes hit the no-PROXY listeners and avoid
// the smtpd_peer_hostaddr_to_sockaddr fatal that fires on PROXY-v2
// LOCAL frames. Real client traffic still goes to 30125-30128 with
// PROXY v2 for client-IP visibility.
//
// We tried `option smtpchk EHLO` initially — it works on the plain
// `submission` daemon (587) but flaps the `postscreen` listener on
// port 25 (multi-line greet + DNSBL silence + anti-pre-greet
// detection makes HAProxy's simple smtpchk parser hit L7RSP). A
// plain TCP accept-on-port check is enough for both: HAProxy still
// gets fast failover when the listener actually goes away, and we
// stop triggering the Postfix fatal entirely.
//
// IMAPS stays on its existing TCP-check-with-PROXY-frame for now —
// Dovecot's PROXY parser doesn't show the same fatal pattern; adding
// a separate IMAP healthcheck path would require another svc port.
$pool_specs = [
    ['mailserver_nodes',       '30125', 'TCP', null],
    ['mailserver_nodes_smtp',  '30125', 'TCP', '30145'],
    ['mailserver_nodes_smtps', '30126', 'TCP', '30146'],
    ['mailserver_nodes_sub',   '30127', 'TCP', '30147'],
    ['mailserver_nodes_imaps', '30128', 'TCP', null],
];
foreach ($pool_specs as [$spec_name, $spec_nodeport, $spec_check_type, $spec_check_port]) {
    $h['ha_pools']['item'][] = build_pool($spec_name, $spec_nodeport, $NODES, $spec_check_type, $spec_check_port);
}
|
||||
|
||||
// ── Frontends ───────────────────────────────────────────────────────────
|
||||
// Ensure the frontend list exists (the package stores frontends under
// 'ha_backends'), then drop every frontend we own before re-declaring.
if (!is_array($h['ha_backends'])) {
    $h['ha_backends'] = ['item' => []];
}
if (!is_array($h['ha_backends']['item'])) {
    $h['ha_backends']['item'] = [];
}
$foreign_frontends = [];
foreach ($h['ha_backends']['item'] as $existing_frontend) {
    if (!in_array($existing_frontend['name'] ?? '', $FRONTEND_NAMES, true)) {
        $foreign_frontends[] = $existing_frontend;
    }
}
$h['ha_backends']['item'] = $foreign_frontends;

// Declared frontends as [name, description, listen port, backend pool].
// All listen on pfSense VLAN20 IP 10.0.20.1.
//
// First row: legacy test frontend — :2525 — retained so SMTP roundtrip
// tests keep working without touching the real :25. Safe to remove once
// fully validated. Remaining rows: the 4 production frontends.
$frontend_specs = [
    [
        'mailserver_proxy_test',
        'code-yiu Phase 2/3 test — PROXY v2 to k8s mailserver NodePort 30125 (alt port :2525)',
        '2525',
        'mailserver_nodes',
    ],
    [
        'mailserver_proxy_25',
        'code-yiu Phase 4/5 — external SMTP (:25) via PROXY v2 → pod :2525 postscreen',
        '25',
        'mailserver_nodes_smtp',
    ],
    [
        'mailserver_proxy_465',
        'code-yiu Phase 4/5 — external SMTPS (:465) via PROXY v2 → pod :4465 smtpd',
        '465',
        'mailserver_nodes_smtps',
    ],
    [
        'mailserver_proxy_587',
        'code-yiu Phase 4/5 — external submission (:587) via PROXY v2 → pod :5587 smtpd',
        '587',
        'mailserver_nodes_sub',
    ],
    [
        'mailserver_proxy_993',
        'code-yiu Phase 4/5 — external IMAPS (:993) via PROXY v2 → pod :10993 Dovecot',
        '993',
        'mailserver_nodes_imaps',
    ],
];
foreach ($frontend_specs as [$fe_name, $fe_descr, $fe_port, $fe_pool]) {
    $h['ha_backends']['item'][] = build_frontend($fe_name, $fe_descr, '10.0.20.1', $fe_port, $fe_pool);
}

// Persist the config, then have the HAProxy package validate and apply it.
write_config('code-yiu: mailserver HAProxy — 4 production frontends + legacy :2525 test');

$messages = '';
$rc = haproxy_check_and_run($messages, true);
echo 'haproxy_check_and_run rc=' . ($rc ? 'OK' : 'FAIL') . "\n";
echo "messages: $messages\n";
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
<?php
|
||||
// pfSense NAT redirect flip — mail ports 25/465/587/993 from
|
||||
// <mailserver> alias (10.0.20.202 MetalLB LB) to pfSense's own HAProxy
|
||||
// listener (10.0.20.1). bd code-yiu.
|
||||
//
|
||||
// THIS IS THE CUTOVER. After this script:
|
||||
// Internet → pfSense WAN:{25,465,587,993} → rdr → 10.0.20.1:{...}
|
||||
// (pfSense HAProxy) → send-proxy-v2 → k8s-node:{30125..30128} NodePort
|
||||
// → kube-proxy → mailserver pod alt listeners (2525/4465/5587/10993)
|
||||
// → Postfix/Dovecot parse PROXY v2 → real client IP recovered.
|
||||
//
|
||||
// Internal clients (Roundcube, email-roundtrip-monitor CronJob) continue
|
||||
// using the existing mailserver ClusterIP Service on the stock ports
|
||||
// (25/465/587/993) which hit container stock listeners WITHOUT PROXY.
|
||||
// No change to internal traffic paths.
|
||||
//
|
||||
// USAGE
|
||||
// scp infra/scripts/pfsense-nat-mailserver-haproxy-flip.php admin@10.0.20.1:/tmp/
|
||||
// ssh admin@10.0.20.1 'php /tmp/pfsense-nat-mailserver-haproxy-flip.php'
|
||||
//
|
||||
// REVERT — run pfsense-nat-mailserver-haproxy-unflip.php (companion script).
|
||||
//
|
||||
// IDEMPOTENT — re-runs converge. Flips nothing if already pointed at 10.0.20.1.
|
||||
|
||||
require_once('/etc/inc/config.inc');
require_once('/etc/inc/filter.inc');

global $config;
parse_config(true);

// WAN NAT rules whose local-port is one of these and whose target is
// $OLD_TARGET get re-pointed at $NEW_TARGET.
$PORTS_TO_FLIP = ['25', '465', '587', '993'];
$OLD_TARGET = 'mailserver';
$NEW_TARGET = '10.0.20.1';

// Guard: with no NAT rule list there is nothing to flip. Without this the
// by-reference foreach below would run over a missing/non-array value.
if (!isset($config['nat']['rule']) || !is_array($config['nat']['rule'])) {
    echo "No changes. (No NAT rules configured.)\n";
    exit(0);
}

$changed = 0;
foreach ($config['nat']['rule'] as $i => &$r) {
    $iface = $r['interface'] ?? '';
    $lport = $r['local-port'] ?? '';
    $tgt = $r['target'] ?? '';

    if ($iface !== 'wan') continue;
    if (!in_array($lport, $PORTS_TO_FLIP, true)) continue;
    if ($tgt !== $OLD_TARGET) {
        printf("rule %d (dport=%s) target=%s — not flipping (already %s or unexpected)\n",
            $i, $lport, $tgt, $NEW_TARGET);
        continue;
    }

    $r['target'] = $NEW_TARGET;
    // Also unset the 'associated-rule-id' linked filter rule target if any —
    // actually pfSense regenerates the associated rule from NAT rule on apply,
    // so leaving associated-rule-id intact is fine.
    $changed++;
    printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
}
// Break the reference so later use of $r cannot modify the last rule.
unset($r);

if ($changed === 0) {
    echo "No changes. (Already flipped? Run unflip script to revert.)\n";
    exit(0);
}

write_config("code-yiu: NAT rdr — mail ports {$changed} flipped to HAProxy (10.0.20.1)");

// Rebuild pf rules & reload.
$rc = filter_configure();
printf("filter_configure rc=%s\n", var_export($rc, true));
echo "done.\n";
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
<?php
// REVERT of pfsense-nat-mailserver-haproxy-flip.php.
// Moves mail-port NAT rdr target from 10.0.20.1 (pfSense HAProxy) back to
// <mailserver> alias (10.0.20.202 MetalLB LB IP). bd code-yiu rollback.
//
// USE THIS IF: external mail breaks after the flip, any postscreen
// PROXY timeouts show up in logs, or you need to back out before Phase 6.
//
// Re-runs converge: only WAN rules on the listed ports whose target is still
// 10.0.20.1 are rewritten; everything else is reported and left untouched.

require_once('/etc/inc/config.inc');
require_once('/etc/inc/filter.inc');

global $config;
parse_config(true);

$PORTS_TO_REVERT = ['25', '465', '587', '993'];
$OLD_TARGET = '10.0.20.1';
$NEW_TARGET = 'mailserver';

// Iterating a missing or non-array nat/rule node by reference raises a PHP
// warning and auto-creates the key in $config, so stop before touching it.
if (!isset($config['nat']['rule']) || !is_array($config['nat']['rule'])) {
    echo "No NAT rules found in config. Nothing to revert.\n";
    exit(0);
}

$changed = 0;
foreach ($config['nat']['rule'] as $i => &$r) {
    $iface = $r['interface'] ?? '';
    $lport = $r['local-port'] ?? '';
    $tgt   = $r['target'] ?? '';

    // Only WAN port-forwards for the mail ports are candidates.
    if ($iface !== 'wan') continue;
    if (!in_array($lport, $PORTS_TO_REVERT, true)) continue;
    if ($tgt !== $OLD_TARGET) {
        printf("rule %d (dport=%s) target=%s — not reverting (already %s or unexpected)\n",
            $i, $lport, $tgt, $NEW_TARGET);
        continue;
    }

    $r['target'] = $NEW_TARGET;
    $changed++;
    printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
}
// Drop the reference so later use of $r cannot modify the last rule.
unset($r);

if ($changed === 0) {
    echo "No changes. (Already reverted.)\n";
    exit(0);
}

write_config("code-yiu: NAT rdr — mail ports {$changed} reverted to <mailserver> alias");

// Rebuild pf rules & reload.
$rc = filter_configure();
printf("filter_configure rc=%s\n", var_export($rc, true));
echo "done.\n";
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
#!/bin/sh
# postmortem-pipeline.sh — Woodpecker pipeline step for post-mortem TODO automation
# Called from .woodpecker/postmortem-todos.yml
#
# Flow: find a post-mortem with pending TODOs, parse them, log in to Vault with
# the pod's service-account JWT, fetch the claude-agent-service API token,
# submit the job and poll it until it leaves the "running" state (15 min max).
# Exit status: 0 when there is nothing to do or the job completed, 1 otherwise.
set -e

VAULT_URL="http://vault-active.vault.svc.cluster.local:8200"
AGENT_URL="http://claude-agent-service.claude-agent.svc.cluster.local:8080"

# Print an error to stderr and abort the step.
die() {
  echo "ERROR: $*" >&2
  exit 1
}

# 1. Find post-mortem(s) with TODO items
# Scan all post-mortems — don't rely on git diff (Woodpecker shallow clone breaks HEAD~1)
PM_FILE=""
for f in docs/post-mortems/*.md; do
  if grep -q '| TODO |' "$f" 2>/dev/null; then
    PM_FILE="$f"
    break
  fi
done
if [ -z "$PM_FILE" ]; then
  echo "No post-mortem with pending TODOs found"
  exit 0
fi
echo "Post-mortem with TODOs: $PM_FILE"

# 2. Parse TODOs (into a private temp file that is removed on any exit path)
TODOS_FILE=$(mktemp) || die "mktemp failed"
trap 'rm -f "$TODOS_FILE"' EXIT
sh scripts/parse-postmortem-todos.sh "$PM_FILE" > "$TODOS_FILE"
cat "$TODOS_FILE"
TODO_COUNT=$(jq '.safe_todos' "$TODOS_FILE")
# A non-numeric value (e.g. "null") would make the -eq test below error out and
# silently fall through to job submission, so reject it explicitly.
case "$TODO_COUNT" in
  '' | *[!0-9]*) die ".safe_todos is not a number: '$TODO_COUNT'" ;;
esac
echo "$TODO_COUNT safe TODO(s) found"
if [ "$TODO_COUNT" -eq 0 ]; then
  echo "No auto-implementable TODOs — skipping"
  exit 0
fi

# 3. Authenticate to Vault via K8s SA JWT
SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
LOGIN_PAYLOAD=$(jq -n --arg jwt "$SA_TOKEN" '{role: "ci", jwt: $jwt}')
# "|| VAULT_RESP=" keeps set -e from exiting silently on a curl failure, so the
# explicit error below is actually reached.
VAULT_RESP=$(curl -sf -X POST "$VAULT_URL/v1/auth/kubernetes/login" \
  -d "$LOGIN_PAYLOAD") || VAULT_RESP=""
VAULT_TOKEN=$(printf '%s' "$VAULT_RESP" | jq -r .auth.client_token)
if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
  die "Vault authentication failed"
fi
echo "Vault authenticated"

# 4. Fetch API token for claude-agent-service
AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \
  "$VAULT_URL/v1/secret/data/claude-agent-service" |
  jq -r '.data.data.api_bearer_token')
if [ -z "$AGENT_TOKEN" ] || [ "$AGENT_TOKEN" = "null" ]; then
  die "Failed to fetch agent API token"
fi
echo "Agent token fetched"

# 5. Submit to claude-agent-service
TODOS=$(cat "$TODOS_FILE")
PAYLOAD=$(jq -n \
  --arg prompt "Implement the auto-implementable TODOs from $PM_FILE. Parsed TODO list: $TODOS" \
  --arg agent ".claude/agents/postmortem-todo-resolver" \
  '{prompt: $prompt, agent: $agent, max_budget_usd: 5, timeout_seconds: 900}')

RESP=$(curl -sf -X POST \
  -H "Authorization: Bearer $AGENT_TOKEN" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD" \
  "$AGENT_URL/execute") || die "Job submission to claude-agent-service failed"
JOB_ID=$(printf '%s' "$RESP" | jq -r '.job_id')
# Without this check a malformed response would poll /jobs/null for 15 minutes.
if [ -z "$JOB_ID" ] || [ "$JOB_ID" = "null" ]; then
  die "No job_id in response: $RESP"
fi
echo "Job submitted: $JOB_ID"

# 6. Poll for completion (15min max: 60 polls, 15s apart)
i=0
while [ "$i" -lt 60 ]; do
  i=$((i + 1))
  sleep 15
  # A single failed poll must not abort the step; try again on the next tick.
  if ! RESULT=$(curl -sf \
    -H "Authorization: Bearer $AGENT_TOKEN" \
    "$AGENT_URL/jobs/$JOB_ID"); then
    echo "[$i/60] Status poll failed — retrying"
    continue
  fi
  STATUS=$(printf '%s' "$RESULT" | jq -r '.status')
  echo "[$i/60] Status: $STATUS"
  if [ "$STATUS" != "running" ]; then
    printf '%s\n' "$RESULT" | jq .
    if [ "$STATUS" = "completed" ]; then exit 0; else exit 1; fi
  fi
done
die "Job timed out after 15 minutes"
|
||||
|
|
@ -1,109 +0,0 @@
|
|||
#!/usr/bin/env bash
# provision-k8s-worker NAME VMID IP[/CIDR]
#
# Clone PVE template 2000 (ubuntu-2404-cloudinit-k8s-template) into a new
# VM, configure resources to match k8s-node3/4 (32G RAM, 8 vCPU, host CPU,
# 256G disk, VLAN 20 on vmbr1), attach the shared cicustom snippet
# (/var/lib/vz/snippets/k8s_cloud_init.yaml), and start it. Cloud-init
# inside the VM installs containerd + kubelet, applies the bundled
# setup script, and runs the kubeadm join. No manual steps after this.
#
# Hostname comes from the per-node meta-data snippet written below
# (local-hostname) — DO NOT hard-code it in the shared user-data snippet.
#
# IP may be given bare (a /22 prefix is assumed) or with an explicit prefix
# length, e.g. 10.0.20.106/22.
#
# Safe to re-run: aborts if VMID already exists or IP already answers ping.
#
# Usage:
#   ssh root@192.168.1.127 bash -s -- k8s-node6 206 10.0.20.106 < provision-k8s-worker
# or, if the script lives on the PVE host:
#   provision-k8s-worker k8s-node6 206 10.0.20.106
#
# Run on the PVE host (needs qm + /var/lib/vz/snippets access).
set -euo pipefail

if [ $# -ne 3 ]; then
  echo "usage: $0 NAME VMID IP[/CIDR]" >&2
  echo "  e.g. $0 k8s-node6 206 10.0.20.106" >&2
  exit 2
fi

NAME=$1
VMID=$2
# Split the address from an optional prefix length: IP is always the bare
# address (used for the ping check), CIDR_IP is what goes into ipconfig0.
IP=${3%%/*}
if [[ "$3" == */* ]]; then
  CIDR_IP=$3
else
  CIDR_IP="${IP}/22"
fi
GW="10.0.20.1"
DNS="10.0.20.201"
SEARCH="viktorbarzin.lan"
TEMPLATE_ID=2000
STORAGE="local-lvm"
USER_SNIPPET="local:snippets/k8s_cloud_init.yaml"
# Per-node meta-data snippet — written below — supplies local-hostname.
# Proxmox's auto-generated metadata DOESN'T include hostname when
# cicustom user=… is set, so the shared user-data snippet alone leaves
# nodes joining as "ubuntu" (image default). Per-node meta-data is the
# clean fix.
META_SNIPPET_FILE="/var/lib/vz/snippets/${NAME}-meta.yaml"
META_SNIPPET="local:snippets/${NAME}-meta.yaml"
BRIDGE="vmbr1"
VLAN=20

# Sanity: VMID must be free
if qm status "$VMID" >/dev/null 2>&1; then
  echo "ERROR: VM $VMID already exists. Refusing to clobber." >&2
  qm status "$VMID" >&2
  exit 1
fi

# Sanity: IP must not be pingable
if ping -c 1 -W 1 "$IP" >/dev/null 2>&1; then
  echo "ERROR: $IP is already responding to ping. Refusing to assign." >&2
  exit 1
fi

# Sanity: snippet must exist
if [ ! -f "/var/lib/vz/snippets/k8s_cloud_init.yaml" ]; then
  echo "ERROR: /var/lib/vz/snippets/k8s_cloud_init.yaml missing." >&2
  # Single quotes: inside double quotes the backticks would *execute* tg apply.
  echo '       Run `tg apply` in infra/stacks/infra/ to regenerate it.' >&2
  exit 1
fi

# Sanity: template must be a template
if ! qm config "$TEMPLATE_ID" | grep -q '^template: 1'; then
  echo "ERROR: VMID $TEMPLATE_ID is not a template." >&2
  exit 1
fi

echo "[1/6] write per-node meta-data snippet ($META_SNIPPET_FILE)"
# instance-id carries a timestamp, so each provisioning run gets a new id.
cat > "$META_SNIPPET_FILE" <<META
local-hostname: $NAME
instance-id: $NAME-$(date +%s)
META

echo "[2/6] qm clone $TEMPLATE_ID -> $VMID ($NAME)"
qm clone "$TEMPLATE_ID" "$VMID" --name "$NAME" --full true --storage "$STORAGE"

echo "[3/6] qm set $VMID — VM resources + network + cicustom"
qm set "$VMID" \
  --agent 1 \
  --balloon 32768 \
  --cores 8 \
  --cpu host \
  --memory 32768 \
  --net0 "virtio,bridge=$BRIDGE,tag=$VLAN" \
  --ipconfig0 "ip=$CIDR_IP,gw=$GW" \
  --nameserver "$DNS" \
  --searchdomain "$SEARCH" \
  --onboot 1 \
  --startup 'order=5,up=45,down=420' \
  --cicustom "user=$USER_SNIPPET,meta=$META_SNIPPET"

echo "[4/6] qm resize $VMID scsi0 256G"
qm resize "$VMID" scsi0 256G

echo "[5/6] qm start $VMID"
qm start "$VMID"

echo "[6/6] Done. Cloud-init runs now; node should appear in 'kubectl get nodes' within ~6-10 min."
echo "       Tail cloud-init: socat -u UNIX-CONNECT:/var/run/qemu-server/$VMID.serial0 STDOUT | strings"
echo "       Final config:"
qm config "$VMID" | grep -E '^(name|cores|memory|net0|ipconfig0|cicustom|scsi0|onboot):'
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
# /etc/exports — NFS export configuration for Proxmox VE host
|
||||
# Managed in git: infra/scripts/pve-nfs-exports
|
||||
# Deploy: scp scripts/pve-nfs-exports root@192.168.1.127:/etc/exports && ssh root@192.168.1.127 exportfs -ra
|
||||
#
|
||||
# CRITICAL NOTES (learned from 2026-04-14 outage [PM-2026-04-14]):
|
||||
# - NEVER add fsid=0 to /srv/nfs or /srv/nfs-ssd exports. fsid=0 designates the
|
||||
# NFSv4 pseudo-root which changes path resolution for ALL subdirectory mounts.
|
||||
# When CSI mounts use paths like /srv/nfs/technitium, fsid=0 makes them resolve
|
||||
# as the root itself, causing ENOENT on all subdirectory mounts.
|
||||
# - fsid=1 is acceptable on /srv/nfs-ssd (unique ID, not root).
|
||||
# - The NFS CSI driver mounts subdirectories — never use fsid=0 on any export
|
||||
# that serves dynamic path mounts.
|
||||
# - NFSv3 is disabled on this host (vers3=n in /etc/nfs.conf) — all k8s mounts
|
||||
# must use nfsvers=4 mount option.
|
||||
#
|
||||
# Export options explanation:
|
||||
# rw — read/write access (required for PVCs)
|
||||
# async — async writes safe: UPS protects host + Vault Raft replication +
|
||||
# databases on block storage. Only NFS metadata at risk.
|
||||
# no_subtree_check — disable subtree checking for performance and reliability
|
||||
# no_root_squash — k8s CSI driver runs as root; squashing breaks PVC writes
|
||||
# insecure — allow source ports >1024 (required: pfSense VLAN NAT uses
|
||||
# unprivileged ports for VLAN 10 → 192.168.1.x traffic)
|
||||
#
|
||||
/srv/nfs *(rw,async,no_subtree_check,no_root_squash,insecure)
|
||||
/srv/nfs-ssd *(rw,sync,no_subtree_check,no_root_squash,insecure,fsid=1)
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
#!/usr/bin/env bash
# Overwrites kubelet's kubeadm flags file with flags that turn on client and
# server certificate rotation, then reloads systemd and restarts kubelet.
#
# Strict mode: if writing the flags file fails, kubelet is NOT restarted.
set -euo pipefail

echo 'KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=k8s.gcr.io/pause:3.7 --rotate-certificates=true --rotate-server-certificates=true"' | sudo tee /var/lib/kubelet/kubeadm-flags.env

sudo systemctl daemon-reload
sudo systemctl restart kubelet

# Approve all CSRs (`kb` is presumably a kubectl alias — confirm):
# for csr in $(kb get csr | grep Pending | awk '{print $1}'); do echo $csr; kb certificate approve $csr; done
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
#!/usr/bin/env bash
# Cross-compiles powercheck for linux/arm64 and copies the binary together with
# the synology_main.sh wrapper to ~/server-power-cycle/ on the NAS.
#
# Any failing step aborts the script, so the wrapper is never deployed without
# a freshly built binary and the exit status reflects the failure.
set -euo pipefail

readonly binary=/tmp/powercheck-armv8
# '~' is left unexpanded locally on purpose; the remote side resolves it.
readonly remote_dir='Administrator@nas:~/server-power-cycle/'

# Remove the local build artefact on every exit path.
trap 'rm -f -- "$binary"' EXIT

CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o "$binary" .
rsync "$binary" "$remote_dir"
rsync synology_main.sh "$remote_dir"
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
module viktorbarzin/server-lifecycle
|
||||
|
||||
go 1.22.0
|
||||
|
||||
toolchain go1.23.6
|
||||
|
||||
require (
|
||||
github.com/gosnmp/gosnmp v1.39.0
|
||||
github.com/nightlyone/lockfile v1.0.0
|
||||
)
|
||||
|
||||
require github.com/golang/glog v1.2.4 // indirect
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/golang/glog v1.2.4 h1:CNNw5U8lSiiBk7druxtSHHTsRWcxKoac6kZKm2peBBc=
|
||||
github.com/golang/glog v1.2.4/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w=
|
||||
github.com/gosnmp/gosnmp v1.39.0 h1:mPJtSWFLkEemo2bz4fdNztZIFHYG86MC6c6veocq0ZE=
|
||||
github.com/gosnmp/gosnmp v1.39.0/go.mod h1:CxVS6bXqmWZlafUj9pZUnQX5e4fAltqPcijxWpCitDo=
|
||||
github.com/nightlyone/lockfile v1.0.0 h1:RHep2cFKK4PonZJDdEl4GmkabuhbsRMgk/k3uAmxBiA=
|
||||
github.com/nightlyone/lockfile v1.0.0/go.mod h1:rywoIealpdNse2r832aiD9jRk8ErCatROs6LzC841CI=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
type PowerStateResponse struct {
|
||||
PowerState string `json:"PowerState"`
|
||||
}
|
||||
type ResetType string
|
||||
|
||||
const (
|
||||
On ResetType = "On"
|
||||
GracefulShutdown ResetType = "GracefulShutdown"
|
||||
)
|
||||
|
||||
func checkPowerState(idractCredentials idracCredentials) (string, error) {
|
||||
// Construct the full URL for the Redfish Systems endpoint
|
||||
redfishURL := fmt.Sprintf("%s/redfish/v1/Systems/System.Embedded.1", idractCredentials.url)
|
||||
|
||||
// Create an HTTP client
|
||||
client := &http.Client{
|
||||
Transport: &http.Transport{
|
||||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||||
},
|
||||
}
|
||||
|
||||
// Create a new GET request
|
||||
req, err := http.NewRequest("GET", redfishURL, nil)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to create request: %v", err)
|
||||
}
|
||||
|
||||
// Set basic authentication
|
||||
req.SetBasicAuth(idractCredentials.username, idractCredentials.password)
|
||||
|
||||
// Set the Accept header to request JSON
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
// Send the request
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to send request: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Check the HTTP status code
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return "", fmt.Errorf("unexpected status code: %d, response: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
// Read the response body
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read response body: %v", err)
|
||||
}
|
||||
|
||||
// return string(body), nil
|
||||
// Parse the JSON response
|
||||
var powerStateResponse PowerStateResponse
|
||||
err = json.Unmarshal(body, &powerStateResponse)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse JSON response: %v", err)
|
||||
}
|
||||
|
||||
// Return the power state
|
||||
return powerStateResponse.PowerState, nil
|
||||
}
|
||||
|
||||
func performGracefulShutdown(idracCredentials idracCredentials) error {
|
||||
return performResetType(idracCredentials, GracefulShutdown)
|
||||
}
|
||||
|
||||
func performPowerOn(idracCredentials idracCredentials) error {
|
||||
return performResetType(idracCredentials, On)
|
||||
}
|
||||
|
||||
func performResetType(idracCredentials idracCredentials, resetType ResetType) error {
|
||||
glog.Warningf("Starting graceful reset type %s!\n", resetType)
|
||||
// Define the payload for the shutdown request
|
||||
payload := map[string]string{
|
||||
"ResetType": string(resetType), // Only ResetType is needed
|
||||
}
|
||||
payloadBytes, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal payload: %v", err)
|
||||
}
|
||||
|
||||
// Create a new HTTP request
|
||||
req, err := http.NewRequest("POST", idracCredentials.url, bytes.NewBuffer(payloadBytes))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create request: %v", err)
|
||||
}
|
||||
|
||||
// Set headers
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.SetBasicAuth(idracCredentials.username, idracCredentials.password)
|
||||
|
||||
// Send the request
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to send request: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Check the response status code
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
|
||||
body, _ := ioutil.ReadAll(resp.Body)
|
||||
return fmt.Errorf("unexpected status code: %d, response: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
glog.Infof("Reset type %s initiated successfully.\n")
|
||||
return nil
|
||||
|
||||
}
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/nightlyone/lockfile"
|
||||
)
|
||||
|
||||
const upsMinutesRemainingThreshold = 20
|
||||
|
||||
type idracCredentials = struct {
|
||||
url string
|
||||
username string
|
||||
password string
|
||||
}
|
||||
|
||||
func main() {
|
||||
idracUsername := flag.String("idracUsername", "root", "iDRAC username")
|
||||
idracPassword := flag.String("idracPassword", "calvin", "iDRAC password")
|
||||
idracHost := flag.String("idracHost", "192.168.1.4", "iDRAC host")
|
||||
flag.Parse()
|
||||
defer glog.Flush()
|
||||
// lock, err := tryGetLock()
|
||||
// if err != nil {
|
||||
// glog.Fatalf("Failed to acquire lock: %v", err)
|
||||
// }
|
||||
// defer lock.Unlock()
|
||||
|
||||
glog.Info("Checking server power state")
|
||||
idracCredentials := idracCredentials{
|
||||
url: "https://" + *idracHost,
|
||||
username: *idracUsername,
|
||||
password: *idracPassword,
|
||||
}
|
||||
powerState, err := checkPowerState(idracCredentials)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to check power state: %v", err)
|
||||
}
|
||||
glog.Infof("Server power state: %s", powerState)
|
||||
|
||||
glog.Info("Checking UPS state")
|
||||
snmp := getSNMPClient()
|
||||
// Connect to the SNMP agent
|
||||
err = snmp.Connect()
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to connect to UPS SNMP agent: %v", err)
|
||||
}
|
||||
defer snmp.Conn.Close()
|
||||
|
||||
upsState, err := getPowerState(snmp)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to get UPS power state: %v", err)
|
||||
}
|
||||
|
||||
if powerState == "On" {
|
||||
handleWhenServerOn(upsState, idracCredentials)
|
||||
} else if powerState == "Off" {
|
||||
handleWhenServerOff(upsState, idracCredentials)
|
||||
} else {
|
||||
glog.Fatalf("Unknown server state %s", powerState)
|
||||
}
|
||||
}
|
||||
func handleWhenServerOn(upsState UPSPowerState, idracCredentials idracCredentials) {
|
||||
if upsState.inputVoltage > 0 {
|
||||
glog.Infof("UPS is on AC power: %d. Nothing to do.\n", upsState.inputVoltage)
|
||||
return
|
||||
} else {
|
||||
glog.Warningln("UPS is on Battery power")
|
||||
if upsState.minutesRemaining < upsMinutesRemainingThreshold {
|
||||
glog.Warningf("Minutes remaining is too low - %d Turning off server.", upsState.minutesRemaining)
|
||||
// Perform a graceful shutdown of the server
|
||||
performGracefulShutdown(idracCredentials)
|
||||
} else {
|
||||
glog.Warningf("Minutes remaining is %d. Server will not be shutdown yet.", upsState.minutesRemaining)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func handleWhenServerOff(upsState UPSPowerState, idracCredentials idracCredentials) {
|
||||
if upsState.inputVoltage > 0 {
|
||||
glog.Infof("UPS is on AC power: %d\n", upsState.inputVoltage)
|
||||
if upsState.minutesRemaining < upsMinutesRemainingThreshold {
|
||||
glog.Infof("UPS battery is still too low - %d minutes remaining. Not turning on server yet.\n", upsState.minutesRemaining)
|
||||
} else {
|
||||
glog.Infof("UPS is on AC power and battery has charged - %d minutes remaining. Turning on server...\n", upsState.minutesRemaining)
|
||||
// Perform startup of the server
|
||||
performPowerOn(idracCredentials)
|
||||
}
|
||||
} else {
|
||||
glog.Warningln("UPS is still on battery power")
|
||||
return
|
||||
}
|
||||
}
|
||||
func tryGetLock() (*lockfile.Lockfile, error) {
|
||||
lock, err := lockfile.New("/tmp/server_safe_poweroff.pid")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create lock file: %v", err)
|
||||
}
|
||||
err = lock.TryLock()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &lock, nil
|
||||
}
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
#!/usr/bin/env bash

# Runs the main program on the Synology NAS and forwards its log messages to
# Synology's log system.

# Everything below uses relative paths (including the log clean-up that deletes
# files), so stop right away if the working directory cannot be entered.
cd /var/services/homes/Administrator/server-power-cycle || exit 1
echo "Starting powercheck"

# Keep going after a failure so the log lines of the failed run are still
# forwarded; only the status message differs.
status=0
./powercheck-armv8 -log_dir=./logs || status=$?

if [[ $status -eq 0 ]]; then
  echo "script completed successfully, logging to synlogy's logs"
else
  echo "powercheck exited with status $status, logging to synlogy's logs"
fi

while IFS= read -r line; do
  # Keep only lines that start with a glog header (severity letter, MMDD,
  # HH:MM:SS.uuuuuu) and blank out the first four fields, leaving the message.
  # Quoting "$line" keeps '*' and '?' in log text from being glob-expanded.
  msg=$(printf '%s\n' "$line" \
    | grep -E '^[IWEF][0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}' \
    | awk '{$1=$2=$3=$4=""; print $0}' \
    | sed 's/^ *//')
  echo "$msg"
  if [[ -n $msg ]]; then
    synologset1 sys info 0x11800000 "$msg"
  fi
done < "./logs/powercheck-armv8.INFO"

# Cleanup logs
find ./logs -type f -mtime +7 -exec rm {} \;
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/gosnmp/gosnmp"
|
||||
)
|
||||
|
||||
type UPSPowerState = struct {
|
||||
inputVoltage int
|
||||
minutesRemaining uint
|
||||
}
|
||||
|
||||
func getSNMPClient() *gosnmp.GoSNMP {
|
||||
|
||||
// Define SNMP connection parameters
|
||||
target := "192.168.1.5"
|
||||
community := "Public0"
|
||||
|
||||
// Create a new SNMP client
|
||||
snmp := &gosnmp.GoSNMP{
|
||||
Target: target,
|
||||
Port: 161, // Default SNMP port
|
||||
Community: community,
|
||||
Version: gosnmp.Version2c, // Use SNMP v2c
|
||||
Timeout: time.Duration(5) * time.Second,
|
||||
}
|
||||
return snmp
|
||||
}
|
||||
func getPowerState(snmp *gosnmp.GoSNMP) (UPSPowerState, error) {
|
||||
oids := []string{
|
||||
// "1.3.6.1.2.1.33.1.2.2.0", // seconds on battery
|
||||
"1.3.6.1.2.1.33.1.3.3.1.3.1", // input voltage
|
||||
"1.3.6.1.2.1.33.1.2.3.0", // minutes remaining
|
||||
}
|
||||
// Perform an SNMP GET request to retrieve the values for the specified OIDs
|
||||
result, err := snmp.Get(oids)
|
||||
if err != nil {
|
||||
glog.Fatalf("Failed to perform SNMP GET request: %v", err)
|
||||
}
|
||||
|
||||
inputVoltage := (result.Variables[0].Value).(int)
|
||||
minutesRemaining := result.Variables[1].Value.(uint)
|
||||
return UPSPowerState{inputVoltage, minutesRemaining}, nil
|
||||
}
|
||||
|
|
@ -1,115 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
############################################
|
||||
# CONFIGURATION
|
||||
############################################
|
||||
|
||||
# Internal pull-through registry endpoint
|
||||
# Examples:
|
||||
# http://registry.internal:5000
|
||||
# https://registry.internal
|
||||
INTERNAL_REGISTRY="http://10.0.20.10:5002"
|
||||
|
||||
# Path where containerd reads registry configs
|
||||
CERTS_DIR="/etc/containerd/certs.d"
|
||||
|
||||
# Optional: path to CA file if INTERNAL_REGISTRY uses HTTPS with custom CA
|
||||
# Leave empty if not needed
|
||||
INTERNAL_CA_PATH=""
|
||||
|
||||
# Restart containerd at the end
|
||||
RESTART_CONTAINERD=true
|
||||
|
||||
############################################
|
||||
# REGISTRIES TO MIRROR
|
||||
############################################
|
||||
|
||||
REGISTRIES=(
|
||||
"docker.io"
|
||||
"registry-1.docker.io"
|
||||
"registry.k8s.io"
|
||||
"quay.io"
|
||||
"ghcr.io"
|
||||
"gcr.io"
|
||||
"us-docker.pkg.dev"
|
||||
"public.ecr.aws"
|
||||
"mcr.microsoft.com"
|
||||
)
|
||||
|
||||
############################################
|
||||
# FUNCTIONS
|
||||
############################################
|
||||
|
||||
#######################################
# Abort the script unless it runs as root.
# Outputs: error message on stderr when not root
# Returns: 0 when the effective uid is 0; otherwise exits with status 1
#######################################
require_root() {
  local uid
  uid="$(id -u)"

  if [[ "$uid" -eq 0 ]]; then
    return 0
  fi

  echo "ERROR: must be run as root" >&2
  exit 1
}
|
||||
|
||||
#######################################
# Make sure /etc/containerd/config.toml exists and points the CRI registry
# config_path at /etc/containerd/certs.d.
# A missing config file is generated with `containerd config default`. If the
# config_path setting is absent it is inserted after an existing CRI registry
# section header, or a new section is appended when there is none.
# Outputs: progress messages on stdout
#######################################
ensure_containerd_config_path() {
  local config_file="/etc/containerd/config.toml"

  if [[ ! -f "$config_file" ]]; then
    echo "Generating default containerd config"
    containerd config default > "$config_file"
  fi

  # Nothing to do when the setting is already there.
  if grep -q 'config_path *= *"/etc/containerd/certs.d"' "$config_file"; then
    return 0
  fi

  echo "Enabling config_path in containerd config"

  # No registry section yet: append a complete one.
  if ! grep -q '\[plugins\."io.containerd.grpc.v1.cri".registry\]' "$config_file"; then
    cat >> "$config_file" <<'EOF'

[plugins."io.containerd.grpc.v1.cri".registry]
  config_path = "/etc/containerd/certs.d"
EOF
    return 0
  fi

  # Section exists: add the setting on the line after its header.
  sed -i '/\[plugins\."io.containerd.grpc.v1.cri".registry\]/a \ config_path = "/etc/containerd/certs.d"' "$config_file"
}
|
||||
|
||||
#######################################
# Write the containerd hosts.toml that sends pulls for one upstream registry
# through the internal mirror.
# Globals:   CERTS_DIR (read), INTERNAL_REGISTRY (read), INTERNAL_CA_PATH (read)
# Arguments: $1 - upstream registry host name, e.g. quay.io
# Outputs:   creates/overwrites $CERTS_DIR/<registry>/hosts.toml
#######################################
write_hosts_toml() {
  local registry="$1"
  local target_dir="${CERTS_DIR}/${registry}"
  local hosts_file="${target_dir}/hosts.toml"

  mkdir -p "$target_dir"

  {
    printf 'server = "https://%s"\n\n' "$registry"
    printf '[host."%s"]\n' "$INTERNAL_REGISTRY"
    printf '  capabilities = ["pull", "resolve"]\n'
    # The ca entry is only written when a CA path is configured.
    if [[ -n "$INTERNAL_CA_PATH" ]]; then
      printf '  ca = "%s"\n' "$INTERNAL_CA_PATH"
    fi
  } > "$hosts_file"
}
|
||||
|
||||
############################################
|
||||
# MAIN
|
||||
############################################
|
||||
|
||||
require_root
|
||||
ensure_containerd_config_path
|
||||
|
||||
echo "Creating registry mirror configurations..."
|
||||
|
||||
for r in "${REGISTRIES[@]}"; do
|
||||
echo " - $r"
|
||||
write_hosts_toml "$r"
|
||||
done
|
||||
|
||||
if [[ "$RESTART_CONTAINERD" == "true" ]]; then
|
||||
echo "Restarting containerd"
|
||||
systemctl restart containerd
|
||||
fi
|
||||
|
||||
echo "Done."
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# One-shot deployment of the forgejo.viktorbarzin.me containerd hosts.toml
|
||||
# entry across every k8s node. Cloud-init only fires on VM provision, so
|
||||
# existing nodes need this manual rollout.
|
||||
#
|
||||
# What it does, per node:
|
||||
# 1. drain (ignore-daemonsets, delete-emptydir-data)
|
||||
# 2. ssh in: mkdir + write /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml
|
||||
# 3. systemctl restart containerd
|
||||
# 4. uncordon
|
||||
#
|
||||
# hosts.toml is documented as hot-reloaded but the post-2026-04-19
|
||||
# containerd corruption playbook calls for an explicit restart so the
|
||||
# config is unambiguously in effect. Running drain/uncordon around it
|
||||
# avoids pulling against an in-flight containerd restart.
|
||||
#
|
||||
# Re-run is safe: writes are idempotent.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
CERTS_DIR=/etc/containerd/certs.d/forgejo.viktorbarzin.me
|
||||
HOSTS_TOML='server = "https://forgejo.viktorbarzin.me"
|
||||
|
||||
[host."https://10.0.20.203"]
|
||||
capabilities = ["pull", "resolve"]
|
||||
skip_verify = true
|
||||
'
|
||||
|
||||
NODES=$(kubectl get nodes -o name | sed 's|^node/||')
if [[ -z "$NODES" ]]; then
  echo "ERROR: no nodes returned from kubectl get nodes" >&2
  exit 1
fi

# Roll through the nodes one at a time: drain, write hosts.toml and restart
# containerd over ssh, uncordon, then wait for Ready. A node is never left
# cordoned by a failed step, and the rollout stops at the first node that does
# not come back Ready instead of moving on to drain the next one.
for n in $NODES; do
  echo "=== $n ==="
  if ! kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data --force --grace-period=60; then
    echo "ERROR: drain of $n failed; uncordoning and aborting" >&2
    kubectl uncordon "$n" || true
    exit 1
  fi

  # $CERTS_DIR and $HOSTS_TOML are expanded locally before the script is sent;
  # the inner quoted heredoc only protects the content on the remote side.
  if ! ssh -o StrictHostKeyChecking=accept-new "wizard@$n" sudo bash <<EOF
set -euo pipefail
mkdir -p "$CERTS_DIR"
cat > "$CERTS_DIR/hosts.toml" <<'TOML'
$HOSTS_TOML
TOML
systemctl restart containerd
EOF
  then
    echo "ERROR: remote update on $n failed; uncordoning and aborting" >&2
    kubectl uncordon "$n" || true
    exit 1
  fi

  kubectl uncordon "$n"

  # Wait for the node to report Ready before moving to the next one
  # (30 checks, 2s apart).
  ready=false
  for i in {1..30}; do
    if kubectl get node "$n" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' | grep -q True; then
      echo "  node Ready"
      ready=true
      break
    fi
    sleep 2
  done
  if [[ "$ready" != true ]]; then
    echo "ERROR: $n did not report Ready after 30 checks; aborting rollout" >&2
    exit 1
  fi
done

echo "All nodes updated."
|
||||
|
|
@ -1,231 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Setup script for the Forgejo task ingestion pipeline.
|
||||
# Creates Authentik OAuth2 provider/application, configures Forgejo OAuth2 auth source,
|
||||
# creates "tasks" repo, and sets up webhook to n8n.
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Authentik admin API token
|
||||
# - Forgejo admin API token (create at https://forgejo.viktorbarzin.me/user/settings/applications)
|
||||
#
|
||||
# Usage:
|
||||
# AUTHENTIK_TOKEN="..." FORGEJO_TOKEN="..." bash scripts/setup-task-pipeline.sh
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
AUTHENTIK_URL="${AUTHENTIK_URL:-https://authentik.viktorbarzin.me}"
|
||||
FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
|
||||
N8N_WEBHOOK_URL="${N8N_WEBHOOK_URL:-https://n8n.viktorbarzin.me/webhook/forgejo-tasks}"
|
||||
FORGEJO_ADMIN_USER="${FORGEJO_ADMIN_USER:-viktor}"
|
||||
|
||||
: "${AUTHENTIK_TOKEN:?Set AUTHENTIK_TOKEN (Authentik admin API token)}"
|
||||
: "${FORGEJO_TOKEN:?Set FORGEJO_TOKEN (Forgejo admin API token)}"
|
||||
|
||||
# Call the Authentik API with curl: silent, fail on HTTP errors, bearer auth
# from $AUTHENTIK_TOKEN and a JSON content type. All arguments (URL, -d, -X,
# ...) are passed through to curl unchanged.
ak_api() {
  local -a base_args=(
    -sf
    -H "Authorization: Bearer $AUTHENTIK_TOKEN"
    -H "Content-Type: application/json"
  )
  curl "${base_args[@]}" "$@"
}
|
||||
# Call the Forgejo API with curl: silent, fail on HTTP errors, token auth from
# $FORGEJO_TOKEN and a JSON content type. All arguments are passed through to
# curl unchanged.
fg_api() {
  local -a base_args=(
    -sf
    -H "Authorization: token $FORGEJO_TOKEN"
    -H "Content-Type: application/json"
  )
  curl "${base_args[@]}" "$@"
}
|
||||
|
||||
echo "=== Step 1: Create Authentik group 'Task Submitters' ==="
|
||||
GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/" -d '{
|
||||
"name": "Task Submitters",
|
||||
"is_superuser": false,
|
||||
"parent": null
|
||||
}' 2>/dev/null) || {
|
||||
echo " Group may already exist, checking..."
|
||||
GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/?name=Task+Submitters" | python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
|
||||
if [ -z "$GROUP_RESP" ]; then echo "ERROR: Failed to create or find group"; exit 1; fi
|
||||
}
|
||||
GROUP_PK=$(echo "$GROUP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
|
||||
echo " Group PK: $GROUP_PK"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 2: Create Authentik OAuth2 Provider for Forgejo ==="
|
||||
# Find the explicit consent authorization flow
|
||||
AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization&search=explicit" | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
|
||||
if [ -z "$AUTH_FLOW" ]; then
|
||||
echo " WARNING: Could not find explicit consent flow, using implicit"
|
||||
AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization" | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
|
||||
fi
|
||||
echo " Authorization flow: $AUTH_FLOW"
|
||||
|
||||
PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/" -d "{
|
||||
\"name\": \"Forgejo\",
|
||||
\"authorization_flow\": \"$AUTH_FLOW\",
|
||||
\"client_type\": \"confidential\",
|
||||
\"redirect_uris\": \"$FORGEJO_URL/user/oauth2/Authentik/callback\",
|
||||
\"property_mappings\": [],
|
||||
\"sub_mode\": \"hashed_user_id\",
|
||||
\"include_claims_in_id_token\": true,
|
||||
\"access_code_validity\": \"minutes=1\",
|
||||
\"access_token_validity\": \"minutes=5\",
|
||||
\"refresh_token_validity\": \"days=30\"
|
||||
}" 2>/dev/null) || {
|
||||
echo " Provider may already exist, checking..."
|
||||
PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/?name=Forgejo" | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
|
||||
if [ -z "$PROVIDER_RESP" ]; then echo "ERROR: Failed to create or find provider"; exit 1; fi
|
||||
}
|
||||
PROVIDER_PK=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
|
||||
CLIENT_ID=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['client_id'])")
|
||||
CLIENT_SECRET=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('client_secret','<already-created>'))")
|
||||
echo " Provider PK: $PROVIDER_PK"
|
||||
echo " Client ID: $CLIENT_ID"
|
||||
echo " Client Secret: $CLIENT_SECRET"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 3: Create Authentik Application for Forgejo ==="
|
||||
APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/" -d "{
|
||||
\"name\": \"Forgejo\",
|
||||
\"slug\": \"forgejo\",
|
||||
\"provider\": $PROVIDER_PK,
|
||||
\"meta_launch_url\": \"$FORGEJO_URL\",
|
||||
\"policy_engine_mode\": \"any\"
|
||||
}" 2>/dev/null) || {
|
||||
echo " Application may already exist, checking..."
|
||||
APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/?slug=forgejo" | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
|
||||
}
|
||||
APP_SLUG=$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['slug'])")
|
||||
echo " Application slug: $APP_SLUG"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 4: Bind 'Task Submitters' group to Forgejo application ==="
# Create a policy binding that restricts access to the Task Submitters group.
# Extract the application pk up front so a malformed APP_RESP aborts the script
# (set -e) instead of silently posting an empty target.
APP_PK=$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
# Report what actually happened: the success message is printed only when the
# API call succeeded; a failure is still treated as "probably exists already".
if ak_api "$AUTHENTIK_URL/api/v3/policies/bindings/" -d "{
\"target\": \"$APP_PK\",
\"group\": \"$GROUP_PK\",
\"enabled\": true,
\"order\": 0,
\"negate\": false,
\"timeout\": 30
}" > /dev/null 2>&1; then
  echo " Group binding created"
else
  echo " Binding may already exist (OK)"
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Step 5: Add users to 'Task Submitters' group ==="
|
||||
echo " Adding Viktor Barzin..."
|
||||
VIKTOR_PK=$(ak_api "$AUTHENTIK_URL/api/v3/core/users/?search=vbarzin" | \
|
||||
python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
|
||||
if [ -n "$VIKTOR_PK" ]; then
|
||||
ak_api "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/" -X PATCH -d "{}" > /dev/null 2>&1 || true
|
||||
ak_api -X POST "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/add_user/" -d "{\"pk\": $VIKTOR_PK}" > /dev/null 2>&1 || true
|
||||
echo " Added Viktor (PK: $VIKTOR_PK)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Step 6: Configure Forgejo OAuth2 authentication source ==="
|
||||
fg_api "$FORGEJO_URL/api/v1/admin/identity-sources" -d "{
|
||||
\"authentication_source\": {
|
||||
\"name\": \"Authentik\",
|
||||
\"type\": \"oauth2\",
|
||||
\"is_active\": true,
|
||||
\"is_sync_enabled\": false,
|
||||
\"oauth2\": {
|
||||
\"provider\": \"openidConnect\",
|
||||
\"client_id\": \"$CLIENT_ID\",
|
||||
\"client_secret\": \"$CLIENT_SECRET\",
|
||||
\"open_id_connect_auto_discovery_url\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
|
||||
\"scopes\": [\"openid\", \"profile\", \"email\"],
|
||||
\"required_claim_name\": \"\",
|
||||
\"required_claim_value\": \"\",
|
||||
\"group_claim_name\": \"\",
|
||||
\"admin_group\": \"\",
|
||||
\"restricted_group\": \"\",
|
||||
\"icon_url\": \"\",
|
||||
\"skip_local_2fa\": true,
|
||||
\"attribute_ssn\": \"\"
|
||||
}
|
||||
}
|
||||
}" > /dev/null 2>&1 && echo " OAuth2 source created" || {
|
||||
echo " Forgejo identity-sources API may not be available."
|
||||
echo " Falling back to legacy authentication-source API..."
|
||||
fg_api "$FORGEJO_URL/api/v1/admin/auths" -d "{
|
||||
\"name\": \"Authentik\",
|
||||
\"type\": 6,
|
||||
\"is_active\": true,
|
||||
\"is_sync_enabled\": false,
|
||||
\"cfg\": {
|
||||
\"Provider\": \"openidConnect\",
|
||||
\"ClientID\": \"$CLIENT_ID\",
|
||||
\"ClientSecret\": \"$CLIENT_SECRET\",
|
||||
\"OpenIDConnectAutoDiscoveryURL\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
|
||||
\"Scopes\": [\"openid\", \"profile\", \"email\"],
|
||||
\"SkipLocalTwoFA\": true
|
||||
}
|
||||
}" > /dev/null 2>&1 && echo " OAuth2 source created (legacy API)" || {
|
||||
echo " ERROR: Could not create OAuth2 source via API."
|
||||
echo " Please create it manually in Forgejo admin panel:"
|
||||
echo " 1. Go to $FORGEJO_URL/-/admin/auths/new"
|
||||
echo " 2. Auth Type: OAuth2"
|
||||
echo " 3. Name: Authentik"
|
||||
echo " 4. OAuth2 Provider: OpenID Connect"
|
||||
echo " 5. Client ID: $CLIENT_ID"
|
||||
echo " 6. Client Secret: $CLIENT_SECRET"
|
||||
echo " 7. Discovery URL: $AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration"
|
||||
echo " 8. Scopes: openid profile email"
|
||||
}
|
||||
}
|
||||
|
||||
echo ""
|
||||
echo "=== Step 7: Create 'tasks' repository in Forgejo ==="
|
||||
REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/user/repos" -d '{
|
||||
"name": "tasks",
|
||||
"description": "Task queue for OpenClaw AI agent. Create an issue to submit a task.",
|
||||
"private": false,
|
||||
"auto_init": true,
|
||||
"default_branch": "main"
|
||||
}' 2>/dev/null) && echo " Repository created" || {
|
||||
echo " Repository may already exist (OK)"
|
||||
REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks")
|
||||
}
|
||||
echo " Repo: $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 8: Disable non-issue features on tasks repo ==="
|
||||
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks" -X PATCH -d '{
|
||||
"has_pull_requests": false,
|
||||
"has_wiki": false,
|
||||
"has_projects": false,
|
||||
"has_releases": false,
|
||||
"has_packages": false,
|
||||
"has_actions": false
|
||||
}' > /dev/null 2>&1 && echo " Disabled PRs, wiki, projects, releases, packages, actions" || echo " Some features may not be disableable (OK)"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 9: Create issue labels ==="
|
||||
for label_data in \
|
||||
'{"name":"pending","color":"#0075ca","description":"Task waiting to be processed"}' \
|
||||
'{"name":"processing","color":"#e4e669","description":"Task currently being processed by OpenClaw"}' \
|
||||
'{"name":"completed","color":"#0e8a16","description":"Task completed successfully"}' \
|
||||
'{"name":"failed","color":"#d73a4a","description":"Task failed during processing"}'; do
|
||||
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/labels" -d "$label_data" > /dev/null 2>&1 || true
|
||||
done
|
||||
echo " Labels created: pending, processing, completed, failed"
|
||||
|
||||
echo ""
|
||||
echo "=== Step 10: Create webhook on tasks repo → n8n ==="
|
||||
fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/hooks" -d "{
|
||||
\"type\": \"gitea\",
|
||||
\"config\": {
|
||||
\"url\": \"$N8N_WEBHOOK_URL\",
|
||||
\"content_type\": \"json\",
|
||||
\"secret\": \"\"
|
||||
},
|
||||
\"events\": [\"issues\"],
|
||||
\"active\": true
|
||||
}" > /dev/null 2>&1 && echo " Webhook created → $N8N_WEBHOOK_URL" || echo " Webhook may already exist (OK)"
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo "Setup complete!"
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
echo " 1. Add SOPS secrets:"
|
||||
echo " forgejo_authentik_client_id = \"$CLIENT_ID\""
|
||||
echo " forgejo_authentik_client_secret = \"$CLIENT_SECRET\""
|
||||
echo " 2. Run: scripts/tg apply -target=module.forgejo"
|
||||
echo " 3. Create n8n workflow (webhook trigger → OpenClaw exec → Forgejo comment)"
|
||||
echo " 4. Add more users to 'Task Submitters' group in Authentik"
|
||||
echo " 5. Test: Create an issue at $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks/issues/new"
|
||||
echo "=========================================="
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
#!/bin/bash
|
||||
# setup_containerd_mirrors.sh
|
||||
# Replaces deprecated wildcard registry mirror with per-registry hosts.toml config.
|
||||
# Run on each K8s WORKER node: ssh wizard@<node-ip> 'sudo bash -s' < scripts/setup_containerd_mirrors.sh
|
||||
# NOTE: Do NOT run on k8s-master (containerd 1.6.x has conflicts with config_path + mirrors coexisting)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
TIMESTAMP=$(date +%s)
|
||||
CONFIG="/etc/containerd/config.toml"
|
||||
CERTS_DIR="/etc/containerd/certs.d"
|
||||
|
||||
echo "=== Backing up containerd config ==="
|
||||
cp "$CONFIG" "${CONFIG}.bak.${TIMESTAMP}"
|
||||
|
||||
echo "=== Removing deprecated mirror entries ==="
|
||||
# Remove wildcard mirror and its endpoint
|
||||
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."\*"\]/d' "$CONFIG"
|
||||
sed -i '/endpoint = \["http:\/\/10\.0\.20\.10:5000"\]/d' "$CONFIG"
|
||||
# Remove any other per-registry mirror sections (e.g. docker.io) to avoid config_path conflict
|
||||
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker\.io"\]/d' "$CONFIG"
|
||||
sed -i '/endpoint = \["https:\/\/registry-1\.docker\.io"\]/d' "$CONFIG"
|
||||
# Remove the mirrors parent section header if it's now empty
|
||||
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\]$/d' "$CONFIG"
|
||||
|
||||
echo "=== Setting config_path ==="
|
||||
# Replace empty config_path with certs.d path
|
||||
if grep -q 'config_path = ""' "$CONFIG"; then
|
||||
sed -i 's|config_path = ""|config_path = "/etc/containerd/certs.d"|' "$CONFIG"
|
||||
elif grep -q 'config_path = "/etc/containerd/certs.d"' "$CONFIG"; then
|
||||
echo "config_path already set, skipping"
|
||||
else
|
||||
# If config_path line doesn't exist at all, add it under [plugins."io.containerd.grpc.v1.cri".registry]
|
||||
sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\]/a\ config_path = "/etc/containerd/certs.d"' "$CONFIG"
|
||||
fi
|
||||
|
||||
echo "=== Creating hosts.toml files ==="
|
||||
|
||||
# docker.io (Docker Hub) — proxy first, upstream fallback
|
||||
mkdir -p "$CERTS_DIR/docker.io"
|
||||
printf 'server = "https://registry-1.docker.io"\n\n[host."http://10.0.20.10:5000"]\n capabilities = ["pull", "resolve"]\n\n[host."https://registry-1.docker.io"]\n capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/docker.io/hosts.toml"
|
||||
|
||||
# ghcr.io — proxy first, upstream fallback
|
||||
mkdir -p "$CERTS_DIR/ghcr.io"
|
||||
printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n capabilities = ["pull", "resolve"]\n\n[host."https://ghcr.io"]\n capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/ghcr.io/hosts.toml"
|
||||
|
||||
# Low-traffic registries (quay.io, registry.k8s.io, reg.kyverno.io) pull directly — no proxy.
|
||||
# Remove stale hosts.toml from previous config if present.
|
||||
rm -f "$CERTS_DIR/quay.io/hosts.toml" "$CERTS_DIR/registry.k8s.io/hosts.toml" "$CERTS_DIR/reg.kyverno.io/hosts.toml"
|
||||
rmdir "$CERTS_DIR/quay.io" "$CERTS_DIR/registry.k8s.io" "$CERTS_DIR/reg.kyverno.io" 2>/dev/null || true
|
||||
|
||||
# No containerd restart needed — hosts.toml is re-read on each pull
|
||||
|
||||
echo "=== Done ==="
|
||||
|
|
@ -1,129 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
STATE_DIR="$REPO_ROOT/state/stacks"
|
||||
VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
|
||||
|
||||
cmd="${1:-help}"
|
||||
stack="${2:-}" # optional: operate on single stack
|
||||
|
||||
# Report whether the current Vault token is valid.
# Runs `vault token lookup` with VAULT_ADDR passed in its environment and
# discards all output; the exit status of the lookup is the result.
vault_available() {
  VAULT_ADDR="$VAULT_ADDR" vault token lookup >/dev/null 2>&1
}
|
||||
|
||||
# Print the Vault Transit key URI for one stack.
# Arguments: $1 - stack name
# Outputs:   ${VAULT_ADDR}/v1/transit/keys/sops-state-<stack> on stdout
transit_uri() {
  local key_name="sops-state-$1"
  printf '%s\n' "${VAULT_ADDR}/v1/transit/keys/${key_name}"
}
|
||||
|
||||
# Print the stack name for a state directory, i.e. the last path component.
# Arguments: $1 - directory path (a trailing slash is fine)
stack_name_from_dir() {
  local dir_path="$1"
  basename "$dir_path"
}
|
||||
|
||||
# Tier 0 stacks keep SOPS-encrypted local state; Tier 1 uses PG backend
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# Succeed if $1 is exactly one of the names in TIER0_STACKS.
# The comparison is a literal string match: the name is never interpreted as a
# regular expression, so inputs like "infr." or "v.*" do not match.
# Arguments: $1 - stack name (a missing or empty name is "not Tier 0")
# Returns:   0 if Tier 0, 1 otherwise
is_tier0() {
  local candidate="${1:-}"
  local -a tiers
  local tier
  read -r -a tiers <<<"$TIER0_STACKS"
  for tier in "${tiers[@]}"; do
    if [[ "$tier" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# Read age recipients from .sops.yaml
|
||||
AGE_RECIPIENTS="$(python3 -c "
|
||||
import yaml, sys
|
||||
with open('$REPO_ROOT/.sops.yaml') as f: c = yaml.safe_load(f)
|
||||
for r in c.get('creation_rules', []):
|
||||
age = r.get('age', '')
|
||||
if age:
|
||||
print(age.replace('\n', '').strip())
|
||||
break
|
||||
" 2>/dev/null || echo "")"
|
||||
|
||||
# Encrypt <dir>/terraform.tfstate to <dir>/terraform.tfstate.enc with sops,
# using the per-stack Vault Transit key plus the age recipients.
#
# Skips the work when there is no plaintext state, or when the encrypted file
# exists and the plaintext is not newer than it.
#
# The ciphertext is written to a temp file next to the target and renamed into
# place only if sops succeeds. Redirecting straight into the target would
# truncate it on failure, and the resulting empty-but-newer .enc file would
# then suppress re-encryption on every later run.
#
# Arguments: $1 - stack state directory
# Returns:   0 on success or nothing to do, non-zero if encryption failed
encrypt_state() {
  local dir="$1"
  local src="$dir/terraform.tfstate"
  local dst="$dir/terraform.tfstate.enc"
  local name tmp
  name="$(stack_name_from_dir "$dir")"
  [ -f "$src" ] || return 0
  # Only re-encrypt if state is newer than encrypted version
  if [ ! -f "$dst" ] || [ "$src" -nt "$dst" ]; then
    tmp="$(mktemp "$dst.XXXXXX")" || return 1
    if sops -e --input-type json --output-type json \
      --hc-vault-transit "$(transit_uri "$name")" \
      --age "$AGE_RECIPIENTS" \
      "$src" > "$tmp"; then
      mv -f -- "$tmp" "$dst"
    else
      rm -f -- "$tmp"
      echo "state-sync: ERROR — failed to encrypt $src; existing $dst left untouched" >&2
      return 1
    fi
  fi
}
|
||||
|
||||
# Decrypt <dir>/terraform.tfstate.enc to <dir>/terraform.tfstate with sops.
#
# Key selection: Vault Transit when vault_available succeeds; otherwise the age
# key file ($SOPS_AGE_KEY_FILE, default ~/.config/sops/age/keys.txt); otherwise
# an error.
#
# The plaintext is written to a temp file next to the target and renamed into
# place only if sops succeeds, so a failed decrypt never truncates an existing
# terraform.tfstate.
#
# Arguments: $1 - stack state directory
# Returns:   0 on success or when there is no encrypted state, non-zero on error
decrypt_state() {
  local dir="$1"
  local src="$dir/terraform.tfstate.enc"
  local dst="$dir/terraform.tfstate"
  local age_key="${SOPS_AGE_KEY_FILE:-$HOME/.config/sops/age/keys.txt}"
  local mode tmp rc=0
  [ -f "$src" ] || return 0

  if vault_available; then
    # Vault Transit — per-stack key, no local key needed
    mode=vault
  elif [ -f "$age_key" ]; then
    # Fallback: age key on disk (bootstrap / Vault down)
    echo "state-sync: Vault unavailable, falling back to age key" >&2
    mode=age
  else
    echo "state-sync: ERROR — no Vault token and no age key at ~/.config/sops/age/keys.txt" >&2
    return 1
  fi

  tmp="$(mktemp "$dst.XXXXXX")" || return 1
  if [ "$mode" = age ]; then
    SOPS_AGE_KEY_FILE="$age_key" \
      sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
  else
    sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
  fi

  if [ "$rc" -ne 0 ]; then
    rm -f -- "$tmp"
    echo "state-sync: ERROR — failed to decrypt $src; existing $dst left untouched" >&2
    return "$rc"
  fi
  mv -f -- "$tmp" "$dst"
}
|
||||
|
||||
# Command dispatch. encrypt/decrypt act on one stack when a name is given,
# otherwise on every directory under $STATE_DIR; Tier 1 stacks are skipped in
# both modes. An unrecognised command is an error (usage on stderr, exit 2)
# rather than a silent success.
case "$cmd" in
  encrypt)
    if [ -n "$stack" ]; then
      if is_tier0 "$stack"; then
        encrypt_state "$STATE_DIR/$stack"
      else
        echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
      fi
    else
      for dir in "$STATE_DIR"/*/; do
        _name="$(stack_name_from_dir "$dir")"
        if is_tier0 "$_name"; then
          encrypt_state "$dir"
        fi
      done
    fi
    ;;
  decrypt)
    if [ -n "$stack" ]; then
      if is_tier0 "$stack"; then
        decrypt_state "$STATE_DIR/$stack"
      else
        echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
      fi
    else
      for dir in "$STATE_DIR"/*/; do
        _name="$(stack_name_from_dir "$dir")"
        if is_tier0 "$_name"; then
          decrypt_state "$dir"
        fi
      done
    fi
    ;;
  commit)
    # Only Tier 0 stacks have encrypted state in git
    "$0" encrypt
    cd "$REPO_ROOT"
    git add state/stacks/*/terraform.tfstate.enc
    # Commit only when the staged encrypted state actually changed.
    if ! git diff --cached --quiet; then
      git commit -m "state: update encrypted terraform state"
    fi
    ;;
  help|-h|--help)
    echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]"
    echo "Operates on Tier 0 stacks only (infra, platform, cnpg, vault, dbaas, external-secrets)."
    echo "Tier 1 stacks use the PG backend and don't need local state sync."
    echo "Encrypt uses per-stack Vault Transit key (transit/keys/sops-state-<stack>)."
    echo "Decrypt uses Vault Transit if logged in, falls back to age key."
    ;;
  *)
    echo "state-sync: unknown command '$cmd'" >&2
    echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]" >&2
    exit 2
    ;;
esac
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Stop services that may become in a corrupted state if storage is suddenly disconnected
|
||||
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
# Scale a deployment to a replica count.
# Arguments: $1 - namespace, $2 - deployment name, $3 - replica count
# Every expansion is quoted so each value reaches kubectl as exactly one
# argument (no word-splitting or globbing).
scale() {
  kubectl scale deployment --replicas="$3" --namespace "$1" "$2"
}
|
||||
|
||||
### ============================
|
||||
### MAIN
|
||||
### ============================
|
||||
cmd="${1:-stop}"
|
||||
case "$cmd" in
|
||||
stop)
|
||||
scale redis redis 0
|
||||
scale uptime-kuma uptime-kuma 0
|
||||
scale paperless-ngx paperless-ngx 0
|
||||
scale vaultwarden vaultwarden 0
|
||||
scale immich immich-postgresql 0
|
||||
scale nextcloud nextcloud 0
|
||||
scale monitoring prometheus-server 0
|
||||
|
||||
scale technitium technitium 0
|
||||
scale dbaas mysql 0
|
||||
scale dbaas postgresql 0
|
||||
;;
|
||||
start)
|
||||
scale dbaas mysql 1
|
||||
scale dbaas postgresql 1
|
||||
scale technitium technitium 1
|
||||
scale immich immich-postgresql 1
|
||||
scale nextcloud nextcloud 1
|
||||
scale paperless-ngx paperless-ngx 1
|
||||
scale monitoring prometheus-server 1
|
||||
scale redis redis 1
|
||||
scale uptime-kuma uptime-kuma 1
|
||||
scale vaultwarden vaultwarden 1
|
||||
;;
|
||||
# echo "[!] Cleanup only removes links (not flushing all iptables to avoid surprises)."
|
||||
# ip netns list | grep -qw "$NS_NAME" && sudo ip netns del "$NS_NAME" || true
|
||||
# has_link "$HOST_VETH" && sudo ip link del "$HOST_VETH" || true
|
||||
# ;;
|
||||
*)
|
||||
echo "Usage: $0 [stop|start]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
# The t3-dispatch service (unprivileged user t3-dispatch) may run ONLY the
|
||||
# t3-mint wrapper, as root. t3-mint validates the target user against
|
||||
# /etc/ttyd-user-map and mints a one-time t3 pairing token as that user.
|
||||
# A compromise of the network-facing dispatch service can therefore mint
|
||||
# pairing tokens for already-mapped users at most — never arbitrary root.
|
||||
t3-dispatch ALL=(root) NOPASSWD: /usr/local/bin/t3-mint
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
[Unit]
|
||||
Description=Track latest t3 nightly (health-checked, idle-only restart)
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/t3-autoupdate
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Track the latest t3 nightly — with a health-check + auto-rollback (lesson from
|
||||
# the Keel auto-update incidents: never blindly trust a new build) and idle-only
|
||||
# restarts (never kill an in-flight coding session). Runs as root via the unit.
|
||||
set -uo pipefail
|
||||
# Log a message twice: to syslog via logger (tag t3-autoupdate) and to stdout
# with a "t3-autoupdate: " prefix. All arguments are joined into one message.
LOG() {
  local msg="$*"
  logger -t t3-autoupdate "$msg"
  echo "t3-autoupdate: $msg"
}
|
||||
|
||||
# Print the installed t3 version: the last whitespace-separated field of each
# line of `t3 --version`, with one leading "v" removed. Errors from t3 are
# silenced, so a missing or broken t3 yields empty output.
ver() {
  t3 --version 2>/dev/null \
    | awk '{ v = $NF; sub(/^v/, "", v); print v }'
}
|
||||
|
||||
# Record the current version, install the nightly, then compare versions.
before=$(ver); LOG "current: ${before:-unknown}"
npm i -g t3@nightly >/dev/null 2>&1 || { LOG "npm install failed; staying on ${before:-current}"; exit 0; }
after=$(ver)

# If t3 reported a version before the install but reports none afterwards, the
# new build cannot even run `t3 --version`. Treat that as a failed update and
# roll back instead of reporting "already latest".
if [[ -z "$after" && -n "$before" ]]; then
  LOG "t3 --version produced no version after install (was $before) — rolling back"
  if npm i -g "t3@$before" >/dev/null 2>&1; then
    LOG "rolled back to $before"
  else
    LOG "ROLLBACK FAILED — manual fix needed (t3 may be broken)"
  fi
  exit 1
fi

# Same version (or no version known either before or after): nothing to do.
if [[ -z "$after" || "$after" == "$before" ]]; then
  LOG "already latest (${before:-?}); nothing to do"; exit 0
fi
|
||||
LOG "installed $after (was $before); health-checking…"
|
||||
|
||||
# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
|
||||
SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
|
||||
t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
|
||||
smoke=$!; ok=0
|
||||
for _ in $(seq 1 15); do
|
||||
[[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
|
||||
sleep 2
|
||||
done
|
||||
kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"
|
||||
|
||||
if [[ "$ok" != "1" ]]; then
|
||||
LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
|
||||
if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
|
||||
LOG "rolled back to $before"
|
||||
else
|
||||
LOG "ROLLBACK FAILED — manual fix needed (t3 may be broken)"
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
LOG "health OK; restarting idle instances"
|
||||
|
||||
# Restart only IDLE per-user instances; defer any with an active agent child.
|
||||
for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do
|
||||
pid=$(systemctl show -p MainPID --value "$unit")
|
||||
if [[ -n "$pid" && "$pid" != 0 ]] && pgrep -aP "$pid" 2>/dev/null | grep -qiE 'claude|codex|opencode'; then
|
||||
LOG "deferring $unit (active agent) — updates next cycle when idle"
|
||||
else
|
||||
systemctl restart "$unit" && LOG "restarted $unit -> $after"
|
||||
fi
|
||||
done
|
||||
LOG "update complete: $after"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily t3 nightly auto-update
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 04:00:00
|
||||
RandomizedDelaySec=1h
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
[Unit]
|
||||
Description=t3 per-user dispatch + auto-pair (X-authentik-username -> user instance)
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
# Unprivileged dedicated user; the only privileged action is `sudo t3-mint`
|
||||
# (scoped in /etc/sudoers.d/t3-autopair). Compromise => mint tokens at most.
|
||||
User=t3-dispatch
|
||||
ExecStart=/usr/local/bin/t3-dispatch
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
module t3-dispatch
|
||||
|
||||
go 1.22
|
||||
|
|
@ -1,139 +0,0 @@
|
|||
// t3-dispatch: per-user dispatch + auto-pair for t3code.
|
||||
// Sits behind Traefik+Authentik (which injects X-authentik-username) and routes
|
||||
// each authenticated user to their own `t3 serve` instance. On a user's first
|
||||
// visit (no t3 session cookie) it mints a pairing token for that user's instance
|
||||
// and exchanges it for the session cookie, which it injects into the browser —
|
||||
// so an Authentik login lands straight in the user's workspace.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
type entry struct {
|
||||
OsUser string `json:"os_user"`
|
||||
Port int `json:"port"`
|
||||
}
|
||||
|
||||
const (
|
||||
cookieName = "t3_session" // discovered: apps/server/src/auth/utils.ts (web mode)
|
||||
listenAddr = ":3780"
|
||||
dispatchFile = "/etc/t3-serve/dispatch.json"
|
||||
)
|
||||
|
||||
var (
|
||||
mu sync.RWMutex
|
||||
table map[string]entry
|
||||
)
|
||||
|
||||
func loadTable() error {
|
||||
b, err := os.ReadFile(dispatchFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m := map[string]entry{}
|
||||
if err := json.Unmarshal(b, &m); err != nil {
|
||||
return err
|
||||
}
|
||||
mu.Lock()
|
||||
table = m
|
||||
mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func lookup(ak string) (entry, bool) {
|
||||
mu.RLock()
|
||||
defer mu.RUnlock()
|
||||
e, ok := table[ak]
|
||||
return e, ok
|
||||
}
|
||||
|
||||
// autoPair mints a one-time pairing token for the user's instance (as that OS
|
||||
// user, via the scoped sudoers entry) and exchanges it at the instance's
|
||||
// /api/auth/bootstrap, relaying the returned t3_session Set-Cookie to the browser.
|
||||
func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
|
||||
// t3-mint (root, via scoped sudoers) validates the OS user is in
|
||||
// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
|
||||
// runs unprivileged and can invoke nothing else.
|
||||
out, err := exec.Command("sudo", "-n", "/usr/local/bin/t3-mint", e.OsUser).Output()
|
||||
if err != nil {
|
||||
log.Printf("mint for %s failed: %v", e.OsUser, err)
|
||||
http.Error(w, "pairing mint failed", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
var pc struct {
|
||||
Credential string `json:"credential"` // CLI returns the token under "credential"
|
||||
}
|
||||
if err := json.Unmarshal(out, &pc); err != nil || pc.Credential == "" {
|
||||
http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
body, _ := json.Marshal(map[string]string{"credential": pc.Credential})
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/auth/bootstrap", e.Port),
|
||||
"application/json", bytes.NewReader(body))
|
||||
if err != nil {
|
||||
http.Error(w, "bootstrap request failed", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Printf("bootstrap for %s returned %d", e.OsUser, resp.StatusCode)
|
||||
http.Error(w, "bootstrap rejected", http.StatusBadGateway)
|
||||
return
|
||||
}
|
||||
for _, c := range resp.Cookies() {
|
||||
http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
|
||||
}
|
||||
http.Redirect(w, r, "/", http.StatusFound)
|
||||
}
|
||||
|
||||
func handler(w http.ResponseWriter, r *http.Request) {
|
||||
ak := r.Header.Get("X-authentik-username")
|
||||
// Authentik injects the full email (e.g. vbarzin@gmail.com); /etc/ttyd-user-map
|
||||
// (and thus dispatch.json) keys on the local part. Strip @domain, matching the
|
||||
// terminal stack's tmux-attach.sh (`${auth_user%%@*}`).
|
||||
if i := strings.IndexByte(ak, '@'); i >= 0 {
|
||||
ak = ak[:i]
|
||||
}
|
||||
e, ok := lookup(ak)
|
||||
if !ok {
|
||||
http.Error(w, "no t3 instance provisioned for this user", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
if _, err := r.Cookie(cookieName); err != nil {
|
||||
autoPair(e, w, r)
|
||||
return
|
||||
}
|
||||
// Steady state: reverse-proxy (incl. WebSocket upgrade) to the user's instance.
|
||||
target, _ := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", e.Port))
|
||||
httputil.NewSingleHostReverseProxy(target).ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
func main() {
|
||||
if err := loadTable(); err != nil {
|
||||
log.Fatalf("load %s: %v", dispatchFile, err)
|
||||
}
|
||||
go func() {
|
||||
for range time.Tick(60 * time.Second) {
|
||||
if err := loadTable(); err != nil {
|
||||
log.Printf("reload %s: %v", dispatchFile, err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte("ok\n")) })
|
||||
mux.HandleFunc("/", handler)
|
||||
log.Printf("t3-dispatch listening on %s", listenAddr)
|
||||
log.Fatal(http.ListenAndServe(listenAddr, mux))
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Mint a one-time t3 pairing token for a mapped OS user.
|
||||
# Runs as root via the scoped sudoers entry for the t3-dispatch service user.
|
||||
# Validates the requested user is an actual t3 OS user (a value on the RHS of
|
||||
# /etc/ttyd-user-map) before minting as that user. Prints the t3 CLI JSON.
|
||||
set -euo pipefail
|
||||
os_user="${1:-}"
|
||||
[[ "$os_user" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]] || { echo "invalid user" >&2; exit 2; }
|
||||
# Must be a mapped t3 OS user (RHS of a non-comment "authentik=os" line).
|
||||
awk -F= '!/^[[:space:]]*#/ && NF==2 { gsub(/[[:space:]]/, "", $2); print $2 }' /etc/ttyd-user-map \
|
||||
| grep -qxF "$os_user" || { echo "user not mapped" >&2; exit 3; }
|
||||
exec runuser -u "$os_user" -- /usr/bin/t3 auth pairing create \
|
||||
--base-dir "/home/${os_user}/.t3" --ttl 5m --json
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
[Unit]
|
||||
Description=Reconcile per-user t3 instances from /etc/ttyd-user-map
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/t3-provision-users
|
||||
|
|
@ -1,171 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Reconcile per-user t3 Workstation instances from roster.yaml (the single source
|
||||
# of truth). roster_engine.py derives the desired state (accounts, per-tier groups,
|
||||
# sticky ports, /etc/ttyd-user-map, dispatch.json); this script APPLIES it.
|
||||
#
|
||||
# ADDITIVE-ONLY for existing users: never removes a group, never replaces a home,
|
||||
# never re-locks/re-chmods an existing account — so a routine (hourly) reconcile is
|
||||
# always safe for live users. Destructive offboarding (userdel) is a SEPARATE, gated
|
||||
# path, never here. Runs hourly as root via t3-provision-users.timer; root has no
|
||||
# Vault token, so tier validation is best-effort (skipped when k8s_users is unreachable).
|
||||
#
|
||||
# DRY_RUN=1 prints actions without mutating. WORKSTATION_DIR overrides the roster/engine location.
|
||||
set -euo pipefail
|
||||
|
||||
WORKSTATION_DIR="${WORKSTATION_DIR:-/home/wizard/code/infra/scripts/workstation}"
|
||||
ENGINE="$WORKSTATION_DIR/roster_engine.py"
|
||||
ROSTER="$WORKSTATION_DIR/roster.yaml"
|
||||
ENVDIR=/etc/t3-serve
|
||||
MAP=/etc/ttyd-user-map
|
||||
DRY_RUN="${DRY_RUN:-0}"
|
||||
# Public infra repo for the locked clone (no auth; the monorepo has no remote).
|
||||
INFRA_REMOTE="${INFRA_REMOTE:-https://github.com/ViktorBarzin/infra.git}"
|
||||
# Per-user OIDC kubeconfig (kubelogin/PKCE; cluster server+CA copied from the admin kubeconfig).
|
||||
OIDC_ISSUER="${OIDC_ISSUER:-https://authentik.viktorbarzin.me/application/o/kubernetes/}"
|
||||
ADMIN_KUBECONFIG="${ADMIN_KUBECONFIG:-/home/wizard/.kube/config}"
|
||||
|
||||
# Print a progress line on stdout, tagged with the script prefix.
log() {
  printf '[t3-provision] %s\n' "$*"
}
|
||||
# Execute the given command, or only print it when DRY_RUN=1.
run() {
  case "$DRY_RUN" in
    1) echo "[dry-run] $*" ;;
    *) "$@" ;;
  esac
}
|
||||
|
||||
# Per-non-admin writable, git-crypt-LOCKED infra clone at ~/code. Keyless +
# filter=cat ⇒ code/docs are plaintext, git-crypt'd secret files stay ciphertext.
# Writable + ungated (push != apply; applies are admin-only). NEVER touches an
# existing ~/code (so emo's symlink survives until the gated cutover).
#
# Arguments: $1 - OS user name
# Returns:   0 when nothing had to be done or the clone was installed;
#            1 when cloning failed (any partial clone is removed first, so the
#            next reconcile retries instead of finding a half-made ~/code).
install_locked_clone() {
  local user="$1" home dest staging
  # `|| home=""`: under `set -e -o pipefail` a failing getent (user does not
  # exist yet, e.g. in a dry run) would otherwise abort the whole script before
  # the empty-home guard below is ever reached.
  home="$(getent passwd "$user" | cut -d: -f6)" || home=""
  [[ -z "$home" ]] && return 0
  dest="$home/code"
  [[ -e "$dest" || -L "$dest" ]] && return 0
  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] locked infra clone -> $user:$home/code"; return 0; fi
  log "clone locked infra -> $user:~/code"

  # Build the clone in a staging directory and rename it into place only after
  # the filter config and the checkout succeeded. Cloning straight into ~/code
  # meant a failed checkout left a no-checkout clone behind that every later
  # run skipped as "already exists".
  staging="$home/.code.clone.$$"
  runuser -u "$user" -- rm -rf -- "$staging"
  if runuser -u "$user" -- git clone --quiet --no-checkout "$INFRA_REMOTE" "$staging" \
    && runuser -u "$user" -- git -C "$staging" config filter.git-crypt.smudge cat \
    && runuser -u "$user" -- git -C "$staging" config filter.git-crypt.clean cat \
    && runuser -u "$user" -- git -C "$staging" config filter.git-crypt.required false \
    && runuser -u "$user" -- git -C "$staging" checkout --quiet master \
    && runuser -u "$user" -- mv -T -- "$staging" "$dest"; then
    return 0
  fi

  log "WARN: locked clone failed for $user -> removed partial clone, will retry next run"
  runuser -u "$user" -- rm -rf -- "$staging" || true
  return 1
}
|
||||
|
||||
# Per-user OIDC kubeconfig (kubelogin/PKCE — the `kubernetes` Authentik client is
# public, no secret). Identical for all users: identity comes from each user's own
# interactive OIDC login, which the apiserver maps (email claim) to their RBAC.
# Cluster server + CA are copied from the admin kubeconfig. If-absent, never clobber.
#
# Arguments: $1 - OS user name
# Returns:   0 when skipped or written; 1 when the file could not be written.
install_user_kubeconfig() {
  local user="$1" home kc server ca
  # `|| home=""`: with `set -e -o pipefail` a failing getent would abort the
  # script before the empty-home guard is reached.
  home="$(getent passwd "$user" | cut -d: -f6)" || home=""
  [[ -z "$home" ]] && return 0
  kc="$home/.kube/config"
  # Anything already at that path (file, directory or symlink, even a dangling
  # one) counts as "present": never clobber and never write through a link.
  [[ -e "$kc" || -L "$kc" ]] && return 0
  [[ -r "$ADMIN_KUBECONFIG" ]] || { log "WARN: $ADMIN_KUBECONFIG unreadable -> skip kubeconfig for $user"; return 0; }
  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] OIDC kubeconfig -> $user:$kc"; return 0; fi
  # `|| var=""`: a failing kubectl must land in the WARN/skip branch below, not
  # abort the whole reconcile through `set -e`.
  server="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.server}')" || server=""
  ca="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.certificate-authority-data}')" || ca=""
  [[ -n "$server" && -n "$ca" ]] || { log "WARN: could not read cluster server/CA -> skip kubeconfig for $user"; return 0; }

  # Create ~/.kube and the file AS THE USER (umask 077 => dir 0700, file 0600).
  # Writing as root and chown-ing afterwards follows symlinks, so a user could
  # point ~/.kube/config at a path outside their home and have root create it
  # and hand over ownership.
  if ! runuser -u "$user" -- sh -c 'umask 077 && mkdir -p -- "$1" && cat > "$2"' sh "$home/.kube" "$kc" <<EOF
apiVersion: v1
kind: Config
clusters:
- name: homelab
  cluster:
    server: $server
    certificate-authority-data: $ca
contexts:
- name: oidc@homelab
  context:
    cluster: homelab
    user: oidc
current-context: oidc@homelab
users:
- name: oidc
  user:
    exec:
      apiVersion: client.authentication.k8s.io/v1beta1
      command: kubectl
      args:
      - oidc-login
      - get-token
      - --oidc-issuer-url=$OIDC_ISSUER
      - --oidc-client-id=kubernetes
      - --oidc-extra-scope=email
      - --oidc-extra-scope=profile
      - --oidc-extra-scope=groups
      interactiveMode: IfAvailable
EOF
  then
    log "WARN: could not write $kc as $user"
    return 1
  fi
  log "wrote OIDC kubeconfig -> $user:~/.kube/config"
}
|
||||
|
||||
[[ $EUID -eq 0 ]] || { echo "t3-provision-users: must run as root" >&2; exit 1; }
|
||||
for bin in python3 jq; do command -v "$bin" >/dev/null || { echo "missing $bin" >&2; exit 1; }; done
|
||||
[[ -f "$ROSTER" && -f "$ENGINE" ]] || { echo "roster/engine not under $WORKSTATION_DIR" >&2; exit 1; }
|
||||
install -d -m 0755 "$ENVDIR"
|
||||
|
||||
# 1) current sticky ports from existing .env files -> {os_user: port}
# Emits one single-key JSON object per usable .env file (after a leading "{}",
# so an empty directory still yields an object) and merges them with `jq -s add`.
ports_file="$(mktemp)"; trap 'rm -f "$ports_file" "${desired_file:-}"' EXIT
{
  echo "{}"
  for f in "$ENVDIR"/*.env; do
    [[ -e "$f" ]] || continue
    u="$(basename "$f" .env)"
    # `|| p=""`: an .env file without a T3_PORT line makes grep exit 1, which
    # `pipefail` + `set -e` would turn into an abort of the whole reconcile.
    p="$(grep -m1 -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2)" || p=""
    p="${p%%$'\n'*}"   # several matches on one line: keep the first only
    # `if` rather than `[[ ... ]] && jq`: a false test as the last command of
    # the last iteration would make the group (and the pipeline) fail.
    if [[ -n "$p" ]]; then
      jq -n --arg u "$u" --argjson p "$p" '{($u): $p}'
    fi
  done
} | jq -s 'add' > "$ports_file"
|
||||
|
||||
# 2) tier validation vs live k8s_users (best-effort; aborts only on a real conflict)
|
||||
if command -v vault >/dev/null; then
|
||||
export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
|
||||
if k8s_raw="$(vault kv get -field=k8s_users secret/platform 2>/dev/null)"; then
|
||||
k8s_file="$(mktemp)"; echo "$k8s_raw" | jq -c 'map_values(.role)' > "$k8s_file"
|
||||
if ! python3 "$ENGINE" validate --roster "$ROSTER" --k8s-users-json "$k8s_file"; then
|
||||
rm -f "$k8s_file"; echo "[t3-provision] ABORT: roster tier conflicts with k8s_users" >&2; exit 1
|
||||
fi
|
||||
rm -f "$k8s_file"
|
||||
else
|
||||
log "WARN: k8s_users unreachable (no Vault token?) -> skipping tier validation"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 3) derive desired state
|
||||
desired_file="$(mktemp)"
|
||||
python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" > "$desired_file"
|
||||
jq -e . "$desired_file" >/dev/null || { echo "[t3-provision] derive produced invalid JSON" >&2; exit 1; }
|
||||
|
||||
# 4) per-account: create-if-absent + ADDITIVE tier groups (never strip) + locked clone
# Input rows come from the derived state as TSV: os_user, tier, shell, groups (csv).
while IFS=$'\t' read -r os_user tier shell groups_csv; do
  if ! id "$os_user" >/dev/null 2>&1; then
    log "create account: $os_user (shell $shell)"
    run useradd -m -s "$shell" "$os_user"
    run passwd -l "$os_user"   # SSO/t3 only — no local password
    run chmod 700 "/home/$os_user"
  fi
  if [[ -n "$groups_csv" ]]; then
    # `|| current=""`: in a dry run the account above was not really created,
    # so `id` fails; with `pipefail` + `set -e` that would abort the script.
    current="$(id -nG "$os_user" 2>/dev/null | tr ' ' '\n')" || current=""
    IFS=',' read -ra want <<< "$groups_csv"
    for g in "${want[@]}"; do
      # -F/--: group names are literal strings, not regular expressions.
      grep -qxF -- "$g" <<< "$current" && continue   # already a member -> skip
      getent group "$g" >/dev/null 2>&1 || continue  # group must exist
      log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null
    done
  fi
  if [[ "$tier" != admin ]]; then   # non-admins: locked ~/code clone + OIDC kubeconfig
    install_locked_clone "$os_user"
    install_user_kubeconfig "$os_user"
  fi
done < <(jq -r '.accounts[] | [.os_user, .tier, .shell, (.groups|join(","))] | @tsv' "$desired_file")
|
||||
|
||||
# 5) per-user .env (sticky port) + enable t3-serve@
# Input rows come from the derived state as TSV: os_user, port.
while IFS=$'\t' read -r os_user port; do
  envf="$ENVDIR/$os_user.env"
  if [[ ! -f "$envf" ]] || ! grep -qx "T3_PORT=$port" "$envf"; then
    # Write directly instead of building a command string for `bash -c`:
    # interpolating $port/$envf into shell source breaks (or injects) as soon
    # as either value contains a quote.
    if [[ "$DRY_RUN" == 1 ]]; then
      echo "[dry-run] write T3_PORT=$port -> $envf"
    else
      printf 'T3_PORT=%s\n' "$port" > "$envf"
    fi
  fi
  # Explicit if/elif instead of `A && B || true`: the old form discarded the
  # dry-run line together with systemctl's output and hid enable failures.
  # A failure is still non-fatal, but it is now logged.
  if id "$os_user" >/dev/null 2>&1; then
    if [[ "$DRY_RUN" == 1 ]]; then
      echo "[dry-run] systemctl enable --now t3-serve@$os_user.service"
    elif ! systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1; then
      log "WARN: could not enable/start t3-serve@$os_user.service"
    fi
  fi
done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")
|
||||
|
||||
# 6) regenerate /etc/ttyd-user-map + dispatch.json from the desired state (SSoT:
# a roster entry removed here DISAPPEARS, which is what the offboarding cut relies on)
if [[ "$DRY_RUN" == 1 ]]; then
  log "[dry-run] would regenerate $MAP + $ENVDIR/dispatch.json"
else
  # One command per line so `set -e` stops on the first failure: in an
  # `a && b && c` chain a failing non-final command does NOT trigger errexit,
  # so the old form could leave stale files and still report success.
  # `jq -e` additionally fails when the key is missing/null instead of writing
  # the literal text "null".
  # chmod + mv (rename within the same directory) replaces each file
  # atomically, so a reader never observes a half-written file.
  jq -er '.ttyd_user_map' "$desired_file" > "$MAP.tmp"
  chmod 0644 "$MAP.tmp"
  mv -f "$MAP.tmp" "$MAP"

  jq -ec '.dispatch' "$desired_file" > "$ENVDIR/dispatch.json.tmp"
  chmod 0644 "$ENVDIR/dispatch.json.tmp"
  mv -f "$ENVDIR/dispatch.json.tmp" "$ENVDIR/dispatch.json"
fi
|
||||
|
||||
log "reconcile complete ($([[ "$DRY_RUN" == 1 ]] && echo DRY-RUN || echo applied))"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Periodic t3 per-user reconcile
|
||||
|
||||
[Timer]
|
||||
OnBootSec=2min
|
||||
OnCalendar=hourly
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
[Unit]
|
||||
Description=T3 Code server for %i (t3 serve, per-user)
|
||||
Documentation=https://github.com/pingdotgg/t3code
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=%i
|
||||
Group=%i
|
||||
Environment=HOME=/home/%i
|
||||
Environment=PATH=/usr/local/bin:/usr/bin:/bin:/home/%i/.local/bin
|
||||
Environment=NODE_ENV=production
|
||||
EnvironmentFile=/etc/t3-serve/%i.env
|
||||
WorkingDirectory=/home/%i
|
||||
ExecStart=/usr/bin/t3 serve --host 0.0.0.0 --port ${T3_PORT} --base-dir /home/%i/.t3
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
@ -1,261 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Task processor for the Forgejo → OpenClaw pipeline.
|
||||
# Polls Forgejo for new issues in the tasks repo, sends them to OpenClaw
|
||||
# for processing, and posts results back as comments.
|
||||
#
|
||||
# Runs inside the OpenClaw pod via kubectl exec from a CronJob.
|
||||
#
|
||||
# Environment:
|
||||
# FORGEJO_TOKEN — Forgejo API token with repo access
|
||||
# FORGEJO_URL — Forgejo base URL (default: https://forgejo.viktorbarzin.me)
|
||||
# FORGEJO_REPO — Repo in format "owner/repo" (default: viktor/tasks)
|
||||
# OPENCLAW_URL — OpenAI-compatible API base URL (default: https://integrate.api.nvidia.com)
|
||||
# OPENCLAW_TOKEN — OpenClaw gateway token
|
||||
# SLACK_WEBHOOK_URL — Optional Slack webhook for notifications
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
|
||||
FORGEJO_REPO="${FORGEJO_REPO:-viktor/tasks}"
|
||||
OPENCLAW_URL="${OPENCLAW_URL:-https://integrate.api.nvidia.com}"
|
||||
SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"
|
||||
|
||||
: "${FORGEJO_TOKEN:?FORGEJO_TOKEN is required}"
|
||||
: "${OPENCLAW_TOKEN:?OPENCLAW_TOKEN is required}"
|
||||
FORGEJO_BOT_USER="${FORGEJO_BOT_USER:-viktor}"
|
||||
|
||||
# Authenticated Forgejo API call. Adds the token and JSON content-type headers
# and forwards every argument (URL, -X, -d, ...) to curl unchanged. `-f` makes
# HTTP error statuses fail the call; `-s` keeps curl quiet.
fg_api() {
  local -a auth_headers=(
    -H "Authorization: token $FORGEJO_TOKEN"
    -H "Content-Type: application/json"
  )
  curl -sf "${auth_headers[@]}" "$@"
}
|
||||
|
||||
# Print the numeric id of the repo label named $1, or 0 when it cannot be
# determined (label absent, API call failed, or the response was not the
# expected JSON list). Callers treat 0 as "no such label", so an API error must
# produce that sentinel and never an empty string.
# Only the first 50 labels of the repo are searched (limit=50).
get_label_id() {
  local label_name="$1"
  local labels_json
  labels_json=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/labels?limit=50" 2>/dev/null) || labels_json=""
  python3 -c "
import sys, json
name = sys.argv[1]
try:
    labels = json.load(sys.stdin)
    print(next((l['id'] for l in labels if l['name'] == name), 0))
except Exception:
    print(0)
" "$label_name" <<< "$labels_json"
}
|
||||
|
||||
# Attach the label named $2 to issue $1. Best-effort: an unknown label or a
# failing API call is silently ignored.
add_label() {
  local issue_id="$1" label_name="$2"
  local label_id
  label_id=$(get_label_id "$label_name") || label_id=""
  # Require a real positive id. The previous `!= "0"` test also let an EMPTY id
  # through (lookup failed), which sent the malformed payload {"labels":[]}.
  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
    fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels" \
      -d "{\"labels\":[$label_id]}" > /dev/null 2>&1 || true
  fi
}
|
||||
|
||||
# Detach the label named $2 from issue $1. Best-effort: an unknown label or a
# failing API call is silently ignored.
remove_label() {
  local issue_id="$1" label_name="$2"
  local label_id
  label_id=$(get_label_id "$label_name") || label_id=""
  # Require a real positive id. With an EMPTY id (lookup failed) the previous
  # `!= "0"` test issued DELETE on ".../labels/", i.e. on the issue's whole
  # label collection rather than on a single label.
  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
    fg_api -X DELETE "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels/$label_id" > /dev/null 2>&1 || true
  fi
}
|
||||
|
||||
# Post a comment on issue $1. The comment text is taken from stdin (so callers
# never have to shell-quote it), wrapped into a {"body": ...} JSON document and
# sent to the issue's comments endpoint. Output of the API call is discarded;
# the exit status is that of the pipeline.
post_comment() {
  local issue_id="$1"
  local comments_url="$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments"
  python3 -c "
import json, sys
print(json.dumps({'body': sys.stdin.read()}))
" | fg_api "$comments_url" -d @- > /dev/null 2>&1
}
|
||||
|
||||
# Set issue $1 to state "closed" via a PATCH on the issue resource.
# Output is discarded; the exit status is that of the API call.
close_issue() {
  local issue_id="$1"
  local -a patch_args=(-X PATCH -d '{"state": "closed"}')
  fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id" \
    "${patch_args[@]}" > /dev/null 2>&1
}
|
||||
|
||||
# Print the conversation on issue $1 (first 20 comments) as plain text, entries
# separated by a blank line:
#   - comments by other users become "[login]: body";
#   - comments by $FORGEJO_BOT_USER are dropped unless they carry the
#     "## OpenClaw Task Result" marker, in which case a summary (header, rule
#     and footer lines removed, max 500 chars) is kept as
#     "[Previous AI response]: ...".
# Errors (API or parsing) are suppressed and yield empty output.
get_comment_history() {
  local issue_id="$1"
  local comments_url="$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments?limit=20"
  fg_api "$comments_url" 2>/dev/null |
    python3 -c "
import sys, json

bot_user = sys.argv[1]
history = []
for comment in json.load(sys.stdin):
    user = comment.get('user', {}).get('login', 'unknown')
    body = comment.get('body', '')
    if user != bot_user:
        history.append(f'[{user}]: {body}')
        continue
    # Bot's own comment: keep only a short summary of earlier results
    if '## OpenClaw Task Result' not in body:
        continue
    kept = [
        line for line in body.split('\n')
        if not line.startswith(('## ', '---', '*Processed'))
    ]
    summary = '\n'.join(kept).strip()[:500]
    if summary:
        history.append(f'[Previous AI response]: {summary}')
print('\n\n'.join(history))
" "$FORGEJO_BOT_USER" 2>/dev/null
}
|
||||
|
||||
# Send the text $1 to Slack when SLACK_WEBHOOK_URL is configured; otherwise do
# nothing. Purely best-effort: delivery failures never affect the caller.
notify_slack() {
  [ -n "$SLACK_WEBHOOK_URL" ] || return 0
  python3 -c "
import json, sys
print(json.dumps({'text': sys.argv[1]}))
" "$1" | curl -sf -X POST "$SLACK_WEBHOOK_URL" \
    -H "Content-Type: application/json" -d @- > /dev/null 2>&1 || true
}
|
||||
|
||||
# Record a failed run on issue $1 (title $2): post the explanation $3 as a
# comment, swap the processing label for failed, and ping Slack.
_task_mark_failed() {
  local issue_id="$1" title="$2" message="$3"
  echo "$message" | post_comment "$issue_id"
  add_label "$issue_id" "failed"
  remove_label "$issue_id" "processing"
  notify_slack ":x: Task #$issue_id failed: $title"
}

# Process one issue end to end: label it as processing, send title, body and
# comment history to the chat-completions API, post the answer as a comment,
# then label it completed and close it.
#
# Arguments: $1 issue number, $2 title, $3 body, $4 author login
# Returns:   0 when the result was posted and the issue closed;
#            1 when the API call failed, the response could not be parsed, or
#            the result comment could not be posted. In those cases the issue
#            is labelled "failed" and stays open.
process_issue() {
  local issue_id="$1" title="$2" body="$3" author="$4"

  echo "Processing issue #$issue_id: $title (by $author)"

  # Mark as processing
  add_label "$issue_id" "processing"
  remove_label "$issue_id" "pending"
  remove_label "$issue_id" "completed"

  # Fetch comment history for context (best-effort: empty on any error)
  local comment_history
  comment_history=$(get_comment_history "$issue_id") || true

  # Call the OpenAI-compatible chat completions endpoint.
  # Python builds the JSON payload so no shell quoting is involved.
  local response
  response=$(python3 -c "
import json, sys
title = sys.argv[1]
body = sys.argv[2]
author = sys.argv[3]
comment_history = sys.argv[4]

prompt = f'''You are processing a task submitted by {author} via the Forgejo task queue.

Task title: {title}

Task description:
{body}'''

if comment_history.strip():
    prompt += f'''

Conversation history (follow-up comments):
{comment_history}

The latest comment is the most recent request. Address it in context of the original task and prior conversation.'''

prompt += '''

Please execute this task. When done, provide a clear summary of what was done and any results.
If the task requires infrastructure changes, describe what changes would be needed but do NOT apply them automatically — list the commands/changes for review.'''

payload = {
    'model': 'mistralai/mistral-large-3-675b-instruct-2512',
    'messages': [
        {'role': 'system', 'content': 'You are an infrastructure AI assistant. Process the task and provide actionable results. Be concise.'},
        {'role': 'user', 'content': prompt}
    ],
    'max_tokens': 8192,
    'temperature': 0.3
}
print(json.dumps(payload))
" "$title" "$body" "$author" "$comment_history" | \
    curl -sf --max-time 300 \
      -H "Authorization: Bearer $OPENCLAW_TOKEN" \
      -H "Content-Type: application/json" \
      "$OPENCLAW_URL/v1/chat/completions" \
      -d @- 2>/dev/null) || {
    echo "  ERROR: OpenClaw API call failed"
    _task_mark_failed "$issue_id" "$title" \
      "Failed to process this task. OpenClaw API returned an error. Please check the CronJob logs or process manually."
    return 1
  }

  # Extract the answer text. A response that cannot be parsed is a FAILURE:
  # previously the parse error was posted as the "result" and the issue was
  # still labelled completed and closed.
  local result
  if ! result=$(python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    msg = data['choices'][0]['message']
    # Some models put content in reasoning_content instead of content
    result = msg.get('content') or msg.get('reasoning_content') or msg.get('reasoning') or 'No response generated.'
except Exception as e:
    print(f'Error parsing OpenClaw response: {e}', file=sys.stderr)
    sys.exit(1)
print(result)
" <<< "$response"); then
    echo "  ERROR: could not parse OpenClaw response"
    _task_mark_failed "$issue_id" "$title" \
      "Failed to process this task. The OpenClaw API response could not be parsed. Please check the CronJob logs or process manually."
    return 1
  fi

  # Post the result. The header and footer lines are what get_comment_history
  # strips again when summarising earlier answers, so keep them unchanged.
  if ! printf '## OpenClaw Task Result\n\n%s\n\n---\n*Processed automatically by the OpenClaw task pipeline.*\n' "$result" |
    post_comment "$issue_id"; then
    # Never close an issue whose result was not delivered.
    echo "  ERROR: could not post result comment"
    add_label "$issue_id" "failed"
    remove_label "$issue_id" "processing"
    notify_slack ":x: Task #$issue_id failed: $title"
    return 1
  fi

  # Update labels and close
  add_label "$issue_id" "completed"
  remove_label "$issue_id" "processing"
  close_issue "$issue_id"

  echo "  Issue #$issue_id processed and closed"
  notify_slack ":white_check_mark: Task #$issue_id completed: $title"
}
|
||||
|
||||
# --- Main ---

echo "=== Task Processor $(date -u +%Y-%m-%dT%H:%M:%SZ) ==="

# List open issues (oldest first, at most 10 per run)
if ! ISSUES=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues?state=open&type=issues&limit=10&sort=created&direction=asc" 2>/dev/null); then
  echo "ERROR: Could not fetch issues from Forgejo"
  exit 1
fi

# Parse pending issues into a temp file, one JSON object per line
# (avoids delimiter issues with free-text titles and bodies)
PENDING_FILE=$(mktemp)
trap 'rm -f "$PENDING_FILE"' EXIT

python3 -c "
import sys, json
for issue in json.load(sys.stdin):
    labels = [l['name'] for l in issue.get('labels', [])]
    # Every open issue is picked up unless it currently carries the
    # processing label
    if 'processing' in labels:
        continue
    print(json.dumps({
        'id': issue['number'],
        'title': issue['title'],
        'body': (issue.get('body') or '')[:4000],
        'author': issue['user']['login']
    }))
" <<< "$ISSUES" > "$PENDING_FILE"

ISSUE_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')

if [ "$ISSUE_COUNT" = "0" ]; then
  echo "No pending issues to process"
  exit 0
fi

echo "Found $ISSUE_COUNT pending issue(s)"

# Process each pending issue (one JSON object per line). A failing issue must
# not stop the remaining ones, hence the `|| true`.
while IFS= read -r line; do
  issue_id=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['id'])" "$line")
  title=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['title'])" "$line")
  body=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['body'])" "$line")
  author=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['author'])" "$line")
  process_issue "$issue_id" "$title" "$body" "$author" || true
done < "$PENDING_FILE"

echo "=== Task processing complete ==="
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Unit tests for the pure functions in fan-control.sh.
|
||||
# Sources the script (main is guarded), exercises curve/decide/resolve/presence/parse.
|
||||
# Run: bash infra/scripts/test-fan-control.sh
|
||||
|
||||
set -uo pipefail
|
||||
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# shellcheck source=/dev/null
|
||||
source "$DIR/fan-control.sh"
|
||||
|
||||
pass=0 fail=0
|
||||
# eq <description> <expected> <actual>
# Count one passed check when both strings are identical; otherwise count one
# failure and print a FAIL line naming the check and both values.
eq() {
  local label="$1" want="$2" got="$3"
  if [ "$want" != "$got" ]; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected [%s] got [%s]\n' "$label" "$want" "$got"
    return
  fi
  pass=$((pass + 1))
}
|
||||
|
||||
# --- COOL curve (continuous linear: 30% @50C .. 100% @83C) ---
|
||||
eq "cool <=T_LO clamps" 30 "$(fc_curve cool 40)"
|
||||
eq "cool 50 -> 30" 30 "$(fc_curve cool 50)"
|
||||
eq "cool 55 -> 41" 41 "$(fc_curve cool 55)"
|
||||
eq "cool 60 -> 51" 51 "$(fc_curve cool 60)"
|
||||
eq "cool 64 -> 60" 60 "$(fc_curve cool 64)"
|
||||
eq "cool 70 -> 72" 72 "$(fc_curve cool 70)"
|
||||
eq "cool 75 -> 83" 83 "$(fc_curve cool 75)"
|
||||
eq "cool 83 -> 100" 100 "$(fc_curve cool 83)"
|
||||
eq "cool >=T_HI clamps" 100 "$(fc_curve cool 90)"
|
||||
|
||||
# --- QUIET curve (continuous linear: 20% @68C .. 100% @83C) ---
|
||||
eq "quiet <=T_LO clamps" 20 "$(fc_curve quiet 60)"
|
||||
eq "quiet 68 -> 20" 20 "$(fc_curve quiet 68)"
|
||||
eq "quiet 70 -> 31" 31 "$(fc_curve quiet 70)"
|
||||
eq "quiet 75 -> 57" 57 "$(fc_curve quiet 75)"
|
||||
eq "quiet 80 -> 84" 84 "$(fc_curve quiet 80)"
|
||||
eq "quiet 83 -> 100" 100 "$(fc_curve quiet 83)"
|
||||
|
||||
# --- decide: asymmetric hysteresis (ramp up now, ease down only past the deadband) ---
|
||||
eq "decide uninit -> target" 68 "$(fc_decide cool 68 -1 3)"
|
||||
eq "decide ramp up now" 68 "$(fc_decide cool 68 25 3)"
|
||||
eq "decide equal holds" 62 "$(fc_decide cool 65 62 3)"
|
||||
eq "decide down held" 72 "$(fc_decide cool 68 72 3)" # curve(68)=68<72 but curve(71)=75 !<72 -> hold
|
||||
eq "decide down past" 60 "$(fc_decide cool 64 72 3)" # curve(64)=60, curve(67)=66<72 -> drop
|
||||
|
||||
# --- fc_clamp / fc_resolve: HA mode resolution ---
|
||||
eq "clamp over 100" 100 "$(fc_clamp 150)"
|
||||
eq "clamp under 0" 0 "$(fc_clamp -5)"
|
||||
eq "clamp passthrough" 45 "$(fc_clamp 45)"
|
||||
eq "resolve manual=slider" 42 "$(fc_resolve manual 64 42 cool -1 3)"
|
||||
eq "resolve manual clamped" 100 "$(fc_resolve manual 64 150 cool -1 3)"
|
||||
eq "resolve cool=cool curve" 51 "$(fc_resolve cool 60 0 cool -1 3)"
|
||||
eq "resolve quiet=quiet curve" 73 "$(fc_resolve quiet 78 0 cool -1 3)"
|
||||
eq "resolve auto+empty=cool" 51 "$(fc_resolve auto 60 0 cool -1 3)"
|
||||
eq "resolve auto+present=quiet" 31 "$(fc_resolve auto 70 0 quiet -1 3)"
|
||||
|
||||
# --- fc_fan_watts: estimated fan power from RPM (cube-law, calibrated to the sweep) ---
|
||||
eq "fan_watts 0" 0 "$(fc_fan_watts 0)"
|
||||
eq "fan_watts 4800" 2 "$(fc_fan_watts 4800)"
|
||||
eq "fan_watts 9360" 16 "$(fc_fan_watts 9360)"
|
||||
eq "fan_watts 12720" 42 "$(fc_fan_watts 12720)"
|
||||
eq "fan_watts 16920" 99 "$(fc_fan_watts 16920)"
|
||||
|
||||
# --- presence ---
|
||||
now=1000000
|
||||
eq "presence open -> quiet" quiet "$(fc_presence_mode Отворена 0 $now 900 Отворена)"
|
||||
eq "presence closed recent -> quiet" quiet "$(fc_presence_mode Затворена $((now - 100)) $now 900 Отворена)"
|
||||
eq "presence closed stale -> cool" cool "$(fc_presence_mode Затворена $((now - 1000)) $now 900 Отворена)"
|
||||
eq "presence closed edge -> cool" cool "$(fc_presence_mode Затворена $((now - 900)) $now 900 Отворена)"
|
||||
|
||||
# --- temp parsing ---
|
||||
eq "parse temp line" 74 "$(fc_parse_temp 'Temp | 0Eh | ok | 3.1 | 74 degrees C')"
|
||||
eq "parse temp 7C" 72 "$(fc_parse_temp 'Temp | 0Eh | ok | 3.1 | 72 degrees C')"
|
||||
|
||||
# --- json field (jq-free) ---
|
||||
J='{"entity_id":"sensor.garage_door_state_bg","state":"Отворена","attributes":{"friendly_name":"Garage Door State BG"},"last_changed":"2026-06-04T16:55:20.517745+00:00","last_updated":"2026-06-04T16:55:20.517745+00:00"}'
|
||||
eq "json state" "Отворена" "$(fc_json_str_field "$J" state)"
|
||||
eq "json last_changed" "2026-06-04T16:55:20.517745+00:00" "$(fc_json_str_field "$J" last_changed)"
|
||||
|
||||
# --- hex conversion ---
|
||||
eq "hex 20" 0x14 "$(fc_pct_to_hex 20)"
|
||||
eq "hex 45" 0x2d "$(fc_pct_to_hex 45)"
|
||||
eq "hex 100" 0x64 "$(fc_pct_to_hex 100)"
|
||||
eq "hex 5" 0x05 "$(fc_pct_to_hex 5)"
|
||||
|
||||
printf '\n%d passed, %d failed\n' "$pass" "$fail"
|
||||
(( fail == 0 ))
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Unit tests for the pure drift-guard functions in vault-token-renew.sh.
|
||||
# Sources the script (vtr_main is guarded) and exercises the decision logic that
|
||||
# decides whether ~/.vault-token is OUR periodic admin token (renew) or a foreign
|
||||
# token that clobbered the file (refuse, fail loud). This is exactly the logic
|
||||
# whose ABSENCE let the 2026-06-05 woodpecker-token clobber be silently renewed
|
||||
# for two days. Run: bash infra/scripts/test-vault-token-renew.sh
|
||||
set -uo pipefail
|
||||
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# shellcheck source=/dev/null
|
||||
source "$DIR/vault-token-renew.sh"
|
||||
|
||||
pass=0 fail=0
|
||||
# ok <description> <cmd...>
# Run the command; it is expected to SUCCEED (renew-OK). Counts a pass on
# success, otherwise counts a failure and prints a FAIL line.
ok() {
  local label="$1"
  shift
  if ! "$@"; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected OK, got refuse\n' "$label"
    return
  fi
  pass=$((pass + 1))
}
|
||||
# no <description> <cmd...>
# Run the command; it is expected to FAIL (drift, refuse). Counts a pass when
# it fails, otherwise counts a failure and prints a FAIL line.
no() {
  local label="$1"
  shift
  if ! "$@"; then
    pass=$((pass + 1))
    return
  fi
  fail=$((fail + 1))
  printf 'FAIL: %s — expected DRIFT, got OK\n' "$label"
}
|
||||
# eq <description> <expected> <actual>
# Count one passed check when both strings are identical; otherwise count one
# failure and print a FAIL line naming the check and both values.
eq() {
  local label="$1" want="$2" got="$3"
  if [ "$want" != "$got" ]; then
    fail=$((fail + 1))
    printf 'FAIL: %s — expected [%s] got [%s]\n' "$label" "$want" "$got"
    return
  fi
  pass=$((pass + 1))
}
|
||||
|
||||
# --- vtr_drift_ok: ONLY our periodic admin token (right name AND vault-admin) renews ---
|
||||
ok "our token renews" vtr_drift_ok token-devvm-wizard "default,sops-admin,vault-admin"
|
||||
ok "vault-admin anywhere in list" vtr_drift_ok token-devvm-wizard "default,vault-admin"
|
||||
ok "policy order irrelevant" vtr_drift_ok token-devvm-wizard "vault-admin,default"
|
||||
no "woodpecker clobber refused" vtr_drift_ok kubernetes-woodpecker-default "ci,default,terraform-state"
|
||||
no "oidc token (admin but wrong dn)" vtr_drift_ok oidc-vbarzin "default,sops-admin,vault-admin"
|
||||
no "right name, no vault-admin" vtr_drift_ok token-devvm-wizard "default,sops-admin"
|
||||
no "empty display_name" vtr_drift_ok "" "vault-admin"
|
||||
no "empty policies" vtr_drift_ok token-devvm-wizard ""
|
||||
no "no substring false-positive" vtr_drift_ok token-devvm-wizard "default,vault-admin-ro"
|
||||
|
||||
# --- vtr_display_name / vtr_policies_csv: parse real `vault token lookup -format=json` ---
|
||||
LOOKUP_OURS='{"data":{"display_name":"token-devvm-wizard","policies":["default","sops-admin","vault-admin"],"identity_policies":null}}'
|
||||
LOOKUP_OIDC='{"data":{"display_name":"oidc-vbarzin","policies":["default"],"identity_policies":["sops-admin","vault-admin"]}}'
|
||||
LOOKUP_WP='{"data":{"display_name":"kubernetes-woodpecker-default","policies":["ci","default","terraform-state"],"identity_policies":[]}}'
|
||||
eq "dn ours" "token-devvm-wizard" "$(vtr_display_name "$LOOKUP_OURS")"
|
||||
eq "dn oidc" "oidc-vbarzin" "$(vtr_display_name "$LOOKUP_OIDC")"
|
||||
eq "pols ours" "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OURS")"
|
||||
eq "pols oidc merges token+identity" "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OIDC")"
|
||||
eq "pols woodpecker" "ci,default,terraform-state" "$(vtr_policies_csv "$LOOKUP_WP")"
|
||||
|
||||
# --- parse + decide end-to-end (the real lookup-JSON -> renew/refuse path) ---
|
||||
ok "ours: parse+decide renews" vtr_drift_ok "$(vtr_display_name "$LOOKUP_OURS")" "$(vtr_policies_csv "$LOOKUP_OURS")"
|
||||
no "woodpecker: parse+decide refused" vtr_drift_ok "$(vtr_display_name "$LOOKUP_WP")" "$(vtr_policies_csv "$LOOKUP_WP")"
|
||||
no "oidc: parse+decide refused" vtr_drift_ok "$(vtr_display_name "$LOOKUP_OIDC")" "$(vtr_policies_csv "$LOOKUP_OIDC")"
|
||||
|
||||
printf '\n%d passed, %d failed\n' "$pass" "$fail"
|
||||
(( fail == 0 ))
|
||||
169
scripts/tg
169
scripts/tg
|
|
@ -1,169 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
|
||||
# Usage: scripts/tg apply --non-interactive
|
||||
# scripts/tg plan
|
||||
# Auth: `vault login -method=oidc` (token at ~/.vault-token)
|
||||
set -euo pipefail
|
||||
|
||||
# Repository root = parent of the directory this script lives in.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SYNC="$REPO_ROOT/scripts/state-sync"

# Enable provider cache (shared across stacks)
export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
mkdir -p "$TF_PLUGIN_CACHE_DIR"

# Determine stack name from cwd (relative to stacks/): the first path component
# below $REPO_ROOT/stacks. Stays empty when run from anywhere else.
STACK_NAME=""
cwd="$(pwd)"
stacks_dir="$REPO_ROOT/stacks"
if [[ "$cwd" == "$stacks_dir"/* ]]; then
  # The inner quotes make the prefix a literal string: unquoted, glob
  # characters in the repository path would be interpreted as a pattern.
  rel="${cwd#"$stacks_dir"/}"
  STACK_NAME="${rel%%/*}"
fi
|
||||
|
||||
# ── Tier detection ──
TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"

# Succeed when $1 is exactly one of the space-separated names in TIER0_STACKS.
# Pure string comparison: the earlier `grep -qx "$1"` treated the stack name
# as a regular expression (so "infr." or ".*" matched), and `grep -q` inside a
# pipeline is fragile under `pipefail`.
is_tier0() {
  local candidate="${1:-}" stack
  # Intentional word splitting: TIER0_STACKS is a space-separated list.
  for stack in $TIER0_STACKS; do
    [[ "$stack" == "$candidate" ]] && return 0
  done
  return 1
}
|
||||
|
||||
# ── Advisory lock via Vault KV ──
LOCK_MAX_AGE=1800  # 30 minutes — stale lock threshold

# Take the advisory lock for stack $1, stored at secret/locks/<stack>.
# A lock younger than LOCK_MAX_AGE seconds blocks (returns 1, message on
# stderr); an older one is considered stale and is overwritten with a warning.
# NOTE(review): read-then-write is not atomic, so two callers can still race
# between the check and the put — this remains advisory only.
acquire_lock() {
  local stack="$1"
  local holder existing locked acquired existing_holder now age
  holder="pid=$$,host=$(hostname -s),user=$(whoami)"

  # Check if lock exists and is not stale. A missing key (or any read error)
  # is treated as "no lock". Declarations are kept separate from the
  # assignments so a failing jq is not masked by `local`.
  existing=$(vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')
  locked=$(echo "$existing" | jq -r '.data.data.locked // "false"')
  acquired=$(echo "$existing" | jq -r '.data.data.acquired // "0"')
  existing_holder=$(echo "$existing" | jq -r '.data.data.holder // ""')

  if [ "$locked" = "true" ]; then
    # The timestamp comes from Vault: accept plain digits only, because shell
    # arithmetic would otherwise evaluate whatever expression is stored there.
    [[ "$acquired" =~ ^[0-9]+$ ]] || acquired=0
    now=$(date +%s)
    age=$((now - 10#$acquired))   # 10# => leading zeros are not read as octal
    if [ "$age" -lt "$LOCK_MAX_AGE" ]; then
      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)" >&2
      echo "  Wait for it to finish or run: vault kv delete secret/locks/$stack" >&2
      return 1
    fi
    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)" >&2
  fi

  vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
}
|
||||
|
||||
# Drop the advisory lock of stack $1. Best-effort: always succeeds, whether or
# not the delete worked, and prints nothing.
release_lock() {
  local stack="$1"
  if ! vault kv delete "secret/locks/$stack" >/dev/null 2>&1; then
    return 0
  fi
}
|
||||
|
||||
# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
if [ -n "$STACK_NAME" ]; then
  if is_tier0 "$STACK_NAME"; then
    # Tier 0: SOPS-encrypted local state
    if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
      "$SYNC" decrypt "$STACK_NAME"
    fi
  elif [ -z "${PG_CONN_STR:-}" ]; then
    # Tier 1: PG backend — fetch credentials from Vault (skipped when the
    # caller already exported PG_CONN_STR).
    #
    # Pre-flight: vault CLI must be available. Previously CI failed with a
    # misleading "Cannot read PG credentials" message because the Alpine CI
    # image lacked the vault binary — the real "vault: not found" error was
    # swallowed. Fail fast with a clear message instead.
    if ! command -v vault >/dev/null 2>&1; then
      echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
      exit 1
    fi
    VAULT_OUT=$(vault read -format=json database/static-creds/pg-terraform-state 2>&1) || {
      echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
      echo "$VAULT_OUT" >&2
      echo "" >&2
      echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
      exit 1
    }
    # Percent-encode both values (@uri): they are spliced into a URL, where a
    # password containing e.g. '@', ':', '/' or '%' would otherwise corrupt the
    # connection string. `// empty` turns a missing field into an empty string
    # rather than the literal text "null".
    PG_USER=$(echo "$VAULT_OUT" | jq -r '.data.username // empty | @uri')
    PG_PASS=$(echo "$VAULT_OUT" | jq -r '.data.password // empty | @uri')
    if [ -z "$PG_USER" ] || [ -z "$PG_PASS" ]; then
      echo "ERROR: Vault response for pg-terraform-state lacks username/password." >&2
      exit 1
    fi
    export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
  fi
fi
|
||||
|
||||
# Classify the requested verb(s) in a single pass over the arguments.
#
#   is_mutating — apply | destroy | import | state
#   is_tf_op    — plan | apply | destroy | refresh: anything that reads or
#                 writes infra state. The cheap pre-flight check that follows
#                 scans only the current stack's .tf files for the
#                 ingress_factory auth-comment convention; other tg verbs
#                 (init, fmt, validate) skip that check.
is_mutating=false
is_tf_op=false
for arg in "$@"; do
  case "$arg" in
    apply|destroy)
      is_mutating=true
      is_tf_op=true
      ;;
    import|state)
      is_mutating=true
      ;;
    plan|refresh)
      is_tf_op=true
      ;;
  esac
done
|
||||
|
||||
# Anti-exposure guard: every `auth = "app"` or `auth = "none"` in this stack
# must have a preceding `# auth = "<tier>":` comment documenting what gates
# the app or why the endpoint is intentionally public. See:
#   - infra/modules/kubernetes/ingress_factory/main.tf (variable description)
#   - infra/.claude/CLAUDE.md "Auth" section
# Stack-scoped: untouched stacks aren't blocked from future applies until
# they're actually edited, at which point the convention applies.
if $is_tf_op && [ -n "$STACK_NAME" ]; then
  # A non-zero exit from the checker aborts the whole run with status 1.
  "$REPO_ROOT/scripts/check-ingress-auth-comments.py" "$REPO_ROOT/stacks/$STACK_NAME" || exit 1
fi
|
||||
|
||||
# Acquire lock for mutating operations (Tier 0 only — Tier 1 uses
# pg_advisory_lock). Locking is skipped when the vault CLI is missing or
# VAULT_TOKEN is empty. The EXIT trap is installed only after acquire_lock
# has been called.
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME" \
  && command -v vault &>/dev/null && [ -n "${VAULT_TOKEN:-}" ]; then
  acquire_lock "$STACK_NAME"
  trap 'release_lock "$STACK_NAME"' EXIT
fi
|
||||
|
||||
# If running apply with --non-interactive, add -auto-approve for Terraform.
# The flag is inserted directly after each literal `apply` argument; in
# every other case the original arguments are forwarded untouched.
args=("$@")
has_apply=false
has_non_interactive=false
for arg in "${args[@]}"; do
  if [ "$arg" = "apply" ]; then
    has_apply=true
  elif [ "$arg" = "--non-interactive" ]; then
    has_non_interactive=true
  fi
done

if $has_apply && $has_non_interactive; then
  new_args=()
  for arg in "${args[@]}"; do
    case "$arg" in
      apply) new_args+=("$arg" "-auto-approve") ;;
      *) new_args+=("$arg") ;;
    esac
  done
  terragrunt "${new_args[@]}"
else
  terragrunt "$@"
fi
|
||||
|
||||
# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is
# authoritative).
#
# The diff check and the commit are both restricted to the encrypted state
# file. Without the pathspec, `git commit` would also commit anything else
# already staged in the index under the "state(...)" message.
if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
  "$SYNC" encrypt "$STACK_NAME"

  state_enc="state/stacks/$STACK_NAME/terraform.tfstate.enc"
  cd "$REPO_ROOT" || {
    echo "ERROR: cannot cd to $REPO_ROOT to commit encrypted state" >&2
    exit 1
  }

  git add -- "$state_enc"
  if ! git diff --cached --quiet -- "$state_enc"; then
    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_enc"
  fi
fi
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Flip the `"istio-injection" : "<from>"` label to `"<to>"` in every file
# under the current directory that contains it on a line without '#', run
# `terraform apply -auto-approve`, then restart the deployments of each
# affected namespace.
#
# Usage: <script> <from> <to>        e.g. <script> disabled enabled
#
# The namespace is taken from the 4th '/'-separated field of the matching
# file's path as printed by `grep -r … .`.
# Requires GNU sed (-i) and GNU xargs (-r).
set -e

from=$1
to=$2

if [ -z "$from" ] || [ -z "$to" ]; then
  echo 'pass 2 positional parameters - $from and $to'
  exit 1
fi

# Both values are spliced into grep and sed patterns below, so restrict them
# to characters that carry no regex or sed-delimiter meaning.
label_re='^[A-Za-z0-9_-]+$'
if [[ ! "$from" =~ $label_re || ! "$to" =~ $label_re ]]; then
  echo "from/to may only contain letters, digits, '_' and '-'" >&2
  exit 1
fi

# Update terraform modules.
# One entry per file (sort -u), read line-wise so paths containing spaces
# survive; lines containing '#' are dropped, as before.
mapfile -t files < <(
  grep -rni -- "\"istio-injection\" : \"$from\"" . \
    | grep -v '#' \
    | cut -d':' -f1 \
    | sort -u
)

declare -A seen_ns=()
namespaces=()
for file in "${files[@]}"; do
  echo "$file"
  sed -i "s/istio-injection\" : \"$from\"/istio-injection\" : \"$to\"/" -- "$file"

  ns=$(cut -d'/' -f4 <<<"$file")
  # Restart each namespace once, and never with an empty namespace name.
  if [[ -n "$ns" && -z "${seen_ns[$ns]:-}" ]]; then
    seen_ns[$ns]=1
    namespaces+=("$ns")
  fi
done

# Apply changes
terraform apply -auto-approve

# Restart deployments. Commands are executed directly rather than being
# assembled into strings and passed to `bash -c`.
for ns in "${namespaces[@]}"; do
  echo "restarting deployments in namespace $ns"
  kubectl -n "$ns" get deployments --no-headers \
    | awk '{print $1}' \
    | xargs -r kubectl -n "$ns" rollout restart deployment
done
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# K8s component upgrader. Run on a single node (master OR worker) at a time.
|
||||
# The caller is responsible for:
|
||||
# - draining + uncordoning the node (this script does not touch kubectl)
|
||||
# - sequencing nodes (master first, then workers one at a time)
|
||||
# - pre-flight checks (etcd snapshot, halt-on-alert, etc)
|
||||
#
|
||||
# Used by:
|
||||
# - the k8s-version-upgrade agent (infra/.claude/agents/k8s-version-upgrade.md)
|
||||
# - manual operators following the runbook (infra/docs/runbooks/k8s-version-upgrade.md)
|
||||
#
|
||||
# Old manual orchestration loop (kept for reference — the agent does the
|
||||
# equivalent now):
|
||||
# for n in $(kbn | grep 'k8s-node' | awk '{print $1}'); do
|
||||
# kb drain $n --ignore-daemonsets --delete-emptydir-data
|
||||
# s wizard@$n 'bash -s' < update_k8s.sh --role worker --release 1.34.5
|
||||
# kb uncordon $n
|
||||
# done
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
ROLE=""
|
||||
RELEASE=""
|
||||
|
||||
# Print the command-line help for this upgrader to stdout.
usage() {
  printf '%s\n' \
    "Usage: $0 --role <master|worker> --release <X.Y.Z>" \
    '' \
    '--role master|worker (required)' \
    '--release kubeadm/kubelet/kubectl target patch version, e.g. 1.34.5' \
    '' \
    'Behavior:' \
    '- Rewrites /etc/apt/sources.list.d/kubernetes.list to the v$MINOR/deb repo' \
    'derived from --release (so a 1.34.x release uses v1.34/deb, 1.35.x uses' \
    'v1.35/deb, etc).' \
    '- apt-get install kubeadm=<release>-* (apt-mark unhold first).' \
    '- master: kubeadm upgrade plan && kubeadm upgrade apply v<release> -y' \
    '- worker: kubeadm upgrade node' \
    '- apt-get install kubelet=<release>-* kubectl=<release>-* then re-hold.' \
    '- systemctl daemon-reload && systemctl restart kubelet'
}
|
||||
|
||||
# Parse flags into ROLE / RELEASE, then validate them. All usage errors exit
# with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --role|--release)
      # Under `set -u` a trailing flag with no value would otherwise die with
      # an opaque "unbound variable" error on $2.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value" >&2
        usage
        exit 2
      fi
      if [[ "$1" == "--role" ]]; then
        ROLE="$2"
      else
        RELEASE="$2"
      fi
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 2
      ;;
  esac
done

if [[ -z "$ROLE" || -z "$RELEASE" ]]; then
  echo "ERROR: --role and --release are required" >&2
  usage
  exit 2
fi

if [[ "$ROLE" != "master" && "$ROLE" != "worker" ]]; then
  echo "ERROR: --role must be 'master' or 'worker' (got: $ROLE)" >&2
  exit 2
fi

# The release is later split on '.' to derive the apt repo track and is used
# verbatim in package pins, so anything other than plain X.Y.Z (for example a
# leading "v" or a missing patch component) must be rejected before the apt
# sources file is rewritten.
if [[ ! "$RELEASE" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "ERROR: --release must look like X.Y.Z, e.g. 1.34.5 (got: $RELEASE)" >&2
  exit 2
fi
|
||||
|
||||
# Derive minor track (e.g. 1.34.5 → 1.34)
STABLE_VERSION="$(awk -F. '{print $1"."$2}' <<<"$RELEASE")"

echo "==> Upgrading $(hostname) ($ROLE) to v$RELEASE (track v$STABLE_VERSION)"

# Apt repo URL is pinned per minor track. Rewrite + re-import the signing key
# every run — cheap, idempotent, and handles the minor-bump case where the
# old track's repo no longer carries the target version.
echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$STABLE_VERSION/deb/ /" \
  | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo mkdir -p /etc/apt/keyrings
curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$STABLE_VERSION/deb/Release.key" \
  | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg --batch --yes

sudo apt-mark unhold kubeadm kubelet kubectl
# From here on the packages are unheld. If any later step fails (set -e) or
# the kubeadm retry loop gives up, re-hold them on the way out so they are
# not left open to other apt runs. Cleared again after the explicit re-hold
# on the success path.
trap 'sudo apt-mark hold kubeadm kubelet kubectl || true' EXIT
sudo apt-get update
sudo apt-get install -y "kubeadm=$RELEASE-*"

if [[ "$ROLE" == "master" ]]; then
  echo "==> Master path: kubeadm upgrade plan + apply"
  sudo kubeadm upgrade plan
  # `kubeadm upgrade apply` may fail with "static Pod hash for component <X>
  # did not change after 5m0s". Per the original notes this happens when
  # kubeadm's 5-minute wait for the kubelet to reload a static pod is too
  # tight for this cluster, and for patch upgrades where the etcd manifest
  # is unchanged, so the hash never changes.
  #
  # Strategy: the first attempt is a full upgrade (incl. etcd). On failure,
  # retry with --etcd-upgrade=false, up to 3 attempts in total with a 30s
  # pause between them. Skipping the etcd phase is noted as safe only when
  # etcd is already on the right version.
  attempt=1
  extra_flags=""
  # shellcheck disable=SC2086 — $extra_flags is intentionally unquoted so an
  # empty value expands to no argument at all.
  while ! sudo kubeadm upgrade apply "v$RELEASE" -y $extra_flags; do
    if (( attempt >= 3 )); then
      echo "ERROR: kubeadm upgrade apply failed after 3 attempts" >&2
      exit 1
    fi
    echo "==> kubeadm apply attempt $attempt failed. Retrying with --etcd-upgrade=false (etcd image is unchanged for patch upgrades; kubeadm's static-pod-hash watch is the only thing failing)."
    extra_flags="--etcd-upgrade=false"
    sleep 30
    attempt=$(( attempt + 1 ))
  done
  echo "==> kubeadm upgrade apply succeeded on attempt $attempt (flags: '$extra_flags')"
else
  echo "==> Worker path: kubeadm upgrade node"
  sudo kubeadm upgrade node
fi

sudo apt-get install -y "kubelet=$RELEASE-*" "kubectl=$RELEASE-*"
sudo apt-mark hold kubeadm kubelet kubectl
trap - EXIT

sudo systemctl daemon-reload
sudo systemctl restart kubelet

echo "==> Done: $(hostname) is on v$RELEASE"
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# OS-major upgrade (Ubuntu do-release-upgrade). NOT in the auto-upgrade
|
||||
# pipeline — minor apt patches are handled by unattended-upgrades + kured;
|
||||
# K8s component bumps are handled by the k8s-version-upgrade agent. Run this
|
||||
# script manually when bumping Ubuntu LTS major versions.
|
||||
#
|
||||
# See:
|
||||
# - infra/docs/runbooks/k8s-node-auto-upgrades.md (apt + reboot)
|
||||
# - infra/docs/runbooks/k8s-version-upgrade.md (kubeadm/kubelet/kubectl)
|
||||
|
||||
# Pre-upgrade package refresh, kept commented out:
#   sudo apt update && sudo apt autoremove -y && sudo apt upgrade -y

# Start the Ubuntu release upgrade.
sudo do-release-upgrade

# Afterwards: refresh package lists, remove orphaned packages, then apply
# remaining updates. Each step runs only if the previous one succeeded.
sudo apt update \
  && sudo apt autoremove -y \
  && sudo apt upgrade -y
|
||||
|
|
@ -1,619 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# upgrade_state.sh — survey the three autonomous-upgrade pipelines.
|
||||
#
|
||||
# Companion to cluster_healthcheck.sh, surfaced via the /upgrade-state skill.
|
||||
# Read-only by design — no --fix.
|
||||
#
|
||||
# The three pipelines:
|
||||
# 1. Apps — Keel polls registries hourly and rolls Deployments tagged
|
||||
# keel.sh/policy. Metrics on container :9300/metrics.
|
||||
# 2. OS — unattended-upgrades patches in-release per node; kured
|
||||
# reboots within a daily 02:00-06:00 London window.
|
||||
# 3. K8s — k8s-version-check CronJob (daily 12:00 UTC) detects new
|
||||
# kubeadm patch/minor releases; Job-chain drains+upgrades
|
||||
# node-by-node. Pushgateway holds k8s_upgrade_* gauges.
|
||||
#
|
||||
# Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# --- Colors ---
# Kept as literal backslash escapes; they are interpreted later by
# `echo -e` / printf format strings.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m'

# --- Globals ---
JSON=false
KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
if [[ ! -f "$KUBECONFIG_PATH" ]]; then
  # Fall back to the repo-local kubeconfig when the usual one is absent.
  KUBECONFIG_PATH="/home/wizard/code/infra/config"
fi
KUBECTL=""
# name:ip pairs for the SSH fan-out.
NODES=(
  k8s-master:10.0.20.100
  k8s-node1:10.0.20.101
  k8s-node2:10.0.20.102
  k8s-node3:10.0.20.103
  k8s-node4:10.0.20.104
)
SSH_OPTS=(
  -o BatchMode=yes
  -o ConnectTimeout=5
  -o StrictHostKeyChecking=no
)
NOW_EPOCH=$(date -u +%s)
HIGHEST_EXIT=0 # 0 healthy, 1 attention, 2 stalled

# Results — collectors fill these.
APPS_STATUS_ICON=""
APPS_STATUS_TEXT=""
APPS_LAST_CHECK=""
APPS_NEXT=""
APPS_NOTES=""
APPS_ENROLLED=0
APPS_PENDING=0
APPS_UPDATES_LINE=""
APPS_ERROR_LINE=""

OS_STATUS_ICON=""
OS_STATUS_TEXT=""
OS_LAST_CHECK=""
OS_NEXT=""
OS_NOTES=""
OS_DISTRO_SUMMARY=""
OS_KERNEL_SUMMARY=""
OS_PENDING_REBOOT_NODES=""
OS_HELD_DETAIL=""
OS_LAST_UU=""
OS_LAST_KURED=""

K8S_STATUS_ICON=""
K8S_STATUS_TEXT=""
K8S_LAST_CHECK=""
K8S_NEXT=""
K8S_NOTES=""
K8S_RUNNING=""
K8S_PATCH=""
K8S_MINOR=""
K8S_LAST_DETECT_LINE=""
K8S_IN_FLIGHT="no"
K8S_LAST_CHAIN=""
|
||||
|
||||
# --- Helpers ---
|
||||
# Print a message with `echo -e` semantics, or nothing at all when JSON
# output mode is active. Always returns 0 in JSON mode.
log() {
  if [[ "$JSON" == true ]]; then
    return 0
  fi
  echo -e "$*"
}
|
||||
|
||||
# Ratchet the script's final exit code upwards.
# Arguments: $1 - candidate severity (0 healthy, 1 attention, 2 stalled)
# Globals:   HIGHEST_EXIT is raised to $1 when $1 is larger; never lowered.
# Returns:   always 0.
raise_exit() {
  local level="$1"
  if (( level > HIGHEST_EXIT )); then
    HIGHEST_EXIT="$level"
  fi
  return 0
}
|
||||
|
||||
# Print the command-line help to stdout.
usage() {
  printf '%s\n' \
    "Usage: $0 [--json] [--kubeconfig <path>]" \
    '' \
    'Read-only audit of the three autonomous-upgrade pipelines (apps, OS, k8s).' \
    '' \
    '--json machine-readable JSON' \
    '--kubeconfig PATH override kubeconfig' \
    '' \
    'Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.'
}
|
||||
|
||||
# Parse command-line flags.
# Arguments: the script's "$@"
# Globals:   sets JSON (--json), KUBECONFIG_PATH (--kubeconfig PATH) and
#            finally KUBECTL, the kubectl command prefix used by collectors.
# Exits:     0 after printing help (-h/--help); 1 on an unknown option or
#            when --kubeconfig is given without a value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --json)
        JSON=true
        shift
        ;;
      --kubeconfig)
        # Without this check a trailing `--kubeconfig` dies under `set -u`
        # with an opaque "unbound variable" error on $2.
        if [[ $# -lt 2 || -z "${2:-}" ]]; then
          echo "Option --kubeconfig requires a path argument" >&2
          exit 1
        fi
        KUBECONFIG_PATH="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  # Kept as a plain string because callers expand it unquoted ($KUBECTL …).
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}
|
||||
|
||||
# Prometheus query — Prometheus + reload + backup share a network namespace,
# so reaching localhost:9090 works from any of the three sidecars.
#
# Arguments: $1 - PromQL expression (raw, unencoded)
# Outputs:   whatever the in-pod wget prints (the query API response);
#            nothing on failure.
# Returns:   always 0 — failures are swallowed and callers parse defensively.
#
# The expression is percent-encoded before it is placed in the URL. PromQL
# routinely contains spaces, braces, quotes and commas; sent raw, those can
# produce a malformed request line.
prom_q() {
  local q="$1" encoded="" ch i
  # Byte-wise iteration so any multi-byte character is encoded per byte.
  local LC_ALL=C
  for (( i = 0; i < ${#q}; i++ )); do
    ch="${q:i:1}"
    case "$ch" in
      [a-zA-Z0-9.~_-])
        encoded+="$ch"
        ;;
      *)
        # "'c" makes printf emit the character's numeric value.
        printf -v ch '%%%02X' "'$ch"
        encoded+="$ch"
        ;;
    esac
  done
  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "http://localhost:9090/api/v1/query?query=${encoded}" 2>/dev/null || true
}
|
||||
|
||||
# Dump the Pushgateway's /metrics page by running wget inside the
# prometheus-server container.
# Outputs: the raw metrics text; nothing on failure.
# Returns: always 0 — errors are suppressed.
pg_metrics() {
  local url="http://prometheus-prometheus-pushgateway:9091/metrics"
  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "$url" 2>/dev/null || true
}
|
||||
|
||||
# Run a command on a node over SSH as user `wizard`.
# Arguments: $1 - node IP; remaining args - the remote command
# Globals:   SSH_OPTS (read)
# Outputs:   the remote command's stdout; stderr is discarded.
# Returns:   always 0 — an unreachable node simply yields empty output.
ssh_node() {
  local target="wizard@$1"
  shift
  ssh "${SSH_OPTS[@]}" "$target" "$@" 2>/dev/null || true
}
|
||||
|
||||
# Render an age in seconds as a coarse "<n><unit> ago" string.
# Arguments: $1 - age in whole seconds
# Outputs:   e.g. "42s ago", "5m ago", "3h ago", "2d ago" (no trailing
#            newline); the value is truncated to the largest fitting unit.
human_age() {
  local secs="$1"
  if (( secs >= 86400 )); then
    printf '%dd ago' $(( secs / 86400 ))
  elif (( secs >= 3600 )); then
    printf '%dh ago' $(( secs / 3600 ))
  elif (( secs >= 60 )); then
    printf '%dm ago' $(( secs / 60 ))
  else
    printf '%ds ago' "$secs"
  fi
}
|
||||
|
||||
# Pushgateway emits floats and scientific notation — coerce to integer
# epoch seconds. Returns 0 if the input is empty / zero / unparseable.
#
# Arguments: $1 - numeric string such as "1.779052159e+09" (optional)
# Outputs:   the value truncated to an integer, or 0
to_epoch_int() {
  local raw="${1:-}"
  case "$raw" in
    ''|0)
      echo 0
      return
      ;;
  esac
  # python3 performs the float → int conversion; any failure (bad input or
  # python3 unavailable) falls back to 0.
  python3 -c "import sys; v=sys.argv[1]; print(int(float(v)))" "$raw" 2>/dev/null || echo 0
}
|
||||
|
||||
# --- 1. Apps (Keel) ---
|
||||
# Survey the Keel app-upgrade pipeline and fill the APPS_* result globals.
#
# Globals read:    KUBECTL
# Globals written: APPS_ENROLLED APPS_PENDING APPS_LAST_CHECK
#                  APPS_UPDATES_LINE APPS_ERROR_LINE APPS_STATUS_ICON
#                  APPS_STATUS_TEXT APPS_NOTES APPS_NEXT
# Side effects:    calls raise_exit 2 when no Keel pod is Running, and
#                  raise_exit 1 when approvals are pending or unsuppressed
#                  errors were logged in the last 24h.
collect_apps() {
  local pending tracked enrolled updates_24h errors
  local error_count=0

  # Shared parser for single-value Prometheus query results: prints the first
  # result as an integer, or 0 when there is no result or it cannot be parsed.
  local scalar_py='
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(int(float(r[0]["value"][1])) if r else 0)
except Exception:
    print(0)
'

  # Enrolled: count Deployments with keel.sh/policy != never (Keel itself
  # is policy=never). The Kyverno auto-injection labels namespaces
  # keel.sh/enrolled=true, but the annotation is what Keel watches.
  enrolled=$($KUBECTL get deploy -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
n = sum(1 for d in data["items"]
        if (d["metadata"].get("annotations") or {}).get("keel.sh/policy", "never") != "never")
print(n)
' 2>/dev/null || echo 0)
  APPS_ENROLLED="$enrolled"

  # Pending approvals (sum across Keel pods).
  pending=$(prom_q 'sum(pending_approvals)' | python3 -c "$scalar_py" 2>/dev/null || echo 0)
  APPS_PENDING="$pending"

  # Tracked images — proxy for "is the scrape live?".
  tracked=$(prom_q 'count(count by (image) (registries_scanned_total))' \
    | python3 -c "$scalar_py" 2>/dev/null || echo 0)

  # Last scrape age — `up{job="kubernetes-pods", app="keel"}` is 1 if the
  # most recent scrape succeeded. We surface the wallclock age via a tiny
  # `time() - timestamp(up{...})` query.
  APPS_LAST_CHECK=$(prom_q 'time()-timestamp(up{job="kubernetes-pods",app="keel"})' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    if not r:
        print("scrape not live")
    else:
        secs = int(float(r[0]["value"][1]))
        if secs < 60:
            print(f"{secs}s ago")
        elif secs < 3600:
            print(f"{secs//60}m ago")
        else:
            print(f"{secs//3600}h ago")
except Exception:
    print("?")
' 2>/dev/null || echo "?")

  # Recent updates: count lines in Keel logs that report a successful
  # rollout. Keel logs an "update completed" message per rollout.
  local log_24h
  log_24h=$($KUBECTL -n keel logs deploy/keel --since=24h --tail=2000 2>/dev/null || true)
  # grep -c prints 0 but exits 1 on no match, hence the `|| true`.
  updates_24h=$(echo "$log_24h" | grep -cE 'update completed|successfully updated|deployment updated' 2>/dev/null || true)
  if [[ -z "$updates_24h" ]]; then
    updates_24h=0
  fi
  APPS_UPDATES_LINE="$updates_24h in last 24h (tracked images: $tracked)"

  # Known-benign Keel error patterns to suppress. Each is a real error
  # line Keel emits, but the surrounding behaviour is fine, so flagging
  # them in /upgrade-state is just noise.
  #   - `bot.Run(): can not get configuration for bot [slack]` — Keel
  #     1.2.0 registers a Slack socket-mode bot whenever SLACK_BOT_TOKEN
  #     is set, then fails because we don't supply an `xapp-` app-level
  #     token. We don't want the interactive bot (no approvals; opt-out
  #     auto-update). The Slack NOTIFICATION sender works independently
  #     of the bot, so rollout messages still post to #general.
  #   - `failed to check digest` with a transient network error —
  #     Keel polls ~175 image manifests against public registries
  #     hourly. Occasional `i/o timeout` / `connection refused` /
  #     `TLS handshake timeout` / `no such host` / `EOF` /
  #     `context deadline exceeded` are inherent to public-internet
  #     polling at that scale and auto-recover on the next poll.
  #     Actionable digest-check failures surface as HTTP 401/404
  #     (auth, removed-tag) — those are NOT filtered.
  #   - `failed to check digest` with HTTP 5xx — upstream registry
  #     having a problem (DockerHub maintenance, Forgejo restart,
  #     etc.). Same recovery pattern as network errors: next hourly
  #     poll succeeds once upstream is back. Persistent 5xx for >24h
  #     would indicate a real registry-side issue, but that surfaces
  #     via the registry's own monitoring (e.g. forgejo-integrity-probe
  #     + RegistryCatalogInaccessible), not via Keel logs.
  local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]'
  benign_re+='|SLACK_APP_TOKEN must have the (previf|prefix)'
  benign_re+='|failed to check digest.*(i/o timeout|connection refused|connection reset|context deadline exceeded|TLS handshake timeout|no such host|: EOF)'
  benign_re+='|failed to check digest.*non-successful response \(status=5[0-9][0-9]'
  # Only the newest three unsuppressed error lines are kept.
  errors=$(echo "$log_24h" | grep -iE '"level":"(error|fatal)"|level=error' | grep -vE "$benign_re" | tail -3 || true)

  # Count only when there is something to count: `echo "" | wc -l` is 1, which
  # would report "1 recent error(s)" when only approvals are pending.
  if [[ -n "$errors" ]]; then
    error_count=$(echo "$errors" | wc -l | tr -d ' ')
  fi

  if [[ -z "$errors" ]]; then
    APPS_ERROR_LINE="(none in last 24h)"
  else
    APPS_ERROR_LINE="$error_count error(s); newest: $(echo "$errors" | tail -1 | cut -c1-120)"
  fi

  # Keel pod state.
  local pod_status
  pod_status=$($KUBECTL -n keel get pods -l app=keel -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)

  if [[ "$pod_status" != *"Running"* ]]; then
    APPS_STATUS_ICON="✗"
    APPS_STATUS_TEXT="down"
    APPS_NOTES="Keel pod not Running ($pod_status)"
    raise_exit 2
  elif [[ "$pending" -gt 0 || -n "$errors" ]]; then
    APPS_STATUS_ICON="⚠"
    APPS_STATUS_TEXT="attn"
    APPS_NOTES="$enrolled enrolled; $pending pending; $error_count recent error(s)"
    raise_exit 1
  else
    APPS_STATUS_ICON="✓"
    APPS_STATUS_TEXT="healthy"
    APPS_NOTES="$enrolled enrolled, 0 pending, 0 errors"
  fi

  APPS_NEXT="rolling, hourly poll"
}
|
||||
|
||||
# --- 2. OS (apt + kured) ---
|
||||
# Survey the OS patching pipeline (apt + kured) and fill the OS_* globals.
#
# Globals read:    KUBECTL NODES NOW_EPOCH
# Globals written: OS_DISTRO_SUMMARY OS_KERNEL_SUMMARY OS_PENDING_REBOOT_NODES
#                  OS_HELD_DETAIL OS_LAST_UU OS_LAST_CHECK OS_LAST_KURED
#                  OS_NEXT OS_STATUS_ICON OS_STATUS_TEXT OS_NOTES
# Side effects:    SSHes to every entry in NODES in parallel (via ssh_node);
#                  calls raise_exit 2 when a kured pod is not Running and
#                  raise_exit 1 when held bumps or pending reboots exist.
collect_os() {
  local distros kernels distro_uniq kernel_uniq
  distros=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.osImage}{"\n"}{end}' 2>/dev/null)
  kernels=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' 2>/dev/null)
  distro_uniq=$(echo "$distros" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  kernel_uniq=$(echo "$kernels" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  OS_DISTRO_SUMMARY="$distro_uniq"
  OS_KERNEL_SUMMARY="$kernel_uniq"

  # SSH fan-out — parallel background subshells, write per-node results to tmp files.
  local tmpdir
  tmpdir=$(mktemp -d)
  # Clean up when this function returns, then clear the trap so it cannot
  # fire again later with $tmpdir out of scope.
  trap 'rm -rf -- "$tmpdir"; trap - RETURN' RETURN

  local entry name ip
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"
    ip="${entry##*:}"
    (
      local reboot held upgradable uu_log
      reboot=$(ssh_node "$ip" 'test -f /var/run/reboot-required && echo yes || echo no')
      held=$(ssh_node "$ip" 'apt-mark showhold 2>/dev/null')
      upgradable=$(ssh_node "$ip" 'apt list --upgradable 2>/dev/null | tail -n +2')
      uu_log=$(ssh_node "$ip" 'tail -1 /var/log/unattended-upgrades/unattended-upgrades.log 2>/dev/null')
      # Simple per-node record: key=value lines plus two multi-line sections
      # delimited by "<name><<<EOF" … "EOF".
      printf 'reboot=%s\n' "$reboot" > "$tmpdir/$name"
      printf 'held<<<EOF\n%s\nEOF\n' "$held" >> "$tmpdir/$name"
      printf 'upgradable<<<EOF\n%s\nEOF\n' "$upgradable" >> "$tmpdir/$name"
      printf 'uu_log=%s\n' "$uu_log" >> "$tmpdir/$name"
    ) &
  done
  wait

  # Aggregate.
  local pending_reboots=() held_with_bumps_lines=() newest_uu_ts=0 newest_uu_iso=""
  local reboot held upgradable uu_log uu_ts
  local pkg from to bump line epoch
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"
    [[ -f "$tmpdir/$name" ]] || continue
    reboot=$(awk -F= '/^reboot=/{print $2}' "$tmpdir/$name")
    held=$(awk '/^held<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    upgradable=$(awk '/^upgradable<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
    uu_log=$(awk -F= '/^uu_log=/{sub(/^uu_log=/,""); print}' "$tmpdir/$name")

    if [[ "$reboot" == "yes" ]]; then
      pending_reboots+=("$name")
    fi

    # Held + upgradable, excluding k8s components (managed by k8s pipeline).
    while IFS= read -r line; do
      [[ -z "$line" ]] && continue
      pkg=$(echo "$line" | awk -F/ '{print $1}')
      # Skip k8s and kernel/linux-image — the chain handles those.
      case "$pkg" in
        kubeadm|kubectl|kubelet) continue ;;
        linux-image-*|linux-headers-*|linux-modules-*|linux-generic|linux-headers-generic|linux-image-generic) continue ;;
      esac
      # Only flag if the package is held.
      if echo "$held" | grep -qx "$pkg"; then
        to=$(echo "$line" | awk '{print $2}')
        # Stop at ']' as well as space so the closing bracket of
        # "[upgradable from: X]" is never captured as part of the version.
        from=$(echo "$line" | sed -n 's/.*from: \([^] ]*\).*/\1/p')
        bump="$pkg ${from%-*}→${to%-*}"
        held_with_bumps_lines+=("$name: $bump")
      fi
    done <<<"$upgradable"

    # Newest uu timestamp (ISO at start of log line).
    uu_ts=$(echo "$uu_log" | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}).*/\1/')
    if [[ -n "$uu_ts" ]]; then
      # NOTE(review): the log timestamp is parsed as UTC here; confirm the
      # nodes actually log in UTC.
      epoch=$(date -u -d "$uu_ts" +%s 2>/dev/null || echo 0)
      if [[ "$epoch" -gt "$newest_uu_ts" ]]; then
        newest_uu_ts="$epoch"
        newest_uu_iso="$uu_ts"
      fi
    fi
  done

  OS_PENDING_REBOOT_NODES="${pending_reboots[*]:-}"
  if [[ ${#held_with_bumps_lines[@]} -gt 0 ]]; then
    # Join with "; ". `paste -sd '; '` is not usable here: a multi-character
    # delimiter list is cycled one character at a time, giving "a;b c;d".
    OS_HELD_DETAIL=$(printf '%s\n' "${held_with_bumps_lines[@]}" | sort -u \
      | awk 'NR > 1 { printf "; " } { printf "%s", $0 }')
  fi

  if [[ "$newest_uu_ts" -gt 0 ]]; then
    local age=$((NOW_EPOCH - newest_uu_ts))
    OS_LAST_UU="$newest_uu_iso UTC ($(human_age "$age"))"
    OS_LAST_CHECK="$(human_age "$age") (uu daily)"
  else
    OS_LAST_UU="(no uu log accessible)"
    OS_LAST_CHECK="?"
  fi

  # Last kured reboot — newest Ready-condition transition across all nodes
  # returned by kubectl. `Ready -> True` is what kured causes when the node
  # returns; we surface the most recent timestamp and the node it belongs to.
  # lastTransitionTime is UTC ("Z" suffix), so it is parsed as UTC explicitly;
  # a naive datetime would be interpreted in the local zone and skew the age
  # by the local UTC offset.
  local kured_raw kured_iso kured_node kured_ep kured_age
  kured_raw=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone
data = json.load(sys.stdin)
best = (0, "", "")
for n in data["items"]:
    name = n["metadata"]["name"]
    for c in n["status"].get("conditions", []):
        if c["type"] == "Ready":
            dt = datetime.strptime(c["lastTransitionTime"], "%Y-%m-%dT%H:%M:%SZ")
            ep = int(dt.replace(tzinfo=timezone.utc).timestamp())
            if ep > best[0]:
                best = (ep, name, c["lastTransitionTime"])
print(f"{best[0]}|{best[1]}|{best[2]}")
' 2>/dev/null || echo "0||")
  kured_ep="${kured_raw%%|*}"
  kured_node=$(echo "$kured_raw" | cut -d'|' -f2)
  kured_iso=$(echo "$kured_raw" | cut -d'|' -f3)
  if [[ "$kured_ep" -gt 0 ]]; then
    kured_age=$((NOW_EPOCH - kured_ep))
    OS_LAST_KURED="$kured_iso ($kured_node, $(human_age "$kured_age"))"
  else
    OS_LAST_KURED="?"
  fi

  OS_NEXT="daily 02:00-06:00 London"

  # Kured pod health: count phase lines that are not exactly "Running".
  local kured_pods kured_unhealthy
  kured_pods=$($KUBECTL -n kured get pods -l app.kubernetes.io/name=kured -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null)
  kured_unhealthy=$(echo "$kured_pods" | grep -cv '^Running$' 2>/dev/null || true)

  local notes=()
  if [[ -n "$OS_HELD_DETAIL" ]]; then
    notes+=("held with bumps: $OS_HELD_DETAIL")
  fi
  if [[ -n "$OS_PENDING_REBOOT_NODES" ]]; then
    notes+=("pending reboot: $OS_PENDING_REBOOT_NODES")
  fi

  if [[ "$kured_unhealthy" -gt 0 ]]; then
    OS_STATUS_ICON="✗"
    OS_STATUS_TEXT="kured down"
    OS_NOTES="kured pods not all Running"
    raise_exit 2
  elif [[ ${#notes[@]} -gt 0 ]]; then
    OS_STATUS_ICON="⚠"
    OS_STATUS_TEXT="attn"
    OS_NOTES="${notes[*]}"
    raise_exit 1
  else
    OS_STATUS_ICON="✓"
    OS_STATUS_TEXT="healthy"
    OS_NOTES="distros uniform; no held bumps; no pending reboots"
  fi
}
|
||||
|
||||
# --- 3. K8s (kubeadm/kubelet/kubectl) ---
|
||||
# Survey the Kubernetes version-upgrade pipeline and fill the K8S_* globals.
#
# Globals read:    KUBECTL NOW_EPOCH
# Globals written: K8S_RUNNING K8S_PATCH K8S_MINOR K8S_LAST_CHECK
#                  K8S_LAST_DETECT_LINE K8S_IN_FLIGHT K8S_LAST_CHAIN K8S_NEXT
#                  K8S_STATUS_ICON K8S_STATUS_TEXT K8S_NOTES
# Side effects:    raise_exit 2 when a chain has been in flight for more than
#                  5400s or the last detection is older than 9 days;
#                  raise_exit 1 when a chain is running or an upgrade is
#                  available.
collect_k8s() {
  local kver_list kver_uniq metrics target_patch target_minor last_run in_flight started

  kver_list=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' 2>/dev/null)
  kver_uniq=$(echo "$kver_list" | sort -u)
  local n_uniq node_count
  n_uniq=$(echo "$kver_uniq" | wc -l | tr -d ' ')
  node_count=$(echo "$kver_list" | wc -l | tr -d ' ')
  if [[ "$n_uniq" -eq 1 ]]; then
    K8S_RUNNING="$kver_uniq across $node_count/$node_count nodes"
  else
    # Join with ", ". `paste -sd', '` cycles through ',' and ' ' as separate
    # delimiters ("a,b c"), so join on ',' first and widen afterwards.
    K8S_RUNNING="mixed: $(echo "$kver_uniq" | paste -sd, - | sed 's/,/, /g')"
  fi
  local running_ver
  running_ver=$(echo "$kver_uniq" | head -1)

  metrics=$(pg_metrics)
  # All five may legitimately be absent (cluster never ran the upgrade
  # chain, kind="minor" not detected, etc.) — `|| true` keeps pipefail
  # from killing the script on no-match.
  target_patch=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="patch"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  target_minor=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="minor"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
  # Pushgateway emits these with `{instance="",job="..."}` labels — the
  # `awk '$1 ~ /^name(\{|$)/'` form matches both bare and labelled metrics.
  last_run=$(echo "$metrics" | awk '$1 ~ /^k8s_version_check_last_run_timestamp(\{|$)/{print $2}' | head -1 || true)
  in_flight=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_in_flight(\{|$)/{print $2}' | head -1 || true)
  started=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_started_timestamp(\{|$)/{print $2}' | head -1 || true)

  # Pushgateway timestamps come back in scientific notation
  # (e.g. 1.779052159e+09) — convert to plain integer seconds.
  local last_run_int started_int
  last_run_int=$(to_epoch_int "$last_run")
  started_int=$(to_epoch_int "$started")

  if [[ "$last_run_int" -gt 0 ]]; then
    local age=$((NOW_EPOCH - last_run_int))
    K8S_LAST_CHECK="$(human_age "$age") (daily cron)"
    if [[ -n "$target_patch" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_patch (patch)"
    elif [[ -n "$target_minor" ]]; then
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_minor (minor)"
    else
      K8S_LAST_DETECT_LINE="last run $(human_age "$age"): no upgrade available"
    fi
  else
    K8S_LAST_CHECK="(metric missing)"
    K8S_LAST_DETECT_LINE="(no k8s_version_check_last_run_timestamp in Pushgateway)"
  fi
  K8S_PATCH="${target_patch:-none}"
  K8S_MINOR="${target_minor:-none}"

  # In-flight / last chain.
  if [[ "${in_flight:-0}" == "1" ]]; then
    K8S_IN_FLIGHT="yes"
    local since=0
    if [[ "$started_int" -gt 0 ]]; then
      since=$((NOW_EPOCH - started_int))
    fi
    K8S_LAST_CHAIN="in-flight (started $(human_age "$since"))"
  else
    K8S_IN_FLIGHT="no"
    if [[ "$started_int" -gt 0 ]]; then
      local age=$((NOW_EPOCH - started_int))
      K8S_LAST_CHAIN="$(human_age "$age")"
    else
      K8S_LAST_CHAIN="never (or zeroed)"
    fi
  fi

  K8S_NEXT="$(next_daily_noon_utc)"

  # Status logic.
  local stalled=0
  if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then
    # K8sUpgradeStalled fires after 5400s (90m) per monitoring stack.
    local since=$((NOW_EPOCH - started_int))
    if [[ "$since" -gt 5400 ]]; then
      stalled=1
    fi
  fi
  # A missing last-run metric counts as "infinitely old".
  local last_run_age=999999999
  if [[ "$last_run_int" -gt 0 ]]; then
    last_run_age=$((NOW_EPOCH - last_run_int))
  fi

  if [[ "$stalled" == "1" ]]; then
    K8S_STATUS_ICON="✗"
    K8S_STATUS_TEXT="stalled"
    K8S_NOTES="K8sUpgradeStalled would fire — chain in-flight >90m"
    raise_exit 2
  elif [[ "$last_run_age" -gt $((9*86400)) ]]; then
    K8S_STATUS_ICON="✗"
    K8S_STATUS_TEXT="detection stale"
    K8S_NOTES="last detection >9d ago"
    raise_exit 2
  elif [[ "${in_flight:-0}" == "1" ]]; then
    K8S_STATUS_ICON="…"
    K8S_STATUS_TEXT="in-flight"
    K8S_NOTES="upgrade chain running"
    raise_exit 1
  elif [[ -n "$target_patch" ]]; then
    K8S_STATUS_ICON="→"
    K8S_STATUS_TEXT="$target_patch"
    K8S_NOTES="running $running_ver → v$target_patch (patch) available"
    raise_exit 1
  elif [[ -n "$target_minor" ]]; then
    K8S_STATUS_ICON="→"
    K8S_STATUS_TEXT="$target_minor"
    K8S_NOTES="running $running_ver → v$target_minor (minor) available"
    raise_exit 1
  else
    K8S_STATUS_ICON="✓"
    K8S_STATUS_TEXT="current"
    K8S_NOTES="running $running_ver, nothing newer"
  fi
}
|
||||
|
||||
# Next daily 12:00 UTC — pure bash date math, no croniter. Schedule was
# weekly Sunday until 2026-05-18; now `0 12 * * *` in the
# k8s-version-upgrade stack. If we're still before today's 12:00 UTC,
# the next run is today; otherwise it's tomorrow.
#
# Outputs: e.g. "Mon 2026-05-18 12:00 UTC" (uses GNU `date -d`).
next_daily_noon_utc() {
  local hr days_ahead
  hr=$(date -u +%H)
  # `date +%H` is zero-padded. Force base 10: bash arithmetic treats a
  # leading 0 as octal, so "08" and "09" would otherwise be rejected as
  # invalid numbers and fall through to the "tomorrow" branch.
  if (( 10#$hr < 12 )); then
    days_ahead=0
  else
    days_ahead=1
  fi
  date -u -d "+$days_ahead days" +"%a %Y-%m-%d 12:00 UTC"
}
|
||||
|
||||
# --- Renderers ---
# The table uses `column -t` so we don't have to compute visual widths
# manually (the status icons are multi-byte UTF-8 and ANSI escapes don't
# play nice with `printf %-Xs`). Trade-off: no in-cell colour, but the
# icon character already carries the signal.
#
# render_table prints the human-readable report to stdout: a summary table
# followed by one detail section per pipeline, built from the APPS_*, OS_*
# and K8S_* globals.
render_table() {
  printf "\n${BOLD}Upgrade state — %s${NC}\n\n" "$(date -u +'%Y-%m-%d %H:%M UTC')"
  {
    printf '%s\n' \
      "Layer|Status|Last check|Next upgrade|Notes" \
      "-----|------|----------|------------|-----"
    # One format, reused per row: six arguments each.
    printf '%s|%s %s|%s|%s|%s\n' \
      "Apps" "$APPS_STATUS_ICON" "$APPS_STATUS_TEXT" "$APPS_LAST_CHECK" "$APPS_NEXT" "$APPS_NOTES" \
      "OS " "$OS_STATUS_ICON" "$OS_STATUS_TEXT" "$OS_LAST_CHECK" "$OS_NEXT" "$OS_NOTES" \
      "K8s " "$K8S_STATUS_ICON" "$K8S_STATUS_TEXT" "$K8S_LAST_CHECK" "$K8S_NEXT" "$K8S_NOTES"
  } | column -t -s '|' -o ' | '

  printf "\n${BOLD}--- Apps (Keel) ---${NC}\n"
  printf '%s\n' \
    "Enrolled deployments: $APPS_ENROLLED" \
    "Recent rollouts: $APPS_UPDATES_LINE" \
    "Pending approvals: $APPS_PENDING" \
    "Last Keel error: $APPS_ERROR_LINE"

  printf "\n${BOLD}--- OS (apt + kured) ---${NC}\n"
  printf '%s\n' \
    "Ubuntu per node: $OS_DISTRO_SUMMARY" \
    "Kernel per node: $OS_KERNEL_SUMMARY" \
    "Pending reboot: ${OS_PENDING_REBOOT_NODES:-none}" \
    "Held packages with upstream bumps: ${OS_HELD_DETAIL:-none (excluding k8s components)}" \
    "Last uu run (newest across nodes): $OS_LAST_UU" \
    "Last kured reboot (newest Ready transition): $OS_LAST_KURED" \
    "Next kured window: $OS_NEXT"

  printf "\n${BOLD}--- K8s (kubeadm/kubelet/kubectl) ---${NC}\n"
  printf '%s\n' \
    "Running: $K8S_RUNNING" \
    "Latest patch (apt): ${K8S_PATCH}" \
    "Next minor available: ${K8S_MINOR}" \
    "Detection: $K8S_LAST_DETECT_LINE" \
    "In-flight: $K8S_IN_FLIGHT | Last chain start: $K8S_LAST_CHAIN" \
    "Next detection: $K8S_NEXT"
  printf '\n'
}
|
||||
|
||||
# Emit the whole report as a single JSON document on stdout.
#
# Values are handed to Python through the environment rather than being
# spliced into the program text, so embedded quotes/backslashes in error
# lines need no escaping. The exports happen inside a subshell and therefore
# do not leak into the rest of the script.
render_json() {
  (
    export APPS_STATUS_ICON APPS_STATUS_TEXT APPS_LAST_CHECK APPS_NEXT APPS_NOTES
    export APPS_ENROLLED APPS_PENDING APPS_UPDATES_LINE APPS_ERROR_LINE
    export OS_STATUS_ICON OS_STATUS_TEXT OS_LAST_CHECK OS_NEXT OS_NOTES
    export OS_DISTRO_SUMMARY OS_KERNEL_SUMMARY OS_PENDING_REBOOT_NODES OS_HELD_DETAIL
    export OS_LAST_UU OS_LAST_KURED
    export K8S_STATUS_ICON K8S_STATUS_TEXT K8S_LAST_CHECK K8S_NEXT K8S_NOTES
    export K8S_RUNNING K8S_PATCH K8S_MINOR
    export K8S_LAST_DETECT_LINE K8S_IN_FLIGHT K8S_LAST_CHAIN
    export HIGHEST_EXIT
    python3 -c '
import json, os
from datetime import datetime, timezone


def env(key):
    return os.environ.get(key, "")


def common(prefix):
    # The five fields every layer shares, in output order.
    return {
        "status": env(prefix + "_STATUS_ICON"),
        "status_text": env(prefix + "_STATUS_TEXT"),
        "last_check": env(prefix + "_LAST_CHECK"),
        "next_upgrade": env(prefix + "_NEXT"),
        "notes": env(prefix + "_NOTES"),
    }


apps = common("APPS")
apps.update({
    "enrolled": int(env("APPS_ENROLLED") or 0),
    "pending_approvals": int(env("APPS_PENDING") or 0),
    "updates_line": env("APPS_UPDATES_LINE"),
    "errors_line": env("APPS_ERROR_LINE"),
})

os_layer = common("OS")
os_layer.update({
    "distros": env("OS_DISTRO_SUMMARY"),
    "kernels": env("OS_KERNEL_SUMMARY"),
    "pending_reboot_nodes": env("OS_PENDING_REBOOT_NODES"),
    "held_with_bumps": env("OS_HELD_DETAIL"),
    "last_uu_run": env("OS_LAST_UU"),
    "last_kured_reboot": env("OS_LAST_KURED"),
})

k8s = common("K8S")
k8s.update({
    "running": env("K8S_RUNNING"),
    "patch_target": env("K8S_PATCH"),
    "minor_target": env("K8S_MINOR"),
    "last_detection_line": env("K8S_LAST_DETECT_LINE"),
    "in_flight": env("K8S_IN_FLIGHT"),
    "last_chain": env("K8S_LAST_CHAIN"),
})

out = {
    "as_of_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "highest_exit": int(env("HIGHEST_EXIT")),
    "apps": apps,
    "os": os_layer,
    "k8s": k8s,
}
print(json.dumps(out, indent=2))
'
  )
}
|
||||
|
||||
# Entry point: parse the CLI arguments, run the apps / OS / k8s collectors in
# that order, render as JSON when $JSON is "true" (table otherwise), then exit
# with $HIGHEST_EXIT.
main() {
  parse_args "$@"

  collect_apps
  collect_os
  collect_k8s

  case "$JSON" in
    true) render_json ;;
    *) render_table ;;
  esac

  exit "$HIGHEST_EXIT"
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/usr/bin/env bash
# Generate a short-lived kubeconfig from Vault K8s secrets engine.
# Requires: vault login -method=oidc (or VAULT_TOKEN set)
#
# Stores the minted service-account token in the current kubeconfig as user
# "vault-admin", binds it to cluster "kubernetes" in context "vault", and
# switches to that context.
set -euo pipefail

# `jq -e` exits non-zero when the selected field is null or missing, so a
# response without a token stops here instead of writing the literal string
# "null" into the kubeconfig as the credential.
if ! TOKEN=$(vault write -format=json kubernetes/creds/local-admin kubernetes_namespace=default \
  | jq -er '.data.service_account_token') || [[ -z "$TOKEN" ]]; then
  echo "ERROR: Vault did not return a service account token" >&2
  exit 1
fi

kubectl config set-credentials vault-admin --token="$TOKEN"
kubectl config set-context vault --cluster=kubernetes --user=vault-admin
kubectl config use-context vault
echo "Kubeconfig set with 1h token"
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
[Unit]
|
||||
Description=Renew the periodic Vault/OpenBao token in ~/.vault-token
|
||||
Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/vault-token-renew.sh
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=%h/.local/bin/vault-token-renew
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Renew the long-lived PERIODIC Vault/OpenBao token stored in ~/.vault-token.
|
||||
#
|
||||
# Background: wizard@devvm used to hold a 7-day OIDC login token (re-auth weekly
|
||||
# via `vault login -method=oidc`). On 2026-06-05 that was replaced with a
|
||||
# periodic, orphan token so it never expires. Periodic tokens have no max-TTL;
|
||||
# they only need renewing within each `period` (768h / 32d here). This unit
|
||||
# renews daily, so the token stays alive indefinitely with huge margin. If the
|
||||
# box is ever decommissioned and this stops running, the token self-expires
|
||||
# within ~32 days (unlike a root token, which would live forever).
|
||||
#
|
||||
# Token was minted with (vault-admin = path "*" sudo; sops-admin = transit for SOPS):
|
||||
# vault token create -orphan -period=768h \
|
||||
# -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard
|
||||
# To recreate if ever lost: `vault login -method=oidc`, run the above with
|
||||
# `-field=token > ~/.vault-token`, then `chmod 600 ~/.vault-token`.
|
||||
#
|
||||
# Source of truth: infra/scripts/vault-token-renew.sh (deployed to
|
||||
# ~/.local/bin/vault-token-renew). Driven by the systemd USER units
|
||||
# vault-token-renew.{service,timer}. Deploy + recovery runbook:
|
||||
# infra/docs/runbooks/vault-token-renew-devvm.md
|
||||
|
||||
EXPECTED_DN="token-devvm-wizard"
|
||||
REQUIRED_POLICY="vault-admin"
|
||||
|
||||
# vtr_display_name <lookup-json>
# Prints the .data.display_name field of the given token-lookup JSON; prints an
# empty string when the field is absent or null.
vtr_display_name() {
  local lookup_json="$1"
  jq -r '.data.display_name // ""' <<<"$lookup_json"
}
|
||||
|
||||
# vtr_policies_csv <lookup-json>
# Prints .data.policies followed by .data.identity_policies as one comma-joined
# line (either list may be missing). The two lists are merged because a token
# minted via OIDC carries vault-admin only in identity_policies while
# .data.policies shows just [default] (misleading on its own — see memory
# id=4211); our periodic token carries them as token policies.
vtr_policies_csv() {
  local lookup_json="$1"
  jq -r '((.data.policies // []) + (.data.identity_policies // [])) | join(",")' <<<"$lookup_json"
}
|
||||
|
||||
# vtr_drift_ok <display_name> <policies-csv>
# Returns 0 when the token is OUR periodic admin token: the display name equals
# $EXPECTED_DN and $REQUIRED_POLICY is one of the comma-separated policies.
# Returns 1 otherwise.
#
# Both sides are fenced with commas and compared as literal text, so the policy
# match is exact ("vault-admin-ro" never matches "vault-admin") and characters
# such as "." in a policy name are not interpreted as regex metacharacters.
vtr_drift_ok() {
  local dn="$1" pols="$2"
  [[ "$dn" == "$EXPECTED_DN" ]] || return 1
  case ",${pols}," in
    *",${REQUIRED_POLICY},"*) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
# vtr_main — renew the token the vault CLI is currently using, but only after
# confirming it is our periodic admin token.
#
# Appends one line per run to ${XDG_STATE_HOME:-$HOME/.local/state}/vault-token-renew.log
# and exits 1 when the lookup fails, the lookup output cannot be parsed, the
# drift guard rejects the token, or the renewal itself fails.
vtr_main() {
  set -euo pipefail
  export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"

  local log info dn pols out ttl
  log="${XDG_STATE_HOME:-$HOME/.local/state}/vault-token-renew.log"
  mkdir -p "$(dirname "$log")"

  if ! info=$(vault token lookup -format=json 2>&1); then
    printf '%s FAIL: token lookup: %s\n' "$(date -Is)" "$info" >>"$log"
    exit 1
  fi

  # stderr is merged into $info above, so any extra text makes it non-JSON and
  # the extractors fail. Check them explicitly: relying on `set -e` here would
  # end the run without a log line. The raw output is deliberately not logged —
  # NOTE(review): a successful lookup presumably includes the token id.
  if ! dn=$(vtr_display_name "$info") || ! pols=$(vtr_policies_csv "$info"); then
    printf '%s FAIL: token lookup output could not be parsed\n' "$(date -Is)" >>"$log"
    exit 1
  fi

  # Drift guard (added 2026-06-07): the renewer must NOT keep a FOREIGN token alive.
  # On 2026-06-05 a stray `vault login -method=kubernetes` overwrote ~/.vault-token
  # with a read-only woodpecker token, and this script then silently renewed THAT
  # for two days — masking the loss of write access. So before renewing, confirm
  # the token is our periodic admin token; if it has drifted, fail loudly (systemd
  # marks the unit failed) instead of keeping someone else's token alive.
  if ! vtr_drift_ok "$dn" "$pols"; then
    printf '%s DRIFT: ~/.vault-token is dn=%q policies=%q (expected dn=%q with %q). Refusing to renew a foreign token. Re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
      "$(date -Is)" "$dn" "$pols" "$EXPECTED_DN" "$REQUIRED_POLICY" >>"$log"
    exit 1
  fi

  # `vault token renew` with no argument renews the calling token (renew-self).
  # On success, log only the new TTL (never the raw JSON — it contains the token).
  if out=$(vault token renew -format=json 2>&1); then
    # "?" is logged when the TTL cannot be extracted; the renewal still succeeded.
    ttl=$(printf '%s' "$out" | jq -r '.auth.lease_duration' 2>/dev/null || echo '?')
    printf '%s OK renewed (dn=%s ttl=%ss)\n' "$(date -Is)" "$dn" "$ttl" >>"$log"
  else
    printf '%s FAIL: %s\n' "$(date -Is)" "$out" >>"$log"
    exit 1
  fi
}
|
||||
|
||||
# Run main only when executed directly, so the test can source the pure functions.
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
vtr_main "$@"
|
||||
fi
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
[Unit]
|
||||
Description=Daily renewal of the periodic Vault token in ~/.vault-token
|
||||
|
||||
[Timer]
|
||||
OnCalendar=daily
|
||||
Persistent=true
|
||||
RandomizedDelaySec=300
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Programmatically register a Forgejo repo in Woodpecker without needing the
|
||||
# Web UI's OAuth flow.
|
||||
#
|
||||
# Earlier we believed only the OAuth login could create a working webhook
|
||||
# because the webhook URL contains a JWT signed with a server-side key.
|
||||
# That's true for the JWT, BUT the webhook is created server-side when the
|
||||
# repo is activated through POST /api/repos — Woodpecker handles the JWT
|
||||
# generation internally. We just need to call that endpoint as the right
|
||||
# user (the one whose forge OAuth token can read the repo).
|
||||
#
|
||||
# The Woodpecker admin token (mine, ViktorBarzin@github) is a session JWT
|
||||
# of the form `{"type":"user","user-id":"1"}` signed with the user's
|
||||
# `hash` column (per-user, stored in the `users` table). Forge-API calls
|
||||
# made on behalf of that user use the user's stored OAuth `access_token`
|
||||
# from the same row. My GitHub admin can't read Forgejo repos, so the
|
||||
# admin token can't activate Forgejo repos.
|
||||
#
|
||||
# The fix: mint a session JWT for the Forgejo `viktor` user (user_id=2)
|
||||
# using `viktor`'s `hash`. Then POST /api/repos as viktor — viktor's
|
||||
# stored Forgejo OAuth token has the access needed.
|
||||
#
|
||||
# Usage:
|
||||
# ./woodpecker-register-forgejo-repo.sh <forgejo-org/repo> [<forgejo-org/repo> ...]
|
||||
# Example:
|
||||
# ./woodpecker-register-forgejo-repo.sh viktor/broker-sync viktor/freedify
|
||||
#
|
||||
# Requires:
|
||||
# - vault CLI logged in (oidc or token), with read access to
|
||||
# database/static-creds/pg-woodpecker AND a Forgejo PAT in
|
||||
# the forgejo_admin_token field of secret/viktor (or pass FORGEJO_TOKEN env var)
|
||||
# - kubectl with cluster access (for the temporary psql pod)
|
||||
# - openssl
|
||||
|
||||
set -euo pipefail

NS=${NS:-woodpecker}
WP_URL=${WP_URL:-https://ci.viktorbarzin.me}
FORGEJO_URL=${FORGEJO_URL:-https://forgejo.viktorbarzin.me}
FORGEJO_USER_LOGIN=${FORGEJO_USER_LOGIN:-viktor}

if [ "$#" -lt 1 ]; then
  echo "usage: $0 <org/repo> [<org/repo> ...]" >&2
  exit 1
fi

# FORGEJO_USER_LOGIN is spliced into SQL text below; accept only plain login
# characters so a stray quote cannot change the query.
if [[ ! "$FORGEJO_USER_LOGIN" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "ERROR: FORGEJO_USER_LOGIN contains unexpected characters: $FORGEJO_USER_LOGIN" >&2
  exit 1
fi

# Woodpecker DB credentials (one Vault read; both fields come from the same response).
WP_DB_CREDS=$(vault read -format=json database/static-creds/pg-woodpecker)
WP_DB_USER=$(jq -r .data.username <<<"$WP_DB_CREDS")
WP_DB_PASS=$(jq -r .data.password <<<"$WP_DB_CREDS")

PG_POD=tmp-wp-register-$$
# Unpredictable scratch file for API responses (replaces fixed /tmp/wp-add-<id>.json names).
RESP_FILE=$(mktemp)

# Remove the scratch file and the temporary pod on every exit path. Installed
# before the pod is created so an early failure cannot leave it behind.
cleanup() {
  rm -f -- "$RESP_FILE"
  kubectl delete pod -n "$NS" "$PG_POD" --wait=false >/dev/null 2>&1 || true
}
trap cleanup EXIT

# NOTE(review): the DB password is embedded in the pod spec as a plain env
# value, so it is readable by anyone who can `kubectl get pod` in $NS while the
# pod exists.
cat <<EOF | kubectl apply -f - >/dev/null
apiVersion: v1
kind: Pod
metadata: { name: $PG_POD, namespace: $NS }
spec:
  restartPolicy: Never
  containers:
  - name: psql
    image: postgres:15
    env: [{name: PGPASSWORD, value: "$WP_DB_PASS"}]
    command: ["sleep", "300"]
EOF

# Wait up to ~30s for the pod, then fail with a clear message instead of letting
# the first `kubectl exec` fail.
PHASE=""
for _ in $(seq 1 30); do
  PHASE=$(kubectl get pod -n "$NS" "$PG_POD" -o jsonpath='{.status.phase}' 2>/dev/null || true)
  [ "$PHASE" = "Running" ] && break
  sleep 1
done
if [ "$PHASE" != "Running" ]; then
  echo "ERROR: pod $PG_POD did not reach Running within 30s (phase: ${PHASE:-unknown})" >&2
  exit 1
fi

# wp_query <sql> -> single whitespace-stripped value from the woodpecker DB.
wp_query() {
  kubectl exec -n "$NS" "$PG_POD" -- psql -h pg-cluster-rw.dbaas -U "$WP_DB_USER" -d woodpecker -tA -c "$1" \
    | tr -d '[:space:]'
}

# Pull viktor's `hash` from the woodpecker DB (used to sign the session JWT).
VIKTOR_HASH=$(wp_query "SELECT hash FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2")
if [ -z "$VIKTOR_HASH" ]; then
  echo "ERROR: no woodpecker user found for forge_id=2 login=$FORGEJO_USER_LOGIN" >&2
  echo "       (have they ever logged in via Forgejo OAuth?)" >&2
  exit 1
fi
VIKTOR_ID=$(wp_query "SELECT id FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2")
if [ -z "$VIKTOR_ID" ]; then
  echo "ERROR: could not read the woodpecker user id for login=$FORGEJO_USER_LOGIN" >&2
  exit 1
fi

# Mint a session JWT (HS256) for that user.
b64() { openssl base64 -A | tr '+/' '-_' | tr -d '='; }
HEADER=$(printf '%s' '{"alg":"HS256","typ":"JWT"}' | b64)
PAYLOAD=$(printf '{"type":"user","user-id":"%s"}' "$VIKTOR_ID" | b64)
SIG=$(printf '%s.%s' "$HEADER" "$PAYLOAD" | openssl dgst -sha256 -hmac "$VIKTOR_HASH" -binary | b64)
TOKEN="$HEADER.$PAYLOAD.$SIG"

# Sanity check: am I really logged in as viktor? `|| true` keeps a failed
# request from aborting here, so the explicit error below is what gets printed.
ME=$(curl -sf "$WP_URL/api/user" -H "Authorization: Bearer $TOKEN" | jq -r '.login' || true)
if [ "$ME" != "$FORGEJO_USER_LOGIN" ]; then
  echo "ERROR: minted token authenticates as '$ME', not '$FORGEJO_USER_LOGIN'" >&2
  exit 1
fi
echo "Authenticated as: $ME"

# Activate each repo via POST /api/repos?forge_remote_id=N
# Forgejo repo ID is fetched via the Forgejo API.
FORGEJO_AUTH="${FORGEJO_TOKEN:-$(vault kv get -field=forgejo_admin_token secret/viktor 2>/dev/null || true)}"
if [ -z "$FORGEJO_AUTH" ]; then
  echo "ERROR: set FORGEJO_TOKEN env or seed secret/viktor/forgejo_admin_token in vault" >&2
  exit 1
fi

# Set to 1 as soon as any repo fails, so the exit status reflects partial failure.
FAILED=0
for repo in "$@"; do
  FRID=$(curl -sf "$FORGEJO_URL/api/v1/repos/$repo" -H "Authorization: token $FORGEJO_AUTH" | jq -r .id 2>/dev/null || true)
  if [ -z "$FRID" ] || [ "$FRID" = "null" ]; then
    echo "  $repo: ERROR resolving Forgejo repo id" >&2
    FAILED=1
    continue
  fi
  # Start from an empty file so a failed request cannot show the previous repo's body.
  : >"$RESP_FILE"
  # On a transport error curl still prints "000" for %{http_code}; `|| true`
  # lets the loop carry on to the remaining repos instead of aborting.
  HTTP=$(curl -s -X POST "$WP_URL/api/repos?forge_remote_id=$FRID" \
    -H "Authorization: Bearer $TOKEN" \
    -o "$RESP_FILE" -w "%{http_code}" || true)
  case "$HTTP" in
    200) echo "  $repo: activated (id=$(jq -r .id "$RESP_FILE"))" ;;
    409) echo "  $repo: already active" ;;
    *)
      echo "  $repo: HTTP $HTTP — $(cat "$RESP_FILE")"
      FAILED=1
      ;;
  esac
done
exit "$FAILED"
|
||||
3
scripts/workstation/.gitignore
vendored
3
scripts/workstation/.gitignore
vendored
|
|
@ -1,3 +0,0 @@
|
|||
__pycache__/
|
||||
.pytest_cache/
|
||||
*.pyc
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
{
|
||||
"claudeMd": "# Viktor Barzin homelab — shared multi-user Claude Code Workstation (devvm)\n\nYou are running as a specific OS user on a SHARED devvm Workstation, not as the admin. These org-wide rules apply to EVERY user and sit at the top of settings precedence (they cannot be overridden by a user's own config):\n\n- Respect your permission tier. Your kubectl, Vault, and infra access are scoped to your RBAC tier (admin / power-user / namespace-owner). Do not attempt to escalate privileges or reach another user's resources.\n- Secrets are per-user. Never read another user's home directory, credentials, tokens, or ~/.claude secrets. Your own secrets live in your home at mode 600.\n- Infrastructure changes go through Terraform/Terragrunt (scripts/tg apply) — never direct kubectl apply/edit/patch. Pushing to git does NOT deploy; applies are manual and admin-gated, so your edits cannot take effect without an admin apply.\n- Follow the engineering rules in ~/.claude/rules/ (execution, planning, quality) and every CLAUDE.md in the repo tree.\n- The monorepo is at ~/code. Non-admins get a git-crypt-LOCKED clone: secret files read as ciphertext — that is expected, not an error."
|
||||
}
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
# Declarative host toolset for the devvm Workstation (apt packages, one per line).
|
||||
# Consumed by setup-devvm.sh: apt-get install -y $(grep -vE '^\s*(#|$)' packages.txt)
|
||||
# Comments (#) and blank lines are ignored. Tools NOT in the standard apt repos
|
||||
# are listed below as comments with their real install path (handled explicitly
|
||||
# in setup-devvm.sh) so this manifest stays a safe argument to `apt-get install`.
|
||||
git
|
||||
zsh
|
||||
tmux
|
||||
ripgrep
|
||||
fd-find
|
||||
jq
|
||||
curl
|
||||
ca-certificates
|
||||
python3
|
||||
python3-yaml
|
||||
python3-pip
|
||||
podman
|
||||
|
||||
# --- installed by setup-devvm.sh via NON-apt paths (not apt-installable) ---
|
||||
# nodejs + npm -> NodeSource repo (claude-code needs node >= 18; distro nodejs is too old)
|
||||
# @anthropic-ai/claude-code -> npm install -g
|
||||
# kubectl -> k8s apt repo OR pinned binary (already present on devvm)
|
||||
# vault -> HashiCorp apt repo OR pinned binary (already present on devvm)
|
||||
# kubelogin (kubectl oidc-login) -> `kubectl krew install oidc-login` or int128/kubelogin release.
|
||||
# NOTE: the apt package literally named "kubelogin" is the AZURE
|
||||
# tool, NOT the OIDC one we need -- do not apt-install it.
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
# THE single source of truth for the devvm Workstation lifecycle (onboard -> offboard).
|
||||
# Consumed by roster_engine.py (derive/validate) + t3-provision-users.sh (apply).
|
||||
#
|
||||
# os_user (the map KEY, pinned) -> authentik_user . k8s_user . tier . namespaces
|
||||
# The three identifiers differ per person (verified 2026-06-08) -- no email->username
|
||||
# derivation; record each explicitly.
|
||||
#
|
||||
# Tiers: admin | power-user | namespace-owner
|
||||
# admin - cluster-admin, unlocked tree, secrets (groups: sudo,docker,code-shared)
|
||||
# power-user - cluster-wide READ (no Secrets) via oidc-power-user-readonly; locked clone
|
||||
# namespace-owner - admin in their own namespace(s) only; locked clone
|
||||
#
|
||||
# wizard IS listed (as admin): the reconcile REGENERATES /etc/ttyd-user-map +
|
||||
# dispatch.json from this file, so omitting him would drop his t3 instance. The
|
||||
# provisioner skips account/group/clone mutations for already-existing users, so
|
||||
# listing him is safe (he keeps his unlocked tree + cluster-admin untouched).
|
||||
users:
|
||||
wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin} # base config author + cluster-admin
|
||||
emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user} # NET-NEW k8s_users entry (add as power-user before provisioning)
|
||||
ancamilea: {authentik_user: ancaelena98, k8s_user: anca, tier: namespace-owner, namespaces: [plotting-book]} # ALREADY provisioned in-cluster -- assert, don't re-create
|
||||
# gheorghe: {authentik_user: vabbit81, k8s_user: vabbit81, tier: namespace-owner, namespaces: [vabbit81]} # already a cluster ns-owner; uncomment to give him a devvm workstation
|
||||
|
|
@ -1,299 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Pure derivation + offboarding-diff engine for the devvm Workstation roster.
|
||||
|
||||
Functional core (this module, unit-tested) / imperative shell (the bash
|
||||
provisioner that consumes the JSON this emits and performs the host mutations).
|
||||
No host I/O lives in the tested functions. See PRD ViktorBarzin/infra#9.
|
||||
|
||||
The roster (`roster.yaml`) is the single source of truth for the workstation
|
||||
lifecycle. `os_user` is the pinned key; `authentik_user` / `k8s_user` differ
|
||||
per person and are recorded explicitly (no email->username derivation).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
BASE_PORT = 3773
|
||||
VALID_TIERS = ("admin", "power-user", "namespace-owner")
|
||||
# Tier -> supplementary groups the reconcile ENSURES (additive-only; never stripped).
|
||||
TIER_GROUPS: dict[str, tuple[str, ...]] = {
|
||||
"admin": ("code-shared", "docker", "sudo"),
|
||||
"power-user": (),
|
||||
"namespace-owner": (),
|
||||
}
|
||||
DEFAULT_SHELL = "/bin/zsh"
|
||||
_REVERSIBLE_OFFBOARD_KINDS = (
|
||||
"disable_instance",
|
||||
"unmap_dispatch",
|
||||
"remove_from_t3_group",
|
||||
"lock_login",
|
||||
"revoke_cluster_rbac",
|
||||
)
|
||||
|
||||
|
||||
class RosterError(ValueError):
    """Raised when the roster is structurally invalid.

    `_parse_user` raises it for a missing required field, an unknown tier, or a
    `namespaces` value that does not fit the user's tier.
    """
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class User:
    """One validated roster entry, as built by `_parse_user`."""

    # Key of the entry in the roster's `users` mapping.
    os_user: str
    # Used as the left-hand side of ttyd map lines and as the dispatch key.
    authentik_user: str
    # Looked up in the live k8s_users tier map by `validate_tiers`.
    k8s_user: str
    # One of VALID_TIERS.
    tier: str
    # Non-empty only for the "namespace-owner" tier (enforced by `_parse_user`).
    namespaces: tuple[str, ...] = ()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Roster:
    """The parsed roster: every user keyed by `os_user`."""

    # os_user -> User; empty when the roster declares no users.
    users: dict[str, User] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Account:
    """Desired host-account settings for one user (built by `derive_desired_state`)."""

    os_user: str
    tier: str
    # `derive_desired_state` always sets this to DEFAULT_SHELL.
    shell: str
    # `derive_desired_state` always sets this to True.
    login_locked: bool
    # Supplementary groups for the tier, taken from TIER_GROUPS.
    groups: tuple[str, ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DesiredState:
    """Everything `derive_desired_state` computes from a roster plus existing ports."""

    # os_user -> Account
    accounts: dict[str, Account]
    # Full text of the ttyd user map: header comment plus one
    # "<authentik_user>=<os_user>" line per user, ordered by port.
    ttyd_user_map: str
    # authentik_user -> {"os_user": ..., "port": ...}
    dispatch: dict[str, dict]
    # os_user -> port (existing assignments kept, new ones allocated from BASE_PORT)
    ports: dict[str, int]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OffboardAction:
    """A single offboarding step for one user (produced by `offboard_plan`)."""

    os_user: str
    # One of _REVERSIBLE_OFFBOARD_KINDS, or "userdel_archive".
    kind: str
    # True for the reversible kinds; False for "userdel_archive".
    reversible: bool
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Parsing + structural validation
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _parse_user(os_user: str, spec: dict) -> User:
    """Validate one roster entry and build the corresponding `User`.

    Args:
        os_user: the entry's key in the roster's `users` mapping.
        spec: the entry's value; must be a mapping with `authentik_user`,
            `k8s_user` and `tier`, plus `namespaces` for namespace-owners.

    Returns:
        The validated, immutable `User`.

    Raises:
        RosterError: if `spec` is not a mapping, a required field is missing,
            the tier is unknown, `namespaces` is not a list of strings, or
            `namespaces` does not fit the tier (required for namespace-owner,
            forbidden for every other tier).
    """
    # An entry such as `emo:` with no body parses to None; report it as a roster
    # error instead of failing with a TypeError on the membership test below.
    if not isinstance(spec, dict):
        raise RosterError(
            f"user {os_user!r}: entry must be a mapping, got {type(spec).__name__}"
        )
    for required in ("authentik_user", "k8s_user", "tier"):
        if required not in spec:
            raise RosterError(f"user {os_user!r}: missing required field {required!r}")
    tier = spec["tier"]
    if tier not in VALID_TIERS:
        raise RosterError(
            f"user {os_user!r}: unknown tier {tier!r} (valid: {list(VALID_TIERS)})"
        )
    raw_namespaces = spec.get("namespaces") or ()
    # tuple("abc") would silently become ('a', 'b', 'c'), so a scalar value
    # (`namespaces: plotting-book`) is rejected rather than exploded.
    if not isinstance(raw_namespaces, (list, tuple)) or not all(
        isinstance(ns, str) for ns in raw_namespaces
    ):
        raise RosterError(f"user {os_user!r}: namespaces must be a list of strings")
    namespaces = tuple(raw_namespaces)
    if tier == "namespace-owner" and not namespaces:
        raise RosterError(f"user {os_user!r}: namespace-owner requires namespaces")
    if tier != "namespace-owner" and namespaces:
        raise RosterError(f"user {os_user!r}: only namespace-owner may set namespaces")
    return User(os_user, spec["authentik_user"], spec["k8s_user"], tier, namespaces)
|
||||
|
||||
|
||||
def load_roster(text: str) -> Roster:
    """Parse roster YAML text into a `Roster`.

    Args:
        text: YAML document whose top level is a mapping with an optional
            `users` mapping of os_user -> spec. An empty document or a missing
            or empty `users` key yields an empty roster.

    Returns:
        The `Roster` with every entry validated by `_parse_user`.

    Raises:
        RosterError: if the top level or `users` is not a mapping, or any entry
            fails validation.
    """
    data = yaml.safe_load(text) or {}
    # A list or scalar at the top level has no .get(); report it as a roster
    # error instead of an AttributeError.
    if not isinstance(data, dict):
        raise RosterError("roster: top level must be a mapping with a 'users' key")
    users_raw = data.get("users") or {}
    if not isinstance(users_raw, dict):
        raise RosterError("roster: 'users' must be a mapping of os_user -> spec")
    return Roster({name: _parse_user(name, spec) for name, spec in users_raw.items()})
|
||||
|
||||
|
||||
def load_roster_file(path: str) -> Roster:
    """Read the UTF-8 file at `path` and parse its contents with `load_roster`."""
    with open(path, encoding="utf-8") as roster_fh:
        roster_text = roster_fh.read()
    return load_roster(roster_text)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Tier validation against live k8s_users (fail-loud)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ValidationIssue:
    """One finding from `validate_tiers` about a single roster user."""

    os_user: str
    severity: str  # "error" = tier conflict (abort) | "warn" = absent (grant pending)
    # Human-readable description; the CLI prints it to stderr after the
    # upper-cased severity.
    message: str
|
||||
|
||||
|
||||
def validate_tiers(
    roster: Roster, k8s_user_tiers: dict[str, str]
) -> list[ValidationIssue]:
    """Check every non-admin roster user against the live `k8s_users` tier map.

    Admin users are skipped. A user whose `k8s_user` is missing from
    `k8s_user_tiers` yields a "warn" issue; a user whose cluster tier differs
    from the roster tier yields an "error" issue. Returns the issues in roster
    order; an empty list means no divergence was found.
    """
    found: list[ValidationIssue] = []
    for entry in roster.users.values():
        if entry.tier == "admin":
            continue
        cluster_tier = k8s_user_tiers.get(entry.k8s_user)
        if cluster_tier is None:
            severity = "warn"
            message = (
                f"{entry.os_user}: tier {entry.tier} but k8s_user {entry.k8s_user!r} "
                f"absent from k8s_users (kubectl grant pending — add the entry)"
            )
        elif cluster_tier != entry.tier:
            severity = "error"
            message = (
                f"{entry.os_user}: roster tier {entry.tier} != k8s_users tier "
                f"{cluster_tier} for {entry.k8s_user!r}"
            )
        else:
            # Roster and cluster agree: nothing to report for this user.
            continue
        found.append(ValidationIssue(entry.os_user, severity, message))
    return found
|
||||
|
||||
|
||||
def has_blocking_errors(issues: list[ValidationIssue]) -> bool:
    """Return True when at least one issue has severity "error"."""
    for finding in issues:
        if finding.severity == "error":
            return True
    return False
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Desired-state derivation (sticky ports, ttyd map, dispatch, accounts)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _allocate_ports(roster: Roster, existing_ports: dict[str, int]) -> dict[str, int]:
    """Assign a port to every roster user, keeping existing assignments.

    Users that already appear in `existing_ports` keep their port. The others
    are processed in sorted name order and each receives the lowest port at or
    above BASE_PORT that is not yet taken. Entries in `existing_ports` for
    users outside the roster are ignored.
    """
    assigned: dict[str, int] = {}
    for name in roster.users:
        if name in existing_ports:
            assigned[name] = existing_ports[name]
    taken = set(assigned.values())
    newcomers = [name for name in sorted(roster.users) if name not in assigned]
    for name in newcomers:
        port = BASE_PORT
        while port in taken:
            port += 1
        assigned[name] = port
        taken.add(port)
    return assigned
|
||||
|
||||
|
||||
_TTYD_MAP_HEADER = (
|
||||
"# Generated from roster.yaml by roster_engine.py — DO NOT EDIT BY HAND.\n"
|
||||
"# <authentik_user>=<os_user>; consumed by t3-dispatch.\n"
|
||||
)
|
||||
|
||||
|
||||
def derive_desired_state(
    roster: Roster, existing_ports: dict[str, int]
) -> DesiredState:
    """Compute the full desired host state for `roster`.

    Ports come from `_allocate_ports`. The ttyd map and the dispatch table list
    users in ascending port order; accounts follow roster order. Every account
    gets DEFAULT_SHELL, a locked login and its tier's TIER_GROUPS.
    """
    ports = _allocate_ports(roster, existing_ports)
    by_port = sorted(roster.users.values(), key=lambda member: ports[member.os_user])

    map_body = "\n".join(f"{m.authentik_user}={m.os_user}" for m in by_port)
    ttyd_user_map = _TTYD_MAP_HEADER + map_body + "\n"

    dispatch: dict[str, dict] = {}
    for m in by_port:
        dispatch[m.authentik_user] = {"os_user": m.os_user, "port": ports[m.os_user]}

    accounts: dict[str, Account] = {}
    for m in roster.users.values():
        accounts[m.os_user] = Account(
            os_user=m.os_user,
            tier=m.tier,
            shell=DEFAULT_SHELL,
            login_locked=True,
            groups=TIER_GROUPS[m.tier],
        )

    return DesiredState(accounts, ttyd_user_map, dispatch, ports)
|
||||
|
||||
|
||||
def groups_to_add(desired: Iterable[str], current: Iterable[str]) -> list[str]:
    """Return, sorted and de-duplicated, the desired groups the user lacks.

    Additive-only: the result is what to `gpasswd -a`. Groups present in
    `current` but not in `desired` are never reported, so a routine reconcile
    cannot strip a pre-existing user's legacy groups.
    """
    already_member = set(current)
    missing = {group for group in desired if group not in already_member}
    return sorted(missing)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Offboarding diff (staged: reversible cut, then gated destructive removal)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def to_deprovision(old: Roster, new: Roster) -> list[str]:
    """Return the sorted os_user names that are in `old` but no longer in `new`."""
    dropped = [name for name in old.users if name not in new.users]
    dropped.sort()
    return dropped
|
||||
|
||||
|
||||
def offboard_plan(
    old: Roster, new: Roster, *, include_destructive: bool
) -> list[OffboardAction]:
    """Build the staged offboarding actions for users dropped from the roster.

    For each user returned by `to_deprovision`, every kind in
    _REVERSIBLE_OFFBOARD_KINDS is emitted as a reversible action, in that
    order. The irreversible "userdel_archive" action is appended for the user
    only when `include_destructive` is true, so it is never produced unless
    explicitly requested.
    """
    actions: list[OffboardAction] = []
    for departed in to_deprovision(old, new):
        for kind in _REVERSIBLE_OFFBOARD_KINDS:
            actions.append(OffboardAction(departed, kind, True))
        if not include_destructive:
            continue
        actions.append(OffboardAction(departed, "userdel_archive", False))
    return actions
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI adapter (imperative shell entrypoint — consumed by t3-provision-users.sh)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _desired_state_to_dict(ds: DesiredState) -> dict:
|
||||
return {
|
||||
"accounts": {
|
||||
name: {
|
||||
"os_user": a.os_user,
|
||||
"tier": a.tier,
|
||||
"shell": a.shell,
|
||||
"login_locked": a.login_locked,
|
||||
"groups": list(a.groups),
|
||||
}
|
||||
for name, a in ds.accounts.items()
|
||||
},
|
||||
"ttyd_user_map": ds.ttyd_user_map,
|
||||
"dispatch": ds.dispatch,
|
||||
"ports": ds.ports,
|
||||
}
|
||||
|
||||
|
||||
def _main(argv: list[str]) -> int:
    """CLI entry point; returns the process exit code.

    Subcommands:
        validate --roster R --k8s-users-json J
            Prints each validation issue to stderr and returns 1 when any of
            them is an error, 0 otherwise.
        derive --roster R --ports-json P
            Writes the desired state to stdout as sorted, indented JSON and
            returns 0.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Workstation roster engine")
    commands = parser.add_subparsers(dest="cmd", required=True)

    validate_cmd = commands.add_parser(
        "validate", help="exit 1 if roster tiers diverge from k8s_users"
    )
    validate_cmd.add_argument("--roster", required=True)
    validate_cmd.add_argument(
        "--k8s-users-json", required=True, help="JSON map {k8s_user: tier}"
    )

    derive_cmd = commands.add_parser("derive", help="emit desired state as JSON")
    derive_cmd.add_argument("--roster", required=True)
    derive_cmd.add_argument(
        "--ports-json", required=True, help="JSON map {os_user: port}"
    )

    args = parser.parse_args(argv)
    roster = load_roster_file(args.roster)

    if args.cmd != "validate":
        # Only two subcommands exist, so anything else is "derive".
        with open(args.ports_json, encoding="utf-8") as fh:
            existing_ports = json.load(fh)
        desired = derive_desired_state(roster, existing_ports)
        json.dump(_desired_state_to_dict(desired), sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
        return 0

    with open(args.k8s_users_json, encoding="utf-8") as fh:
        live_tiers = json.load(fh)
    issues = validate_tiers(roster, live_tiers)
    for issue in issues:
        print(f"{issue.severity.upper()}: {issue.message}", file=sys.stderr)
    if has_blocking_errors(issues):
        return 1
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(_main(sys.argv[1:]))
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# Idempotent machine-wide host base for the devvm Claude Code Workstation.
|
||||
# Run as root. Sets up ONLY machine-wide state: the apt toolset, node + claude-code,
|
||||
# kubelogin, the ENFORCED managed Claude config, and /etc/skel defaults (launcher,
|
||||
# tmux UX, and live config-inheritance symlinks into the shared config base).
|
||||
#
|
||||
# PER-USER provisioning (accounts, per-tier groups, kubeconfig, secrets, infra
|
||||
# clone) lives in t3-provision-users.sh — NOT here. Safe to re-run.
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# The shared config base every user inherits from (live, chezmoi-versioned).
|
||||
# Coupled to the admin's home today; override to relocate to a neutral path.
|
||||
CONFIG_BASE="${WORKSTATION_CONFIG_BASE:-/home/wizard/.claude}"
|
||||
[[ $EUID -eq 0 ]] || { echo "setup-devvm.sh: must run as root" >&2; exit 1; }
|
||||
log() { echo "[setup-devvm] $*"; }
|
||||
|
||||
# 1) apt toolset (declarative manifest; comments/blank lines stripped)
manifest="$HERE/packages.txt"
# The manifest is read by grep inside a process substitution, whose exit status
# `set -e` never sees. Without this check a missing or unreadable manifest would
# silently yield an empty package list and the script would carry on.
if [[ ! -r "$manifest" ]]; then
  echo "setup-devvm.sh: package manifest not readable: $manifest" >&2
  exit 1
fi
mapfile -t PKGS < <(grep -vE '^[[:space:]]*(#|$)' "$manifest")
log "apt: ensuring ${#PKGS[@]} packages present"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
# Skip the install for an empty manifest: expanding an empty array under
# `set -u` aborts on older bash releases, and there is nothing to install.
if (( ${#PKGS[@]} > 0 )); then
  apt-get install -y "${PKGS[@]}" >/dev/null
fi
|
||||
|
||||
# 2) node >= 18 + claude-code (claude-code requires node >= 18)
# Assume an install is needed until a new-enough node is found on PATH.
need_node=1
if command -v node >/dev/null \
  && [[ "$(node -v | sed 's/^v\([0-9]*\).*/\1/')" -ge 18 ]]; then
  need_node=0
fi

if (( need_node == 1 )); then
  log "node: installing NodeSource 22.x"
  curl -fsSL https://deb.nodesource.com/setup_22.x | bash - >/dev/null
  apt-get install -y nodejs >/dev/null
fi

# Install the CLI globally only when no `claude` is already resolvable.
if ! command -v claude >/dev/null; then
  log "npm: installing @anthropic-ai/claude-code"
  npm install -g @anthropic-ai/claude-code >/dev/null
fi
|
||||
|
||||
# 3) kubelogin (kubectl oidc-login) system-wide — NOT the apt 'kubelogin' (= Azure tool)
# NOTE(review): the download tracks releases/latest, so the installed version is
# whatever upstream publishes at run time — consider pinning a release.
if [[ ! -x /usr/local/bin/kubelogin ]]; then
  log "kubelogin: installing int128/kubelogin"
  # Do the download/extract/install in a subshell with an EXIT trap so the
  # scratch directory is removed even when a step fails and `set -e` aborts.
  # The subshell also keeps the `cd` from leaking into the rest of the script.
  (
    tmp="$(mktemp -d)"
    trap 'rm -rf -- "$tmp"' EXIT
    curl -fsSL -o "$tmp/kl.zip" https://github.com/int128/kubelogin/releases/latest/download/kubelogin_linux_amd64.zip
    cd "$tmp"
    # Prefer unzip; fall back to Python's zipfile module when unzip fails
    # (for example, when it is not installed).
    unzip -o kl.zip kubelogin >/dev/null 2>&1 || python3 -m zipfile -e kl.zip .
    install -m 0755 "$tmp/kubelogin" /usr/local/bin/kubelogin
  )
  # kubectl discovers plugins by the kubectl-<name> filename convention.
  ln -sf /usr/local/bin/kubelogin /usr/local/bin/kubectl-oidc_login
fi
|
||||
|
||||
# 4) machine-wide ENFORCED Claude config (org claudeMd; top precedence; NO secrets)
managed_dir=/etc/claude-code
install -d -m 0755 "$managed_dir"
install -m 0644 "$HERE/managed-settings.json" "$managed_dir/managed-settings.json"
log "managed-settings.json -> /etc/claude-code/ (enforced org claudeMd)"
|
||||
|
||||
# 5) /etc/skel for NEW accounts: launcher + tmux UX + live-inheritance symlinks.
#    `useradd -m` copies a symlink in /etc/skel into the new home as a symlink,
#    so a new user's ~/.claude/{skills,rules,...} resolves to the shared base
#    and follows the admin's edits live. Secrets + hooks are per-user (written
#    by the provisioner) and are NEVER symlinked here.
skel=/etc/skel
install -d -m 0755 "$skel"
install -m 0755 "$HERE/skel/start-claude.sh" "$skel/start-claude.sh"
install -m 0644 "$HERE/skel/tmux.conf" "$skel/.tmux.conf"
install -d -m 0755 "$skel/.claude"
for shared in skills rules agents commands; do
  # Only link the directories the shared base actually provides.
  if [[ -d "$CONFIG_BASE/$shared" ]]; then
    ln -sfn "$CONFIG_BASE/$shared" "$skel/.claude/$shared"
  fi
done
log "skel: launcher + tmux + inheritance symlinks (base=$CONFIG_BASE)"
|
||||
|
||||
# 6) deploy the roster-driven provisioner to /usr/local/bin (run hourly by
#    t3-provision-users.timer). Re-deployed on every run so its logic stays
#    reproducible from this checkout.
provisioner_src="$HERE/../t3-provision-users.sh"
install -m 0755 "$provisioner_src" /usr/local/bin/t3-provision-users
log "t3-provision-users -> /usr/local/bin/ (roster-driven)"
|
||||
|
||||
# 7) harden the admin's unlocked tree: it holds git-crypt-DECRYPTED secrets, so
#    it must NOT be world-readable — only the admin + code-shared. Otherwise ANY
#    devvm user (even outside code-shared) could read decrypted secrets by path.
ADMIN_CODE="${ADMIN_CODE:-/home/wizard/code}"
if [[ ! -d "$ADMIN_CODE" ]]; then
  : # no admin tree at this path — nothing to harden
else
  chmod o-rx "$ADMIN_CODE"
  log "hardened $ADMIN_CODE (o-rx — not world-readable)"
fi

log "OK (idempotent)"
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Per-user Claude Code Workstation launcher (devvm). Lands the user in their OWN
|
||||
# ~/code clone (NOT a hardcoded /home/wizard/code) and names the Claude session
|
||||
# after the tmux session so /resume, the prompt box, and the terminal title line
|
||||
# up. Deployed via /etc/skel by setup-devvm.sh, so new accounts get it on
|
||||
# `useradd -m`. Existing users are repointed to this during their migration.
|
||||
# Greeting banner: one printf call, one argument per output line.
printf '%s\n' \
  "" \
  " Welcome, $(id -un)! 🚀" \
  "" \
  " Starting Claude Code in $HOME/code ..." \
  " (Right-click for tmux menu, or Ctrl+B then | or - to split)" \
  ""
|
||||
|
||||
# When running inside tmux, pass the tmux session name to Claude via --name.
# Outside tmux, or when the name cannot be read, no --name argument is passed.
name_args=()
if [[ -n "${TMUX:-}" ]]; then
  session_name="$(tmux display-message -p '#{session_name}' 2>/dev/null)"
  if [[ -n "$session_name" ]]; then
    name_args=(--name "$session_name")
  fi
fi

# Work from the user's own ~/code; fall back to $HOME when it cannot be entered.
if ! cd "$HOME/code" 2>/dev/null; then
  cd "$HOME"
fi
|
||||
|
||||
# Run Claude Code with the given arguments.
# Uses the `claude` found on PATH (installed system-wide by setup-devvm.sh);
# when none is found, runs the package through npx instead.
# Arguments: forwarded unchanged to claude.
# Returns:   the exit status of whichever command was run.
launch() {
  if ! command -v claude >/dev/null 2>&1; then
    npx @anthropic-ai/claude-code "$@"
    return
  fi
  claude "$@"
}
|
||||
|
||||
# No `exec` here on purpose: the exit status decides what happens next. A clean
# quit ends the pane (ttyd closes the terminal); a crash drops to a login shell
# so the tmux session isn't destroyed-and-recreated in a ttyd auto-reconnect loop.
launch --dangerously-skip-permissions --model claude-opus-4-8 "${name_args[@]}"
status=$?
if [ "$status" -eq 0 ]; then
  exit 0
fi

printf '%s\n' \
  "" \
  " claude exited abnormally (status $status). Dropping to a shell — your tmux session is preserved." \
  " Re-launch any time with: ~/start-claude.sh" \
  ""
exec "${SHELL:-/bin/bash}" -l
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
# Workstation base tmux config (deployed to /etc/skel/.tmux.conf by
|
||||
# setup-devvm.sh; new accounts inherit it). Uses $HOME (expanded by the shell at
|
||||
# run time) so it works for ANY user — never a hardcoded /home/<name>.
|
||||
#
|
||||
# NOTE: the tmux-resurrect/continuum "persistence" block is owned by the separate
|
||||
# terminal-lobby tool, which appends its own managed section + installs tpm. This
|
||||
# base file intentionally omits it so a fresh account isn't left with broken
|
||||
# `run ~/.tmux/plugins/tpm/tpm` references before terminal-lobby runs.
|
||||
|
||||
# Every new pane/window starts the per-user Claude launcher (which lands in ~/code).
set-option -g default-command "$HOME/start-claude.sh"

# Mouse support — click panes, drag to resize, scroll with wheel
set-option -g mouse on

# Easy splits: Ctrl+b then | for a side-by-side split, - for a stacked split.
# New panes and windows open in the current pane's working directory.
bind-key | split-window -h -c "#{pane_current_path}"
bind-key - split-window -v -c "#{pane_current_path}"
bind-key c new-window -c "#{pane_current_path}"
|
||||
|
||||
# Right-click on a pane opens a popup menu of clickable actions at the mouse
# position. Empty "" entries draw separator lines.
bind-key -n MouseDown3Pane display-menu -T "#[align=centre]Terminal Menu" -x M -y M \
  "New Claude" w "new-window -c '#{pane_current_path}'" \
  "Split Horizontal" h "split-window -v -c '#{pane_current_path}'" \
  "Split Vertical" v "split-window -h -c '#{pane_current_path}'" \
  "" \
  "Shell" s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
  "" \
  "Close Pane" x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
  "Close Window" X "confirm-before -p 'Close window? (y/n)' kill-window" \
  "" \
  "Detach" d "detach-client"
|
||||
|
||||
# Clickable [+] button in the status bar — left-click opens the Terminal Menu.
# Same entries as the right-click pane menu, except that this one has no
# "Detach" item.
set-option -g status-right '#[fg=black bg=green] [+] #[default] #[fg=cyan]Right-click for menu '
set-option -g status-right-length 60
bind-key -n MouseDown1StatusRight display-menu -T "#[align=centre]Terminal Menu" -x M -y S \
  "New Claude" w "new-window -c '#{pane_current_path}'" \
  "Split Horizontal" h "split-window -v -c '#{pane_current_path}'" \
  "Split Vertical" v "split-window -h -c '#{pane_current_path}'" \
  "" \
  "Shell" s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
  "" \
  "Close Pane" x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
  "Close Window" X "confirm-before -p 'Close window? (y/n)' kill-window"
|
||||
|
||||
# Status bar colours, session name on the left, and 1-based window/pane numbering.
set-option -g status-style 'bg=colour235 fg=colour136'
set-option -g status-left '#[fg=green][#S] '
set-option -g base-index 1
set-window-option -g pane-base-index 1
|
||||
|
|
@ -1,280 +0,0 @@
|
|||
"""Unit tests for the pure roster derivation + offboarding-diff engine.
|
||||
|
||||
These exercise external behaviour only (parse -> validate -> derive -> diff);
|
||||
no host I/O is touched. Mirrors the pure-core pytest style used elsewhere in
|
||||
the monorepo. See PRD ViktorBarzin/infra#9 (modules #1 roster engine, #5
|
||||
offboarding diff).
|
||||
"""
|
||||
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
import roster_engine as eng
|
||||
|
||||
|
||||
def _roster(yaml_text: str) -> "eng.Roster":
    """Dedent an inline YAML snippet and parse it with the engine's loader."""
    dedented = textwrap.dedent(yaml_text)
    return eng.load_roster(dedented)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# load_roster: parsing + structural validation (module #1)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_parses_user_fields_and_tier():
    """A power-user entry exposes its fields; namespaces is an empty tuple."""
    roster = _roster(
        """
        users:
          emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
        """
    )
    emo = roster.users["emo"]
    assert emo.os_user == "emo"
    assert emo.authentik_user == "emil.barzin"
    assert emo.k8s_user == "emo"
    assert emo.tier == "power-user"
    assert emo.namespaces == ()
|
||||
|
||||
|
||||
def test_namespace_owner_carries_namespaces():
    """A namespace-owner's namespaces list is exposed as a tuple."""
    roster = _roster(
        """
        users:
          ancamilea: {authentik_user: ancaelena98, k8s_user: anca,
                      tier: namespace-owner, namespaces: [plotting-book]}
        """
    )
    owner = roster.users["ancamilea"]
    assert owner.namespaces == ("plotting-book",)
|
||||
|
||||
|
||||
def test_admin_tier_is_accepted():
    """`admin` is a valid tier value."""
    yaml_text = "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    roster = _roster(yaml_text)
    assert roster.users["wizard"].tier == "admin"
|
||||
|
||||
|
||||
def test_rejects_unknown_tier():
    """An unrecognised tier raises RosterError mentioning 'tier'."""
    bad_yaml = "users: {bob: {authentik_user: b, k8s_user: b, tier: wizard-king}}"
    with pytest.raises(eng.RosterError, match="tier"):
        _roster(bad_yaml)
|
||||
|
||||
|
||||
def test_rejects_missing_required_field():
    """Omitting authentik_user raises RosterError naming the missing field."""
    bad_yaml = "users: {bob: {k8s_user: b, tier: power-user}}"
    with pytest.raises(eng.RosterError, match="authentik_user"):
        _roster(bad_yaml)
|
||||
|
||||
|
||||
def test_namespace_owner_requires_namespaces():
    """A namespace-owner without namespaces is rejected."""
    bad_yaml = "users: {bob: {authentik_user: b, k8s_user: b, tier: namespace-owner}}"
    with pytest.raises(eng.RosterError, match="namespace"):
        _roster(bad_yaml)
|
||||
|
||||
|
||||
def test_non_namespace_owner_must_not_set_namespaces():
    """Setting namespaces on a non-namespace-owner tier is rejected."""
    bad_yaml = (
        "users: {bob: {authentik_user: b, k8s_user: b, tier: power-user, namespaces: [x]}}"
    )
    with pytest.raises(eng.RosterError, match="namespace"):
        _roster(bad_yaml)
|
||||
|
||||
|
||||
def test_empty_roster_is_valid():
    """An explicit empty users mapping parses to an empty roster."""
    roster = _roster("users: {}")
    assert roster.users == {}
|
||||
|
||||
|
||||
def test_missing_users_key_is_valid_empty():
    """A document with no `users` key parses to an empty roster."""
    roster = _roster("{}")
    assert roster.users == {}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# validate_tiers: roster tier vs live k8s_users (fail-loud, module #1)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_validate_ok_when_tiers_match():
    """Matching roster and cluster tiers produce no issues."""
    roster = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: namespace-owner, namespaces: [plotting-book]}}"
    )
    live_tiers = {"anca": "namespace-owner"}
    assert eng.validate_tiers(roster, live_tiers) == []
|
||||
|
||||
|
||||
def test_validate_flags_tier_mismatch_as_error():
    """Roster says power-user, cluster says namespace-owner: a real conflict -> ERROR."""
    roster = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
    )
    issues = eng.validate_tiers(roster, {"anca": "namespace-owner"})
    assert len(issues) == 1
    issue = issues[0]
    assert issue.severity == "error"
    assert issue.os_user == "ancamilea"
    # The message names both conflicting tiers.
    assert "power-user" in issue.message and "namespace-owner" in issue.message
|
||||
|
||||
|
||||
def test_validate_flags_netnew_absent_as_warn():
    """A roster user with no k8s_users entry yet is a WARN, not an error.

    Workstation onboarding should still proceed; only the kubectl grant is pending.
    """
    roster = _roster("users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}")
    issues = eng.validate_tiers(roster, {})
    assert len(issues) == 1
    issue = issues[0]
    assert issue.severity == "warn"
    assert "emo" in issue.message and "k8s_users" in issue.message
|
||||
|
||||
|
||||
def test_validate_skips_admin_tier():
    """Admin users are not checked against k8s_users.

    wizard (admin) is cluster-admin via a separate mechanism, not k8s_users.
    """
    roster = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    issues = eng.validate_tiers(roster, {})
    assert issues == []
|
||||
|
||||
|
||||
def test_has_blocking_errors_distinguishes_mismatch_from_absent():
    """A tier mismatch blocks; a not-yet-present k8s user does not."""
    mismatch = _roster(
        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
    )
    absent = _roster(
        "users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}"
    )

    mismatch_issues = eng.validate_tiers(mismatch, {"anca": "namespace-owner"})
    assert eng.has_blocking_errors(mismatch_issues) is True

    absent_issues = eng.validate_tiers(absent, {})
    assert eng.has_blocking_errors(absent_issues) is False
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# derive_desired_state: accounts, sticky ports, ttyd map, dispatch (module #1)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
# Three-user roster fixture: one admin, one power-user, one namespace-owner.
THREE = """
users:
  wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}
  emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
  ancamilea: {authentik_user: ancaelena98, k8s_user: anca, tier: namespace-owner, namespaces: [plotting-book]}
"""

# Port assignments passed to derive_desired_state as the existing state, keyed by OS user.
LIVE_PORTS = dict(wizard=3773, emo=3774, ancamilea=3775)
|
||||
|
||||
|
||||
def test_derive_preserves_existing_sticky_ports():
    """Users that already have a port keep exactly that port."""
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    expected_ports = {"wizard": 3773, "emo": 3774, "ancamilea": 3775}
    assert desired.ports == expected_ports
|
||||
|
||||
|
||||
def test_derive_allocates_next_free_port_for_new_user():
    """New users get the next free ports after the ones already taken."""
    desired = eng.derive_desired_state(_roster(THREE), {"wizard": 3773})
    assert desired.ports["wizard"] == 3773
    # emo + ancamilea are new: between them they take 3774 and 3775, in either order.
    new_ports = sorted((desired.ports["emo"], desired.ports["ancamilea"]))
    assert new_ports == [3774, 3775]
|
||||
|
||||
|
||||
def test_derive_dispatch_keyed_by_authentik_user():
    """The dispatch table maps each Authentik username to its OS user and port."""
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    expected = {
        "vbarzin": {"os_user": "wizard", "port": 3773},
        "emil.barzin": {"os_user": "emo", "port": 3774},
        "ancaelena98": {"os_user": "ancamilea", "port": 3775},
    }
    assert desired.dispatch == expected
|
||||
|
||||
|
||||
def test_derive_ttyd_map_has_one_mapping_per_user():
    """Ignoring blank and comment lines, the ttyd map has one `authentik=os` line per user."""
    desired = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
    mappings = []
    for raw in desired.ttyd_user_map.splitlines():
        if not raw.strip():
            continue
        if raw.lstrip().startswith("#"):
            continue
        mappings.append(raw)
    assert set(mappings) == {"vbarzin=wizard", "emil.barzin=emo", "ancaelena98=ancamilea"}
|
||||
|
||||
|
||||
def test_derive_accounts_assign_tier_groups_and_shell():
    """Only the admin gets groups; the power-user's shell is /bin/zsh."""
    accounts = eng.derive_desired_state(_roster(THREE), LIVE_PORTS).accounts
    assert accounts["wizard"].groups == ("code-shared", "docker", "sudo")
    assert accounts["emo"].groups == ()
    assert accounts["ancamilea"].groups == ()
    assert accounts["emo"].shell == "/bin/zsh"
|
||||
|
||||
|
||||
def test_derive_is_deterministic():
    """Deriving twice from the same inputs yields equal results."""
    roster = _roster(THREE)
    first = eng.derive_desired_state(roster, LIVE_PORTS)
    second = eng.derive_desired_state(roster, LIVE_PORTS)
    assert first == second
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# groups_to_add: the additive-only invariant (module #1)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_groups_to_add_returns_only_missing():
    """Only the groups not already held are proposed; expected order is code-shared, sudo."""
    wanted = ("sudo", "docker", "code-shared")
    current = ("docker",)
    assert eng.groups_to_add(wanted, current) == ["code-shared", "sudo"]
|
||||
|
||||
|
||||
def test_groups_to_add_never_proposes_removal_of_extra_groups():
    """Extra groups a user already holds are never stripped (additive-only invariant).

    emo currently has code-shared+docker (legacy). A power-user reconcile wants
    no groups, so the result must be empty.
    """
    wanted = ()
    current = ("code-shared", "docker")
    assert eng.groups_to_add(wanted, current) == []
|
||||
|
||||
|
||||
def test_groups_to_add_idempotent_when_all_present():
    """Nothing is proposed when every wanted group is already held."""
    wanted = ("sudo",)
    current = ("sudo", "docker")
    assert eng.groups_to_add(wanted, current) == []
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# offboarding diff: staged plan, destructive never auto (module #5)
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_to_deprovision_is_old_minus_new():
    """Users present in the old roster but missing from the new one are deprovisioned."""
    before = _roster(THREE)
    after = _roster(
        """
        users:
          wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}
          emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
        """
    )
    removed = eng.to_deprovision(before, after)
    assert removed == ["ancamilea"]
|
||||
|
||||
|
||||
def test_to_deprovision_empty_when_nothing_removed():
    """Comparing a roster with itself deprovisions nobody."""
    roster = _roster(THREE)
    removed = eng.to_deprovision(roster, roster)
    assert removed == []
|
||||
|
||||
|
||||
def test_offboard_plan_reversible_cut_targets_exactly_the_removed_user():
    """Without destructive actions, the plan covers exactly the removed users and is fully reversible."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=False)
    targeted = set(action.os_user for action in plan)
    assert targeted == {"emo", "ancamilea"}
    assert all(action.reversible for action in plan)
|
||||
|
||||
|
||||
def test_offboard_plan_excludes_destructive_by_default():
    """With include_destructive=False the plan contains no userdel_archive action."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=False)
    assert all(action.kind != "userdel_archive" for action in plan)
|
||||
|
||||
|
||||
def test_offboard_plan_includes_destructive_only_when_explicitly_requested():
    """With include_destructive=True each removed user gets a non-reversible userdel_archive action."""
    before = _roster(THREE)
    after = _roster(
        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
    )
    plan = eng.offboard_plan(before, after, include_destructive=True)
    destructive = [action for action in plan if action.kind == "userdel_archive"]
    assert {action.os_user for action in destructive} == {"emo", "ancamilea"}
    assert all(not action.reversible for action in destructive)
|
||||
Loading…
Add table
Add a link
Reference in a new issue