chore: add untracked stacks, scripts, and agent configs

- New stacks: beads-server, hermes-agent - Terragrunt tiers.tf for infra, phpipam, status-page - Secrets symlinks for vault, phpipam, hermes-agent - Scripts: cluster_manager, image_pull, containerd pullthrough setup - Frigate config, audiblez-web app source, n8n workflows dir - Claude agent: service-upgrade, reference: upgrade-config.json - Removed: claudeception skill, excalidraw empty submodule, temp listings [ci skip] Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 09:33:06 +00:00 · 2026-04-15 09:33:06 +00:00 · bcad200a23
commit bcad200a23
parent bd41bb9230
44 changed files with 3819 additions and 0 deletions
--- a/scripts/cluster_manager.py
+++ b/scripts/cluster_manager.py
@ -0,0 +1,277 @@
+import asyncio
+import click
+import logging
+import time
+from typing import List, Union, Optional
+from kubernetes_asyncio import client, config
+from kubernetes_asyncio.client.api_client import ApiClient
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+
+async def wait_for_healthy(
+    api_instance: client.AppsV1Api,
+    resource_type: str,
+    namespace: str,
+    name: str,
+    target_replicas: int,
+    timeout: int = 300,
+) -> None:
+    start_time = time.time()
+    logger.info(
+        f"Waiting for {resource_type} {name} to reach {target_replicas} replicas..."
+    )
+
+    while True:
+        if time.time() - start_time > timeout:
+            logger.error(f"❌ Timeout reached for {resource_type} {name}")
+            return
+
+        try:
+            if resource_type.lower() == "deployment":
+                res = await api_instance.read_namespaced_deployment_status(
+                    name, namespace
+                )
+                ready = res.status.ready_replicas or 0
+                updated = res.status.updated_replicas or 0
+                if ready == target_replicas and updated == target_replicas:
+                    break
+            else:  # StatefulSet
+                res = await api_instance.read_namespaced_stateful_set_status(
+                    name, namespace
+                )
+                ready = res.status.ready_replicas or 0
+                if ready == target_replicas:
+                    break
+
+        except Exception as e:
+            logger.debug(f"Retrying status check for {name}: {e}")
+
+        await asyncio.sleep(5)
+
+    logger.info(f"✅ {resource_type} {name} is now healthy.")
+
+
+async def wait_for_zero(
+    api: client.AppsV1Api, kind: str, ns: str, name: str, timeout: int
+) -> tuple[str, str]:
+    start_time = asyncio.get_event_loop().time()
+    while (asyncio.get_event_loop().time() - start_time) < timeout:
+        try:
+            res = await (
+                api.read_namespaced_deployment_status(name, ns)
+                if kind.lower() == "deployment"
+                else api.read_namespaced_stateful_set_status(name, ns)
+            )
+            if (res.status.ready_replicas or 0) == 0:
+                return ns, name
+        except Exception:
+            return ns, name  # Assume gone if error
+        await asyncio.sleep(3)
+    logger.error(f"Timeout: {kind} {ns}/{name} still has running pods.")
+    return ns, name
+
+
+async def scale_resource(
+    api_instance: client.AppsV1Api,
+    resource_type: str,
+    namespace: str,
+    name: str,
+    replicas: int,
+) -> None:
+    body = {"spec": {"replicas": replicas}}
+    try:
+        if resource_type.lower() == "deployment":
+            await api_instance.patch_namespaced_deployment_scale(name, namespace, body)
+        else:
+            await api_instance.patch_namespaced_stateful_set_scale(
+                name, namespace, body
+            )
+    except Exception as e:
+        logger.error(f"Failed to scale {resource_type} {name}: {e}")
+
+
+async def run_stop_tier(
+    api_v1: client.AppsV1Api, label: str, output_file: str, timeout: int
+) -> None:
+    """Processes a single label tier: saves, scales to 0, and waits."""
+    excluded_ns = ["kube-system", "kube-public", "kube-node-lease"]
+
+    # 1. Discover
+    targets = [
+        ("Deployment", api_v1.list_deployment_for_all_namespaces),
+        ("StatefulSet", api_v1.list_stateful_set_for_all_namespaces),
+    ]
+
+    tier_resources = []
+    for kind, list_func in targets:
+        resp = await list_func(label_selector=label)
+        tier_resources.extend(
+            [
+                (kind, item)
+                for item in resp.items
+                if item.metadata.namespace not in excluded_ns
+            ]
+        )
+
+    if not tier_resources:
+        logger.warning(f"No resources found for label: {label}")
+        return
+
+    # 2. Save & Scale
+    active_jobs: set[tuple[str, str]] = set()
+    wait_tasks = []
+
+    # Append to file so we don't overwrite previous tiers
+    with open(output_file, "a") as f:
+        for kind, item in tier_resources:
+            ns, name = item.metadata.namespace, item.metadata.name
+            reps = item.spec.replicas or 0
+            f.write(f"{kind} {ns} {name} {reps}\n")
+            active_jobs.add((ns, name))
+
+            await scale_resource(api_v1, kind, ns, name, 0)
+            wait_tasks.append(wait_for_zero(api_v1, kind, ns, name, timeout))
+
+    # 3. Wait for this tier to finish before moving to next
+    logger.info(f"Tier [{label}]: Waiting for {len(active_jobs)} resources to stop...")
+    for coro in asyncio.as_completed(wait_tasks):
+        finished_ns, finished_name = await coro
+        active_jobs.discard((finished_ns, finished_name))
+        if active_jobs:
+            remaining_ns = sorted({ns for ns, name in active_jobs})
+            logger.info(
+                f"[{label}] Pending: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
+            )
+
+    logger.info(f"✅ Tier [{label}] successfully shut down.")
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command()
+@click.argument("labels", nargs=-1, required=True)
+@click.option("--output", "-o", default="resources.txt", help="Output state file")
+@click.option("--timeout", "-t", default=3600)
+def stop(labels: List[str], output: str, timeout: int):
+    """Stop tiers sequentially. Usage: stop 'app=web' 'app=db'"""
+
+    async def main():
+        await config.load_kube_config()
+        # Clear/Create file at start
+        open(output, "w").close()
+
+        async with ApiClient() as api_client:
+            api_v1 = client.AppsV1Api(api_client)
+            for label in labels:
+                logger.info(f"🚀 Processing Shutdown Tier: {label}")
+                await run_stop_tier(api_v1, label, output, timeout)
+        logger.info("🏁 Sequence complete. Cluster is gracefully stopped.")
+
+    asyncio.run(main())
+
+
+@cli.command()
+@click.argument("labels", nargs=-1, required=True)
+@click.option("--file", "-f", default="resources.txt")
+@click.option("--timeout", "-t", default=3600, help="Seconds to wait per resource")
+def start(labels: List[str], file: str, timeout: int):
+    asyncio.run(run_start_sequence(labels, file, timeout))
+
+
+async def run_start_sequence(labels: List[str], file_path: str, timeout: int) -> None:
+    await config.load_kube_config()
+
+    async with ApiClient() as api_client:
+        apps_v1 = client.AppsV1Api(api_client)
+
+        # 1. Load the entire snapshot into memory for filtering
+        try:
+            with open(file_path, "r") as f:
+                # Format: Kind Namespace Name Replicas
+                snapshot_lines = [line.strip().split() for line in f if line.strip()]
+        except FileNotFoundError:
+            logger.error(f"Snapshot file {file_path} not found.")
+            return
+
+        # 2. Iterate through labels in the order provided
+        for label in labels:
+            logger.info(f"🚀 Starting Tier: {label}")
+
+            # Find resources in this tier by querying K8s for the label
+            # then matching against our snapshot file data
+            tier_resources = await get_resources_by_label(apps_v1, label)
+
+            # Cross-reference: Only start things that are in BOTH the K8s label query AND our file
+            # This ensures we restore them to the CORRECT previous replica count
+            to_restore = []
+            tier_keys = {(r["ns"], r["name"]) for r in tier_resources}
+
+            for kind, ns, name, reps in snapshot_lines:
+                if (ns, name) in tier_keys:
+                    to_restore.append((kind, ns, name, int(reps)))
+
+            if not to_restore:
+                logger.warning(f"No resources found in snapshot for tier: {label}")
+                continue
+
+            # 3. Scale and Wait for this specific tier
+            await process_start_tier(apps_v1, to_restore, timeout, label)
+
+        logger.info("🏁 All tiers started successfully.")
+
+
+async def get_resources_by_label(api: client.AppsV1Api, label: str) -> List[dict]:
+    """Helper to find what currently exists in the cluster with this label."""
+    targets = [
+        api.list_deployment_for_all_namespaces,
+        api.list_stateful_set_for_all_namespaces,
+    ]
+    found = []
+    for list_func in targets:
+        resp = await list_func(label_selector=label)
+        for item in resp.items:
+            found.append({"ns": item.metadata.namespace, "name": item.metadata.name})
+    return found
+
+
+async def process_start_tier(
+    api: client.AppsV1Api, resources: list, timeout: int, label: str
+):
+    active_jobs = set()
+    scale_tasks = []
+    wait_tasks = []
+
+    # Wrapper to track which job finishes
+    async def tracked_wait(kind, ns, name, target, t_out):
+        await wait_for_healthy(api, kind, ns, name, target, t_out)
+        return (ns, name)
+
+    for kind, ns, name, reps in resources:
+        active_jobs.add((ns, name))
+        scale_tasks.append(scale_resource(api, kind, ns, name, reps))
+        wait_tasks.append(tracked_wait(kind, ns, name, reps, timeout))
+
+    # Trigger all scales for this tier
+    await asyncio.gather(*scale_tasks)
+
+    # Monitor health
+    for coro in asyncio.as_completed(wait_tasks):
+        finished_ns, finished_name = await coro
+        active_jobs.discard((finished_ns, finished_name))
+
+        if active_jobs:
+            remaining_ns = sorted({ns for ns, name in active_jobs})
+            logger.info(
+                f"[{label}] Pending Health: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
+            )
+
+    logger.info(f"✅ Tier [{label}] is healthy.")
+
+
+if __name__ == "__main__":  # run the click CLI only when executed as a script
+    cli()
--- a/scripts/image_pull.sh
+++ b/scripts/image_pull.sh
@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+for n in $(kubectl get nodes -o wide | grep node | awk '{print $1}'); do 
+    echo $n;
+    kubectl drain $n --ignore-daemonsets --delete-emptydir-data && \
+    ssh wizard@$n < image_pull_remote.sh
+    # Check result
+    kubectl get --raw "/api/v1/nodes/$n/proxy/configz" | jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
+    kubectl uncordon $n
+done
--- a/scripts/image_pull_remote.sh
+++ b/scripts/image_pull_remote.sh
@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Containerd: force max_concurrent_downloads to 5 on whichever line sets it,
+# then restart the daemon to pick the change up.
+containerd_cfg=/etc/containerd/config.toml
+sudo sed -i 's/.*max_concurrent_downloads.*/max_concurrent_downloads = 5/g' "$containerd_cfg"
+sudo systemctl restart containerd
+
+# Kubelet:
+#   serializeImagePulls: false  -> allow container images to download in parallel
+#   maxParallelImagePulls: 5    -> cap the number of parallel image pulls
+# Drop any existing definitions of both keys, then append the new values.
+kubelet_cfg=/var/lib/kubelet/config.yaml
+sudo sed -i -e '/serializeImagePulls:/d' -e '/maxParallelImagePulls:/d' "$kubelet_cfg" && \
+echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 5' | sudo tee -a "$kubelet_cfg"
+sudo systemctl restart kubelet
--- a/scripts/setup-containerd-pullthrough.sh
+++ b/scripts/setup-containerd-pullthrough.sh
@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+############################################
+# CONFIGURATION
+############################################
+
+# Internal pull-through registry endpoint
+# Examples:
+#   http://registry.internal:5000
+#   https://registry.internal
+INTERNAL_REGISTRY="http://10.0.20.10:5002"
+
+# Path where containerd reads registry configs
+CERTS_DIR="/etc/containerd/certs.d"
+
+# Optional: path to CA file if INTERNAL_REGISTRY uses HTTPS with custom CA
+# Leave empty if not needed
+INTERNAL_CA_PATH=""
+
+# Restart containerd at the end
+RESTART_CONTAINERD=true
+
+############################################
+# REGISTRIES TO MIRROR
+############################################
+
+REGISTRIES=(
+  "docker.io"
+  "registry-1.docker.io"
+  "registry.k8s.io"
+  "quay.io"
+  "ghcr.io"
+  "gcr.io"
+  "us-docker.pkg.dev"
+  "public.ecr.aws"
+  "mcr.microsoft.com"
+)
+
+############################################
+# FUNCTIONS
+############################################
+
+require_root() {
+  # Abort the script unless the effective user id is 0.
+  local uid
+  uid="$(id -u)"
+  if [[ "$uid" -eq 0 ]]; then
+    return 0
+  fi
+  echo "ERROR: must be run as root" >&2
+  exit 1
+}
+
+ensure_containerd_config_path() {
+  # Make sure containerd's CRI registry section points config_path at
+  # $CERTS_DIR, generating a default config first when none exists.
+  local cfg="/etc/containerd/config.toml"
+  # Header of the CRI registry table (basic regex, usable by grep and sed).
+  local section_re='\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\]'
+  # Matches a config_path assignment that already points at $CERTS_DIR.
+  local wanted_re="config_path *= *\"$CERTS_DIR\""
+
+  if [[ ! -f "$cfg" ]]; then
+    echo "Generating default containerd config"
+    mkdir -p "$(dirname "$cfg")"
+    containerd config default > "$cfg"
+  fi
+
+  # Nothing to do when the desired setting is already present.
+  if grep -q "$wanted_re" "$cfg"; then
+    return 0
+  fi
+
+  echo "Enabling config_path in containerd config"
+
+  if grep -q "$section_re" "$cfg"; then
+    # The section may already carry a config_path key (for example an empty
+    # one). Rewrite it in place, limited to the lines between the section
+    # header and the next table header: appending a second key would
+    # produce a duplicate TOML key.
+    sed -i "/$section_re/,/^[[:space:]]*\[/ s|^\([[:space:]]*\)config_path[[:space:]]*=.*|\1config_path = \"$CERTS_DIR\"|" "$cfg"
+
+    # No existing key was rewritten: add one right below the section header.
+    if ! grep -q "$wanted_re" "$cfg"; then
+      sed -i "/$section_re/a \  config_path = \"$CERTS_DIR\"" "$cfg"
+    fi
+  else
+    # Section missing entirely: append it at the end of the file.
+    printf '\n[plugins."io.containerd.grpc.v1.cri".registry]\n  config_path = "%s"\n' "$CERTS_DIR" >> "$cfg"
+  fi
+}
+
+write_hosts_toml() {
+  local registry="$1"
+  local dir="$CERTS_DIR/$registry"
+  local file="$dir/hosts.toml"
+
+  mkdir -p "$dir"
+
+  cat > "$file" <<EOF
+server = "https://$registry"
+
+[host."$INTERNAL_REGISTRY"]
+  capabilities = ["pull", "resolve"]
+EOF
+
+  if [[ -n "$INTERNAL_CA_PATH" ]]; then
+    cat >> "$file" <<EOF
+  ca = "$INTERNAL_CA_PATH"
+EOF
+  fi
+}
+
+############################################
+# MAIN
+############################################
+
+# Preconditions: root privileges and a containerd config with config_path set.
+require_root
+ensure_containerd_config_path
+
+echo "Creating registry mirror configurations..."
+
+# One hosts.toml per upstream registry listed in REGISTRIES.
+for registry in "${REGISTRIES[@]}"; do
+  echo "  - $registry"
+  write_hosts_toml "$registry"
+done
+
+# Restart only when explicitly enabled in the configuration section.
+[[ "$RESTART_CONTAINERD" != "true" ]] || {
+  echo "Restarting containerd"
+  systemctl restart containerd
+}
+
+echo "Done."