chore: add untracked stacks, scripts, and agent configs
- New stacks: beads-server, hermes-agent
- Terragrunt tiers.tf for infra, phpipam, status-page
- Secrets symlinks for vault, phpipam, hermes-agent
- Scripts: cluster_manager, image_pull, containerd pullthrough setup
- Frigate config, audiblez-web app source, n8n workflows dir
- Claude agent: service-upgrade, reference: upgrade-config.json
- Removed: claudeception skill, excalidraw empty submodule, temp listings

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent bd41bb9230
commit bcad200a23
44 changed files with 3819 additions and 0 deletions
scripts/cluster_manager.py (normal file, +277)
@@ -0,0 +1,277 @@
import asyncio
import click
import logging
import time
from typing import List, Union, Optional
from kubernetes_asyncio import client, config
from kubernetes_asyncio.client.api_client import ApiClient

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)


async def wait_for_healthy(
    api_instance: client.AppsV1Api,
    resource_type: str,
    namespace: str,
    name: str,
    target_replicas: int,
    timeout: int = 300,
) -> None:
    start_time = time.time()
    logger.info(
        f"Waiting for {resource_type} {name} to reach {target_replicas} replicas..."
    )

    while True:
        if time.time() - start_time > timeout:
            logger.error(f"❌ Timeout reached for {resource_type} {name}")
            return

        try:
            if resource_type.lower() == "deployment":
                res = await api_instance.read_namespaced_deployment_status(
                    name, namespace
                )
                ready = res.status.ready_replicas or 0
                updated = res.status.updated_replicas or 0
                if ready == target_replicas and updated == target_replicas:
                    break
            else:  # StatefulSet
                res = await api_instance.read_namespaced_stateful_set_status(
                    name, namespace
                )
                ready = res.status.ready_replicas or 0
                if ready == target_replicas:
                    break

        except Exception as e:
            logger.debug(f"Retrying status check for {name}: {e}")

        await asyncio.sleep(5)

    logger.info(f"✅ {resource_type} {name} is now healthy.")


async def wait_for_zero(
    api: client.AppsV1Api, kind: str, ns: str, name: str, timeout: int
) -> tuple[str, str]:
    start_time = asyncio.get_event_loop().time()
    while (asyncio.get_event_loop().time() - start_time) < timeout:
        try:
            res = await (
                api.read_namespaced_deployment_status(name, ns)
                if kind.lower() == "deployment"
                else api.read_namespaced_stateful_set_status(name, ns)
            )
            if (res.status.ready_replicas or 0) == 0:
                return ns, name
        except Exception:
            return ns, name  # Assume gone if error
        await asyncio.sleep(3)
    logger.error(f"Timeout: {kind} {ns}/{name} still has running pods.")
    return ns, name


async def scale_resource(
    api_instance: client.AppsV1Api,
    resource_type: str,
    namespace: str,
    name: str,
    replicas: int,
) -> None:
    body = {"spec": {"replicas": replicas}}
    try:
        if resource_type.lower() == "deployment":
            await api_instance.patch_namespaced_deployment_scale(name, namespace, body)
        else:
            await api_instance.patch_namespaced_stateful_set_scale(
                name, namespace, body
            )
    except Exception as e:
        logger.error(f"Failed to scale {resource_type} {name}: {e}")


async def run_stop_tier(
    api_v1: client.AppsV1Api, label: str, output_file: str, timeout: int
) -> None:
    """Processes a single label tier: saves, scales to 0, and waits."""
    excluded_ns = ["kube-system", "kube-public", "kube-node-lease"]

    # 1. Discover
    targets = [
        ("Deployment", api_v1.list_deployment_for_all_namespaces),
        ("StatefulSet", api_v1.list_stateful_set_for_all_namespaces),
    ]

    tier_resources = []
    for kind, list_func in targets:
        resp = await list_func(label_selector=label)
        tier_resources.extend(
            [
                (kind, item)
                for item in resp.items
                if item.metadata.namespace not in excluded_ns
            ]
        )

    if not tier_resources:
        logger.warning(f"No resources found for label: {label}")
        return

    # 2. Save & Scale
    active_jobs: set[tuple[str, str]] = set()
    wait_tasks = []

    # Append to file so we don't overwrite previous tiers
    with open(output_file, "a") as f:
        for kind, item in tier_resources:
            ns, name = item.metadata.namespace, item.metadata.name
            reps = item.spec.replicas or 0
            f.write(f"{kind} {ns} {name} {reps}\n")
            active_jobs.add((ns, name))

            await scale_resource(api_v1, kind, ns, name, 0)
            wait_tasks.append(wait_for_zero(api_v1, kind, ns, name, timeout))

    # 3. Wait for this tier to finish before moving to next
    logger.info(f"Tier [{label}]: Waiting for {len(active_jobs)} resources to stop...")
    for coro in asyncio.as_completed(wait_tasks):
        finished_ns, finished_name = await coro
        active_jobs.discard((finished_ns, finished_name))
        if active_jobs:
            remaining_ns = sorted({ns for ns, name in active_jobs})
            logger.info(
                f"[{label}] Pending: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
            )

    logger.info(f"✅ Tier [{label}] successfully shut down.")


@click.group()
def cli():
    pass


@cli.command()
@click.argument("labels", nargs=-1, required=True)
@click.option("--output", "-o", default="resources.txt", help="Output state file")
@click.option("--timeout", "-t", default=3600)
def stop(labels: List[str], output: str, timeout: int):
    """Stop tiers sequentially. Usage: stop 'app=web' 'app=db'"""

    async def main():
        await config.load_kube_config()
        # Clear/Create file at start
        open(output, "w").close()

        async with ApiClient() as api_client:
            api_v1 = client.AppsV1Api(api_client)
            for label in labels:
                logger.info(f"🚀 Processing Shutdown Tier: {label}")
                await run_stop_tier(api_v1, label, output, timeout)
        logger.info("🏁 Sequence complete. Cluster is gracefully stopped.")

    asyncio.run(main())


@cli.command()
@click.argument("labels", nargs=-1, required=True)
@click.option("--file", "-f", default="resources.txt")
@click.option("--timeout", "-t", default=3600, help="Seconds to wait per resource")
def start(labels: List[str], file: str, timeout: int):
    asyncio.run(run_start_sequence(labels, file, timeout))


async def run_start_sequence(labels: List[str], file_path: str, timeout: int) -> None:
    await config.load_kube_config()

    async with ApiClient() as api_client:
        apps_v1 = client.AppsV1Api(api_client)

        # 1. Load the entire snapshot into memory for filtering
        try:
            with open(file_path, "r") as f:
                # Format: Kind Namespace Name Replicas
                snapshot_lines = [line.strip().split() for line in f if line.strip()]
        except FileNotFoundError:
            logger.error(f"Snapshot file {file_path} not found.")
            return

        # 2. Iterate through labels in the order provided
        for label in labels:
            logger.info(f"🚀 Starting Tier: {label}")

            # Find resources in this tier by querying K8s for the label
            # then matching against our snapshot file data
            tier_resources = await get_resources_by_label(apps_v1, label)

            # Cross-reference: Only start things that are in BOTH the K8s label query AND our file
            # This ensures we restore them to the CORRECT previous replica count
            to_restore = []
            tier_keys = {(r["ns"], r["name"]) for r in tier_resources}

            for kind, ns, name, reps in snapshot_lines:
                if (ns, name) in tier_keys:
                    to_restore.append((kind, ns, name, int(reps)))

            if not to_restore:
                logger.warning(f"No resources found in snapshot for tier: {label}")
                continue

            # 3. Scale and Wait for this specific tier
            await process_start_tier(apps_v1, to_restore, timeout, label)

    logger.info("🏁 All tiers started successfully.")


async def get_resources_by_label(api: client.AppsV1Api, label: str) -> List[dict]:
    """Helper to find what currently exists in the cluster with this label."""
    targets = [
        api.list_deployment_for_all_namespaces,
        api.list_stateful_set_for_all_namespaces,
    ]
    found = []
    for list_func in targets:
        resp = await list_func(label_selector=label)
        for item in resp.items:
            found.append({"ns": item.metadata.namespace, "name": item.metadata.name})
    return found


async def process_start_tier(
    api: client.AppsV1Api, resources: list, timeout: int, label: str
):
    active_jobs = set()
    scale_tasks = []
    wait_tasks = []

    # Wrapper to track which job finishes
    async def tracked_wait(kind, ns, name, target, t_out):
        await wait_for_healthy(api, kind, ns, name, target, t_out)
        return (ns, name)

    for kind, ns, name, reps in resources:
        active_jobs.add((ns, name))
        scale_tasks.append(scale_resource(api, kind, ns, name, reps))
        wait_tasks.append(tracked_wait(kind, ns, name, reps, timeout))

    # Trigger all scales for this tier
    await asyncio.gather(*scale_tasks)

    # Monitor health
    for coro in asyncio.as_completed(wait_tasks):
        finished_ns, finished_name = await coro
        active_jobs.discard((finished_ns, finished_name))

        if active_jobs:
            remaining_ns = sorted({ns for ns, name in active_jobs})
            logger.info(
                f"[{label}] Pending Health: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
            )

    logger.info(f"✅ Tier [{label}] is healthy.")


if __name__ == "__main__":
    cli()
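A minimal usage sketch for the CLI above, assuming kubernetes_asyncio and click are installed and a kubeconfig is reachable. The 'app=web' / 'app=db' labels come from the stop command's docstring; the snapshot line shown and the reversed label order on start are illustrative, not part of the commit:

# Snapshot replica counts and stop tiers in order: web first, then db
python scripts/cluster_manager.py stop 'app=web' 'app=db' --output resources.txt

# resources.txt records one line per workload: <Kind> <namespace> <name> <replicas>
# e.g. Deployment default web-frontend 3   (hypothetical entry)

# Restore from the snapshot, bringing db back before web
python scripts/cluster_manager.py start 'app=db' 'app=web' --file resources.txt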
scripts/image_pull.sh (executable file, +10)
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

for n in $(kubectl get nodes -o wide | grep node | awk '{print $1}'); do
  echo $n;
  kubectl drain $n --ignore-daemonsets --delete-emptydir-data && \
    ssh wizard@$n < image_pull_remote.sh
  # Check result
  kubectl get --raw "/api/v1/nodes/$n/proxy/configz" | jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
  kubectl uncordon $n
done
scripts/image_pull_remote.sh (executable file, +14)
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Containerd
sudo sed -i 's/.*max_concurrent_downloads.*/max_concurrent_downloads = 5/g' /etc/containerd/config.toml
sudo systemctl restart containerd

# Kubelet
#   serializeImagePulls: false   -> allow container images to be downloaded in parallel
#   maxParallelImagePulls: 5     -> limit the number of parallel image pulls

sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 5' | sudo tee -a /var/lib/kubelet/config.yaml
sudo systemctl restart kubelet
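Once image_pull_remote.sh has run on a node, the configz check in image_pull.sh should reflect the new kubelet settings. A sketch of the expected result, assuming the kubelet restarted cleanly and nothing else overrides these keys ("<node>" is a placeholder):

kubectl get --raw "/api/v1/nodes/<node>/proxy/configz" \
  | jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
# {
#   "serializeImagePulls": false,
#   "maxParallelImagePulls": 5
# }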
scripts/setup-containerd-pullthrough.sh (executable file, +115)
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
set -euo pipefail

############################################
# CONFIGURATION
############################################

# Internal pull-through registry endpoint
# Examples:
#   http://registry.internal:5000
#   https://registry.internal
INTERNAL_REGISTRY="http://10.0.20.10:5002"

# Path where containerd reads registry configs
CERTS_DIR="/etc/containerd/certs.d"

# Optional: path to CA file if INTERNAL_REGISTRY uses HTTPS with custom CA
# Leave empty if not needed
INTERNAL_CA_PATH=""

# Restart containerd at the end
RESTART_CONTAINERD=true

############################################
# REGISTRIES TO MIRROR
############################################

REGISTRIES=(
  "docker.io"
  "registry-1.docker.io"
  "registry.k8s.io"
  "quay.io"
  "ghcr.io"
  "gcr.io"
  "us-docker.pkg.dev"
  "public.ecr.aws"
  "mcr.microsoft.com"
)

############################################
# FUNCTIONS
############################################

require_root() {
  if [[ "$(id -u)" -ne 0 ]]; then
    echo "ERROR: must be run as root" >&2
    exit 1
  fi
}

ensure_containerd_config_path() {
  local cfg="/etc/containerd/config.toml"

  if [[ ! -f "$cfg" ]]; then
    echo "Generating default containerd config"
    containerd config default > "$cfg"
  fi

  if ! grep -q 'config_path *= *"/etc/containerd/certs.d"' "$cfg"; then
    echo "Enabling config_path in containerd config"

    # Minimal and safe append if section exists
    if grep -q '\[plugins\."io.containerd.grpc.v1.cri".registry\]' "$cfg"; then
      sed -i '/\[plugins\."io.containerd.grpc.v1.cri".registry\]/a \ config_path = "/etc/containerd/certs.d"' "$cfg"
    else
      cat >> "$cfg" <<'EOF'

[plugins."io.containerd.grpc.v1.cri".registry]
  config_path = "/etc/containerd/certs.d"
EOF
    fi
  fi
}

write_hosts_toml() {
  local registry="$1"
  local dir="$CERTS_DIR/$registry"
  local file="$dir/hosts.toml"

  mkdir -p "$dir"

  cat > "$file" <<EOF
server = "https://$registry"

[host."$INTERNAL_REGISTRY"]
  capabilities = ["pull", "resolve"]
EOF

  if [[ -n "$INTERNAL_CA_PATH" ]]; then
    cat >> "$file" <<EOF
  ca = "$INTERNAL_CA_PATH"
EOF
  fi
}

############################################
# MAIN
############################################

require_root
ensure_containerd_config_path

echo "Creating registry mirror configurations..."

for r in "${REGISTRIES[@]}"; do
  echo "  - $r"
  write_hosts_toml "$r"
done

if [[ "$RESTART_CONTAINERD" == "true" ]]; then
  echo "Restarting containerd"
  systemctl restart containerd
fi

echo "Done."
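For reference, with the defaults above the script writes the same mirror stanza for every registry in REGISTRIES, only the server line differs. Shown here for docker.io, followed by a quick spot-check through CRI, assuming crictl is installed on the node:

# /etc/containerd/certs.d/docker.io/hosts.toml (generated content)
#   server = "https://docker.io"
#
#   [host."http://10.0.20.10:5002"]
#     capabilities = ["pull", "resolve"]

# Pull through the mirror to confirm containerd picks up the config
crictl pull docker.io/library/alpine:latest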