sync regenerated providers.tf + upstream changes

- Terragrunt-regenerated providers.tf across stacks (vault_root_token
  variable removed from root generate block)
- Upstream monitoring/openclaw/CLAUDE.md changes from rebase
This commit is contained in:
Viktor Barzin 2026-03-22 02:56:04 +02:00
parent 1bf8676a6d
commit 1c13af142d
28 changed files with 336 additions and 132 deletions

View file

@ -113,6 +113,22 @@ Repo IDs: infra=1, Website=2, finance=3, health=4, travel_blog=5, webhook-handle
- Every new service gets Prometheus scrape config + Uptime Kuma monitor.
- Key alerts: OOMKill, pod replica mismatch, 4xx/5xx error rates, UPS battery, CPU temp, SSD writes, NFS responsiveness, ClusterMemoryRequestsHigh (>85%), ContainerNearOOM (>85% limit), PodUnschedulable.
## Storage & Backup Architecture
### Cloud Sync (TrueNAS → Synology NAS)
- **Task 1**: Weekly push (Monday 09:00) of `/mnt/main` NFS data to `nas.viktorbarzin.lan:/Backup/Viki/truenas`. Uses `--no-traverse` to skip expensive remote directory listing (~1.8M files) — checks each changed source file individually instead.
- **Snapshot consistency**: Pre-script creates `main@cloudsync-temp`, rclone reads from `/mnt/main/.zfs/snapshot/cloudsync-temp/`, post-script destroys it
- **Excludes**: ytldp, prometheus, logs, post, crowdsec, servarr/downloads, iscsi, iscsi-snaps
### iSCSI Backup Architecture
- iSCSI zvols are raw block devices exported to k8s nodes via democratic-csi
- TrueNAS cannot read filesystem contents inside zvols — only the k8s pod can
- **Local protection**: ZFS snapshots (every 12h, 24h retention + daily, 3-week retention) cover zvols automatically
- **Offsite protection**: Application-level backup CronJobs dump data to NFS paths, which Task 1 syncs to Synology
- **Current CronJob coverage**: MySQL (mysqldump), PostgreSQL (pg_dumpall), Vault (raft snapshot), Redis (BGSAVE), Vaultwarden (sqlite3 .backup)
- **Convention**: Any new iSCSI-backed app MUST add a backup CronJob to its Terraform stack that writes to `/mnt/main/<app>-backup/`
- **Uncovered (acceptable)**: Prometheus (disposable metrics), Loki (disposable logs), plotting-book and novelapp (small, low-priority)
## Known Issues
- **CrowdSec Helm upgrade times out**: `terragrunt apply` on platform stack causes CrowdSec Helm release to get stuck in `pending-upgrade`. Workaround: `helm rollback crowdsec <rev> -n crowdsec`. Root cause: likely ResourceQuota CPU at 302% preventing pods from passing readiness probes. Needs investigation.
- **OpenClaw config is writable**: OpenClaw writes to `openclaw.json` at runtime (doctor --fix, plugin auto-enable). Never use subPath ConfigMap mounts for it — use an init container to copy into a writable volume. Needs 2Gi memory + `NODE_OPTIONS=--max-old-space-size=1536`.

View file

@ -196,6 +196,116 @@ def create_event(summary, start_time, end_time=None, calendar_name="Personal",
}
def get_todos(calendar_name=None, include_completed=False):
    """Collect VTODO entries from CalDAV calendars.

    Args:
        calendar_name: if given, only calendars whose name matches
            case-insensitively are scanned.
        include_completed: pass-through to the CalDAV `todos()` query.

    Returns:
        A list of dicts (calendar, summary, status, due, priority, uid,
        description) sorted by due date, then priority, then summary —
        missing values sort last.  Each dict also carries "_cal_obj" and
        "_todo_obj" references so callers can mutate the server-side todo.
    """
    principal = get_client().principal()
    collected = []
    for cal in principal.calendars():
        # Optional case-insensitive calendar filter.
        if calendar_name and cal_name(cal).lower() != calendar_name.lower():
            continue
        try:
            for todo in cal.todos(include_completed=include_completed):
                # Best-effort per-item parsing: a malformed todo is skipped
                # silently rather than aborting the whole listing.
                try:
                    for component in Calendar.from_ical(todo.data).walk():
                        if component.name != "VTODO":
                            continue
                        due_text = None
                        due_prop = component.get("due")
                        if due_prop:
                            moment = due_prop.dt
                            # date-only values have no `hour` attribute
                            fmt = "%Y-%m-%d %H:%M" if hasattr(moment, 'hour') else "%Y-%m-%d"
                            due_text = moment.strftime(fmt)
                        pri = component.get("priority")
                        collected.append({
                            "calendar": cal_name(cal),
                            "summary": str(component.get("summary", "No title")),
                            "status": str(component.get("status", "NEEDS-ACTION")),
                            "due": due_text,
                            "priority": int(pri) if pri else None,
                            "uid": str(component.get("uid", "")),
                            "description": str(component.get("description", "")) or None,
                            "_cal_obj": cal,
                            "_todo_obj": todo,
                        })
                except Exception:
                    pass
        except Exception as e:
            print(f"Warning: Could not fetch todos from {cal_name(cal)}: {e}", file=sys.stderr)
    # Sort: by due date (None last), then priority (None last), then name.
    # "9999-99-99" is a sentinel that string-compares after any real date.
    def _order(item):
        return (
            item["due"] or "9999-99-99",
            99 if item["priority"] is None else item["priority"],
            item["summary"].lower(),
        )
    collected.sort(key=_order)
    return collected
def complete_todo(search_term, calendar_name=None):
    """Mark one open todo as completed, located by substring match.

    Args:
        search_term: case-insensitive substring of the todo summary.
        calendar_name: optional calendar filter passed to get_todos().

    Returns:
        A dict describing the completed todo (status/summary/calendar).

    Raises:
        ValueError: if no open todo matches, or the match is ambiguous.
    """
    needle = search_term.lower()
    candidates = [
        t for t in get_todos(calendar_name=calendar_name, include_completed=False)
        if needle in t["summary"].lower()
    ]
    if not candidates:
        raise ValueError(f"No open todo matching '{search_term}' found.")
    if len(candidates) > 1:
        # Refuse to guess: list every match so the caller can disambiguate.
        names = [f" - [{t['calendar']}] {t['summary']}" for t in candidates]
        raise ValueError(f"Multiple todos match '{search_term}':\n" + "\n".join(names) + "\nBe more specific.")
    chosen = candidates[0]
    # Delegate the actual state change to the CalDAV todo object.
    chosen["_todo_obj"].complete()
    return {
        "status": "completed",
        "summary": chosen["summary"],
        "calendar": chosen["calendar"],
    }
def format_todos(todos, output_format="text"):
    """Render todo dicts for display.

    Args:
        todos: list of dicts as produced by get_todos(), assumed grouped
            by calendar (one "## <calendar>" heading per run of entries).
        output_format: "json" for pretty-printed JSON (internal "_"-prefixed
            keys stripped), anything else for markdown-style text.

    Returns:
        A single formatted string; "No todos found." when the list is
        empty in text mode.
    """
    if output_format == "json":
        # Drop private bookkeeping keys ("_cal_obj", "_todo_obj") before dumping.
        public = [
            {key: val for key, val in item.items() if not key.startswith("_")}
            for item in todos
        ]
        return json.dumps(public, indent=2)
    if not todos:
        return "No todos found."
    rendered = []
    heading = None
    for item in todos:
        # Emit a calendar heading whenever the calendar changes.
        if item["calendar"] != heading:
            heading = item["calendar"]
            rendered.append(f"\n## {heading}")
        marker = "x" if item["status"] == "COMPLETED" else " "
        entry = f"- [{marker}] {item['summary']}"
        if item["due"]:
            entry += f" (due: {item['due']})"
        # Priority 9 is the lowest in iCalendar; only surface higher ones.
        if item["priority"] and item["priority"] < 9:
            entry += f" [priority: {item['priority']}]"
        rendered.append(entry)
        if item["description"]:
            # Truncate long descriptions to keep the listing compact.
            snippet = item["description"][:200]
            if len(item["description"]) > 200:
                snippet += "..."
            rendered.append(f"  {snippet}")
    return "\n".join(rendered)
def format_events(events, output_format="text"):
"""Format events for display."""
if output_format == "json":

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -1,8 +1,16 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
}
variable "kube_config_path" {
type = string
default = "~/.kube/config"
sensitive = true
type = string
default = "~/.kube/config"
}
provider "kubernetes" {
@ -14,3 +22,8 @@ provider "helm" {
config_path = var.kube_config_path
}
}
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}

View file

@ -1,8 +1,16 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
}
variable "kube_config_path" {
type = string
default = "~/.kube/config"
sensitive = true
type = string
default = "~/.kube/config"
}
provider "kubernetes" {
@ -14,3 +22,8 @@ provider "helm" {
config_path = var.kube_config_path
}
}
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -1,6 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
proxmox = {
source = "telmate/proxmox"
version = "3.0.2-rc07"
@ -17,6 +21,11 @@ variable "proxmox_pm_api_url" { type = string }
variable "proxmox_pm_api_token_id" { type = string }
variable "proxmox_pm_api_token_secret" { type = string }
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}
provider "proxmox" {
pm_api_url = var.proxmox_pm_api_url
pm_api_token_id = var.proxmox_pm_api_token_id

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -1,8 +1,16 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
}
variable "kube_config_path" {
type = string
default = "~/.kube/config"
sensitive = true
type = string
default = "~/.kube/config"
}
provider "kubernetes" {
@ -14,3 +22,8 @@ provider "helm" {
config_path = var.kube_config_path
}
}
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -17,6 +17,14 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
}
}
module "nfs_prometheus_backup" {
source = "../../../../modules/kubernetes/nfs_volume"
name = "monitoring-prometheus-backup"
namespace = kubernetes_namespace.monitoring.metadata[0].name
nfs_server = var.nfs_server
nfs_path = "/mnt/main/prometheus-backup"
}
resource "helm_release" "prometheus" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true

View file

@ -148,7 +148,7 @@ prometheus-node-exporter:
server:
# Enable me to delete metrics
extraFlags:
# - "web.enable-admin-api"
- "web.enable-admin-api"
- "web.enable-lifecycle"
- "storage.tsdb.allow-overlapping-blocks"
- "storage.tsdb.retention.size=180GB"
@ -176,10 +176,80 @@ server:
emptyDir:
medium: Memory
sizeLimit: 2Gi
# 2. Mount it over the WAL directory
- name: prometheus-backup
persistentVolumeClaim:
claimName: monitoring-prometheus-backup
extraVolumeMounts:
- name: prometheus-wal-tmpfs
mountPath: /data/wal # Standard path for the chart
mountPath: /data/wal
- name: prometheus-backup
mountPath: /backup
sidecarContainers:
prometheus-backup:
image: docker.io/library/alpine:3.21
command:
- /bin/sh
- -c
- |
echo "Prometheus backup sidecar started"
while true; do
# Sleep until 03:00 UTC daily
hour=$(date -u +%H)
min=$(date -u +%M)
secs_since_midnight=$(( hour * 3600 + min * 60 ))
target_secs=$((3 * 3600)) # 03:00 UTC
if [ $secs_since_midnight -lt $target_secs ]; then
sleep_secs=$((target_secs - secs_since_midnight))
else
sleep_secs=$((86400 - secs_since_midnight + target_secs))
fi
echo "$(date) Sleeping $${sleep_secs}s until next backup window"
sleep $sleep_secs
echo "$(date) Starting Prometheus TSDB snapshot"
# Create TSDB snapshot via admin API (wget is built into BusyBox)
resp=$(wget -qO- --post-data='' http://localhost:9090/api/v1/admin/tsdb/snapshot 2>&1)
if [ $? -ne 0 ]; then
echo "$(date) ERROR: Failed to create snapshot: $resp"
continue
fi
# Parse snapshot name without jq: {"status":"success","data":{"name":"20260322T030000Z-..."}}
snap_name=$(echo "$resp" | grep -o '"name":"[^"]*"' | head -1 | cut -d'"' -f4)
if [ -z "$snap_name" ]; then
echo "$(date) ERROR: Could not parse snapshot name from: $resp"
continue
fi
echo "$(date) Snapshot created: $snap_name"
# Tar snapshot to NFS backup volume
backup_file="prometheus_$(date +%Y%m%d_%H%M).tar.gz"
tar czf "/backup/$backup_file" -C /data/snapshots/ "$snap_name"
echo "$(date) Backup written: $backup_file ($(du -h /backup/$backup_file | cut -f1))"
# Clean up snapshot from data dir
rm -rf "/data/snapshots/$snap_name"
# Rotate: keep 14 days of backups
find /backup -name "prometheus_*.tar.gz" -type f -mtime +14 -delete
# Push success metric to Pushgateway for alerting
echo "prometheus_backup_last_success_timestamp $(date +%s)" | wget -qO- --post-file=- http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/prometheus-backup 2>/dev/null
echo "$(date) Backup complete. Files in /backup:"
ls -lh /backup/prometheus_*.tar.gz 2>/dev/null || echo " (none)"
done
volumeMounts:
- name: storage-volume
mountPath: /data
readOnly: false
- name: prometheus-backup
mountPath: /backup
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
memory: 128Mi
ingress:
enabled: true
ingressClassName: "traefik"
@ -572,6 +642,20 @@ serverFiles:
severity: critical
annotations:
summary: "Redis backup CronJob has never completed successfully"
- alert: PrometheusBackupStale
expr: (time() - prometheus_backup_last_success_timestamp{job="prometheus-backup"}) > 129600
for: 30m
labels:
severity: critical
annotations:
summary: "Prometheus backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: PrometheusBackupNeverRun
expr: absent(prometheus_backup_last_success_timestamp{job="prometheus-backup"})
for: 48h
labels:
severity: warning
annotations:
summary: "Prometheus backup has never reported a successful run"
- alert: CSIDriverCrashLoop
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|iscsi-csi"} > 0
for: 10m

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -330,6 +330,17 @@ resource "kubernetes_deployment" "openclaw" {
spec {
service_account_name = kubernetes_service_account.openclaw.metadata[0].name
# Init 0: fix /workspace ownership so node user can write
init_container {
name = "fix-workspace-perms"
image = "busybox:1.37"
command = ["sh", "-c", "chown 1000:1000 /workspace"]
volume_mount {
name = "workspace"
mount_path = "/workspace"
}
}
# Init 1: copy openclaw.json from ConfigMap into writable NFS home
init_container {
name = "copy-config"
@ -472,6 +483,25 @@ resource "kubernetes_deployment" "openclaw" {
}
}
# Sidecar: playwright-mcp headless browser for agents
container {
name = "playwright-mcp"
image = "docker.io/viktorbarzin/playwright-mcp:v1"
args = ["--headless", "--browser", "chromium", "--no-sandbox", "--port", "3000", "--host", "0.0.0.0"]
port {
container_port = 3000
}
resources {
requests = {
cpu = "50m"
memory = "256Mi"
}
limits = {
memory = "512Mi"
}
}
}
# Sidecar: modelrelay auto-routes to fastest healthy free model
container {
name = "modelrelay"

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -1,8 +1,16 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
}
variable "kube_config_path" {
type = string
default = "~/.kube/config"
sensitive = true
}
provider "kubernetes" {
@ -14,3 +22,8 @@ provider "helm" {
config_path = var.kube_config_path
}
}
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -1,4 +1,13 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
terraform {
required_providers {
vault = {
source = "hashicorp/vault"
version = "~> 4.0"
}
}
}
variable "kube_config_path" {
type = string
default = "~/.kube/config"
@ -13,3 +22,8 @@ provider "helm" {
config_path = var.kube_config_path
}
}
provider "vault" {
address = "https://vault.viktorbarzin.me"
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}

View file

@ -13,12 +13,6 @@ variable "kube_config_path" {
default = "~/.kube/config"
}
variable "vault_root_token" {
type = string
sensitive = true
default = ""
}
provider "kubernetes" {
config_path = var.kube_config_path
}
@ -31,6 +25,5 @@ provider "helm" {
provider "vault" {
address = "https://vault.viktorbarzin.me"
token = var.vault_root_token
skip_child_token = true
}