diff --git a/stacks/beads-server/main.tf b/stacks/beads-server/main.tf index 62bc8ddb..ea554489 100644 --- a/stacks/beads-server/main.tf +++ b/stacks/beads-server/main.tf @@ -336,7 +336,11 @@ resource "kubernetes_deployment" "workbench" { spec { init_container { name = "seed-config" - image = "dolthub/dolt-workbench:latest" + # Pinned 2026-05-26: Keel rolled :latest → :0.1.0 on 2026-05-17, + # which speaks an old GraphQL schema (missing `type` arg on + # addDatabaseConnection) → seed-config fails, UI can't add the + # connection. :0.3.73 was the last Keel-resolved good tag. + image = "dolthub/dolt-workbench:0.3.73" command = ["sh", "-c", <<-EOT # Seed connection store cp /config/store.json /store/store.json @@ -365,7 +369,11 @@ resource "kubernetes_deployment" "workbench" { container { name = "workbench" - image = "dolthub/dolt-workbench:latest" + # Pinned 2026-05-26: Keel rolled :latest → :0.1.0 on 2026-05-17, + # which speaks an old GraphQL schema (missing `type` arg on + # addDatabaseConnection) → seed-config fails, UI can't add the + # connection. :0.3.73 was the last Keel-resolved good tag. 
+ image = "dolthub/dolt-workbench:0.3.73" command = ["sh", "-c", <<-EOT # Patch GraphQL server to listen on 0.0.0.0 (IPv4) — Node 18+ defaults to IPv6 sed -i 's|app.listen(9002)|app.listen(9002,"0.0.0.0")|g' /app/graphql-server/dist/main.js diff --git a/stacks/excalidraw/.terraform.lock.hcl b/stacks/excalidraw/.terraform.lock.hcl index 522ec0cc..1445955c 100644 --- a/stacks/excalidraw/.terraform.lock.hcl +++ b/stacks/excalidraw/.terraform.lock.hcl @@ -87,3 +87,11 @@ provider "registry.terraform.io/hashicorp/vault" { "zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83", ] } + +provider "registry.terraform.io/telmate/proxmox" { + version = "3.0.2-rc07" + constraints = "3.0.2-rc07" + hashes = [ + "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=", + ] +} diff --git a/stacks/excalidraw/providers.tf b/stacks/excalidraw/providers.tf index d5469984..3d0bc2c6 100644 --- a/stacks/excalidraw/providers.tf +++ b/stacks/excalidraw/providers.tf @@ -20,6 +20,10 @@ terraform { source = "gavinbunney/kubectl" version = "~> 1.14" } + proxmox = { + source = "telmate/proxmox" + version = "3.0.2-rc07" + } } } diff --git a/stacks/immich/.terraform.lock.hcl b/stacks/immich/.terraform.lock.hcl index e6c6290c..8ca8bad0 100644 --- a/stacks/immich/.terraform.lock.hcl +++ b/stacks/immich/.terraform.lock.hcl @@ -41,21 +41,9 @@ provider "registry.terraform.io/goauthentik/authentik" { } provider "registry.terraform.io/hashicorp/helm" { - version = "3.1.1" + version = "3.1.2" hashes = [ - "h1:5b2ojWKT0noujHiweCds37ZreRFRQLNaErdJLusJN88=", - "zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275", - "zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a", - "zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29", - "zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104", - "zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990", - "zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34", - 
"zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8", - "zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1", - "zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b", - "zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903", - "zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4", - "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "h1:lIuknMfM7+QTzPWs8VBocstZF0B3TpEMIj/bw+dLAOs=", ] } @@ -85,3 +73,11 @@ provider "registry.terraform.io/hashicorp/vault" { "zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83", ] } + +provider "registry.terraform.io/telmate/proxmox" { + version = "3.0.2-rc07" + constraints = "3.0.2-rc07" + hashes = [ + "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=", + ] +} diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index deae2507..d8401720 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -157,7 +157,8 @@ resource "kubernetes_namespace" "immich" { # Override the kyverno-generated tier-2-gpu quota (12Gi requests.memory). # Immich-server needs 8Gi to absorb face-detection burst spikes (OOM 2026-04-26) # without OOM. Plus immich-machine-learning (3.5Gi) + immich-postgresql (3Gi) + -# backup CronJobs ≈ 15.5Gi. 20Gi gives ~4.5Gi headroom. +# backup CronJobs ≈ 15.5Gi. 24Gi gives ~8Gi headroom (raised 2026-05-26 — was at +# 88% with VPA bumps creeping up on immich-server burst behaviour). 
resource "kubernetes_resource_quota" "immich" { metadata { name = "tier-quota" @@ -166,8 +167,8 @@ resource "kubernetes_resource_quota" "immich" { spec { hard = { "requests.cpu" = "8" - "requests.memory" = "20Gi" - "limits.memory" = "32Gi" + "requests.memory" = "24Gi" + "limits.memory" = "40Gi" pods = "40" } } diff --git a/stacks/immich/providers.tf b/stacks/immich/providers.tf index d5469984..3d0bc2c6 100644 --- a/stacks/immich/providers.tf +++ b/stacks/immich/providers.tf @@ -20,6 +20,10 @@ terraform { source = "gavinbunney/kubectl" version = "~> 1.14" } + proxmox = { + source = "telmate/proxmox" + version = "3.0.2-rc07" + } } } diff --git a/stacks/keel/main.tf b/stacks/keel/main.tf index 7a794d8f..e68b8576 100644 --- a/stacks/keel/main.tf +++ b/stacks/keel/main.tf @@ -46,6 +46,16 @@ resource "helm_release" "keel" { atomic = true values = [yamlencode({ + # EMERGENCY STOP — scaled to 0 on 2026-05-26 16:42 UTC. Keel was actively + # rewriting tag strings (not just digests) despite the + # `keel.sh/match-tag=true` annotation injected by Kyverno that's supposed + # to constrain it to digest-only watches. Known casualties this round: + # uptime-kuma (2 → 1, 4h CrashLoopBackOff), n8n (1.80.5 → 0.1.2, silent + # degradation), beads-server/dolt-workbench (0.3.73 → 0.1.0), and ~10 + # other deployments with downgrade-flavored change-cause annotations. + # Re-enable only after root-causing why match-tag isn't being enforced, + # OR after migrating each app to a content-addressed (SHA) tag pin. + replicaCount = 0 # Prometheus pod-annotation scrape — picks up Keel-specific metrics # (pending_approvals, poll_trigger_tracked_images, registries_scanned_total{image,registry}) # on container port 9300 /metrics. 
The cluster's `kubernetes-pods` diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index ee493906..b4e85e96 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -568,6 +568,9 @@ resource "kubernetes_manifest" "yotovski_ingress_route" { # Custom ResourceQuota for monitoring — larger than the default 1-cluster tier quota # because monitoring runs 29+ pods (Prometheus, Grafana, Loki, Alloy, exporters, etc.) +# Headroom: cluster grew from 5 → 7 workers (k8s-node5/6 added 2026-05-26); per-node +# DaemonSets (alloy 562Mi, node-exporter 100Mi, loki-canary 128Mi, sysctl-inotify 4Mi) +# now consume ~2Gi more than pre-expansion. 20Gi gives ~3-4Gi safe headroom. resource "kubernetes_resource_quota" "monitoring" { metadata { name = "monitoring-quota" @@ -576,7 +579,7 @@ resource "kubernetes_resource_quota" "monitoring" { spec { hard = { "requests.cpu" = "16" - "requests.memory" = "16Gi" + "requests.memory" = "20Gi" "limits.memory" = "64Gi" pods = "100" } diff --git a/stacks/n8n/.terraform.lock.hcl b/stacks/n8n/.terraform.lock.hcl index 0fc9b894..e9db626a 100644 --- a/stacks/n8n/.terraform.lock.hcl +++ b/stacks/n8n/.terraform.lock.hcl @@ -111,3 +111,11 @@ provider "registry.terraform.io/hashicorp/vault" { "zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83", ] } + +provider "registry.terraform.io/telmate/proxmox" { + version = "3.0.2-rc07" + constraints = "3.0.2-rc07" + hashes = [ + "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=", + ] +} diff --git a/stacks/n8n/providers.tf b/stacks/n8n/providers.tf index d5469984..3d0bc2c6 100644 --- a/stacks/n8n/providers.tf +++ b/stacks/n8n/providers.tf @@ -20,6 +20,10 @@ terraform { source = "gavinbunney/kubectl" version = "~> 1.14" } + proxmox = { + source = "telmate/proxmox" + version = "3.0.2-rc07" + } } } diff --git a/stacks/uptime-kuma/modules/uptime-kuma/main.tf b/stacks/uptime-kuma/modules/uptime-kuma/main.tf 
index f62ea402..345fdeea 100644 --- a/stacks/uptime-kuma/modules/uptime-kuma/main.tf +++ b/stacks/uptime-kuma/modules/uptime-kuma/main.tf @@ -81,9 +81,23 @@ resource "kubernetes_deployment" "uptime-kuma" { labels = { app = "uptime-kuma" tier = var.tier + # Opt out of Kyverno's inject-keel-annotations ClusterPolicy. The Kyverno + # rule excludes any workload with this LABEL (see + # stacks/kyverno/modules/kyverno/keel-annotations.tf, exclude.any + # matchLabels keel.sh/policy=never). Without the label, Kyverno would + # silently re-add `keel.sh/policy=force` after every reconcile, undoing + # the annotation below. + "keel.sh/policy" = "never" } annotations = { "reloader.stakater.com/search" = "true" + # Stop Keel polling for this workload. Even with match-tag=true, + # Keel auto-downgraded :2 → :1 on 2026-05-26 12:14 UTC; v1 booted + # into SQLite mode and couldn't read the existing MariaDB store + # (db-config.json) → 4h CrashLoopBackOff. Pinning the image string + # alone isn't enough because Keel kept fighting the apply. Combined + # with the matching LABEL above, this fully bypasses Keel. + "keel.sh/policy" = "never" } } spec { @@ -108,7 +122,14 @@ resource "kubernetes_deployment" "uptime-kuma" { } spec { container { - image = "louislam/uptime-kuma:2" + # Pinned to 2.3.2 because Keel auto-downgraded :2 → :1 on 2026-05-26 + # 12:14 UTC despite the Kyverno-injected `keel.sh/match-tag=true` + + # `keel.sh/policy=force` annotation pair (which is supposed to gate + # digest changes only). The v1 image opens kuma.db (SQLite) at boot + # and can't read the v2 db-config.json → 4h CrashLoopBackOff while + # the MariaDB store sat intact. Until the keel-match-tag regression + # is root-caused, pin full patch versions explicitly. 
+ image = "louislam/uptime-kuma:2.3.2" name = "uptime-kuma" resources { @@ -167,9 +188,12 @@ resource "kubernetes_deployment" "uptime-kuma" { lifecycle { ignore_changes = [ spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 - metadata[0].annotations["keel.sh/policy"], + # `keel.sh/policy` is intentionally NOT ignored — we want TF to own it + # as `never` so drift back to `force` (Kyverno reconcile or manual kubectl) + # shows up in the plan and is reverted on the next apply. metadata[0].annotations["keel.sh/trigger"], - metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno ] } } diff --git a/stacks/wealthfolio/main.tf b/stacks/wealthfolio/main.tf index 8f6201a8..bd5349b3 100644 --- a/stacks/wealthfolio/main.tf +++ b/stacks/wealthfolio/main.tf @@ -146,7 +146,10 @@ resource "kubernetes_deployment" "wealthfolio" { } spec { container { - image = "afadil/wealthfolio:3.2" + # Pinned 2026-05-26: prior live was :3.2.1, Keel rolled it to :2.0 + # on 2026-05-26 03:13, then truncated to :3.2 at 06:46 (Keel string + # match dropped the patch suffix). Restore the patch version. + image = "afadil/wealthfolio:3.2.1" name = "wealthfolio" port { container_port = 8080