infra: per-VM I/O caps + terragrunt v0.77 plumbing + state recovery

WHAT LANDED:
- terragrunt.hcl (root): added telmate/proxmox to k8s_providers required_providers. Other stacks just don't instantiate a provider block — harmless. Replaces the same-name override trick the infra stack used to do, which stopped working under Terragrunt v0.77 ("Detected generate blocks with the same name").
- stacks/infra/terragrunt.hcl: new generate "proxmox_provider" block writes proxmox_provider.tf with the provider config; credentials read from Vault secret/viktor at plan/apply time (no env vars).
- modules/create-vm: new mbps_rd / mbps_wr number variables (default 0 = uncapped), wired into scsi0/scsi1 disk{} blocks as mbps_r_concurrent / mbps_wr_concurrent. lifecycle.ignore_changes extended to scsi6..scsi29 (K8s nodes have many CSI-managed slots), plus scsihw and qemu_os (vary per-VM; non-trivial live changes).
- stacks/infra/main.tf: docker-registry-vm gains mbps_rd=40, mbps_wr=40 in HCL — already applied live via qm set on 2026-05-26.

WHAT FAILED AND WAS ROLLED BACK:
- Attempted import of 7 VMs (102 devvm, 103 home-assistant, 200 k8s-master, 201 k8s-node1, 202 k8s-node2, 203 k8s-node3, 204 k8s-node4) via import {} blocks. The telmate/proxmox v3.0.2-rc07 provider mangled proxmox-csi PVC slots on apply for vmid 202 and 203: every scsi slot got rewritten from `vm-9999-pvc-<uuid>` to the boot disk `vm-<vmid>-disk-0`. Restored both .conf files from the 2026-05-24 nightly PVE config backup at /mnt/backup/pve-config/etc-pve/nodes/pve/qemu-server/{202,203}.conf — no reboots, no data loss, K8s CSI reconciled PVC attachments within minutes. Removed the 7 imports from state via `terraform state rm` and re-encrypted. Tracked in beads code-xzbl: blocked on bpg/proxmox provider migration (telmate has the same dynamic-disk defect that bit us on iSCSI back on 2026-04-02; see memory id=539).
LIVE CAPS STILL IN PLACE (qm set, 2026-05-26 ~03:13 UTC; values in MB/s):
  102 devvm            60/60
  103 home-assistant   40/40
  200 k8s-master       100/60
  201 k8s-node1        150/120
  202 k8s-node2        150/120
  203 k8s-node3        150/120
  204 k8s-node4        150/120
  220 docker-registry  40/40
(pfSense 101 BSD + Windows10 300 intentionally out of scope.)

PRE-EXISTING DRIFT EXPOSED (NOT NEW):
- HCL declares k8s-master (200) and k8s-node2 (202) but neither was ever imported into TF state — confirmed against the SOPS-encrypted state in git (lineage e1cc5bb5, serial 42, last touched 2026-04-06). This commit leaves both declarations in place but does NOT import them; that's part of the code-xzbl follow-up.

Closes: code-s9xr
2026-05-26 06:46:47 +00:00 · 2026-05-26 06:46:47 +00:00 · 445feb118f
commit 445feb118f
parent 07bd2e0017
5 changed files with 484 additions and 419 deletions
--- a/modules/create-vm/main.tf
+++ b/modules/create-vm/main.tf
@ -135,6 +135,22 @@ variable "hostpci0" {
  default = "" # e.g., "0000:06:00.0" for Tesla T4 passthrough
 }

+# ---------------------------------------------------------------------------
+# Variables — Disk I/O throttling (MB/s; 0 = uncapped)
+# ---------------------------------------------------------------------------
+# Caps any single VM's share of the underlying disk so a runaway workload
+# (e.g. the 2026-05-23/26 alloy IO storm — memory id=2726) cannot wedge the
+# whole Proxmox host's sdc thin pool. Values inferred from PVE RRD p99/max
+# observed in /nodes/pve/qemu/<vmid>/rrddata.
+variable "mbps_rd" { # read cap in MB/s; feeds the disk{} block's mbps_r_concurrent
+  type    = number
+  default = 0 # 0 = uncapped
+}
+variable "mbps_wr" { # write cap in MB/s; feeds the disk{} block's mbps_wr_concurrent
+  type    = number
+  default = 0 # 0 = uncapped
+}
+
 # ---------------------------------------------------------------------------
 # Resource
 # ---------------------------------------------------------------------------
@ -192,9 +208,11 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
        for_each = var.disk_slot == "scsi0" ? [1] : []
        content {
          disk {
-            storage = "local-lvm"
-            size    = var.vm_disk_size
-            discard = true # Enable TRIM passthrough to LVM thin pool — reduces CoW overhead
+            storage            = "local-lvm"
+            size               = var.vm_disk_size
+            discard            = true # Enable TRIM passthrough to LVM thin pool — reduces CoW overhead
+            mbps_r_concurrent  = var.mbps_rd
+            mbps_wr_concurrent = var.mbps_wr
          }
        }
      }
@ -202,9 +220,11 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
        for_each = var.disk_slot == "scsi1" ? [1] : []
        content {
          disk {
-            storage = "local-lvm"
-            size    = var.vm_disk_size
-            discard = true
+            storage            = "local-lvm"
+            size               = var.vm_disk_size
+            discard            = true
+            mbps_r_concurrent  = var.mbps_rd
+            mbps_wr_concurrent = var.mbps_wr
          }
        }
      }
@ -234,12 +254,39 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
  lifecycle {
    prevent_destroy = true
    ignore_changes = [
-      # democratic-csi dynamically attaches/detaches iSCSI disks
+      # proxmox-csi dynamically attaches/detaches PVC disks. K8s workers
+      # have up to ~30 slots in use simultaneously (k8s-node1: scsi1-29 +
+      # unused0-29). The k8s-master only uses scsi0 (boot) so most of
+      # these are no-ops for that VM but harmless.
      disks[0].scsi[0].scsi1,
      disks[0].scsi[0].scsi2,
      disks[0].scsi[0].scsi3,
      disks[0].scsi[0].scsi4,
      disks[0].scsi[0].scsi5,
+      disks[0].scsi[0].scsi6,
+      disks[0].scsi[0].scsi7,
+      disks[0].scsi[0].scsi8,
+      disks[0].scsi[0].scsi9,
+      disks[0].scsi[0].scsi10,
+      disks[0].scsi[0].scsi11,
+      disks[0].scsi[0].scsi12,
+      disks[0].scsi[0].scsi13,
+      disks[0].scsi[0].scsi14,
+      disks[0].scsi[0].scsi15,
+      disks[0].scsi[0].scsi16,
+      disks[0].scsi[0].scsi17,
+      disks[0].scsi[0].scsi18,
+      disks[0].scsi[0].scsi19,
+      disks[0].scsi[0].scsi20,
+      disks[0].scsi[0].scsi21,
+      disks[0].scsi[0].scsi22,
+      disks[0].scsi[0].scsi23,
+      disks[0].scsi[0].scsi24,
+      disks[0].scsi[0].scsi25,
+      disks[0].scsi[0].scsi26,
+      disks[0].scsi[0].scsi27,
+      disks[0].scsi[0].scsi28,
+      disks[0].scsi[0].scsi29,
      # cloud-init config may drift after first boot
      cicustom,
      ciupgrade,
@ -254,6 +301,13 @@ resource "proxmox_vm_qemu" "cloudinit-vm" {
      # Provider defaults that differ from imported state
      define_connection_info,
      full_clone,
+      # scsihw varies per VM (virtio-scsi-pci / virtio-scsi-single / lsi)
+      # and changing it on a running VM is risky — leave whatever's live.
+      scsihw,
+      # qemu_os is a hint to qemu about the guest OS; some live VMs have
+      # "other" (unset originally) and the module's "l26" default would
+      # otherwise force an unnecessary write on apply.
+      qemu_os,
    ]
  }
 }
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@ -430,19 +430,36 @@ module "docker-registry-vm" {
  # 5040 -> registry-kyverno (reg.kyverno.io) — DISABLED
  # 5050 -> nginx -> registry-private (R/W registry for CI build cache)
  # 8080 -> registry-ui (joxit/docker-registry-ui)
+
+  # I/O cap (MB/s) — observed peak <6 MB/s; 40 is generous headroom.
+  # Live state already has this via qm set (beads code-9v2j); HCL value
+  # matches so applies are no-ops.
+  mbps_rd = 40
+  mbps_wr = 40
 }

 # ---------------------------------------------------------------------------
-# K8s node VMs (imported from existing Proxmox VMs)
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# K8s node VMs — imported from existing Proxmox VMs
+# K8s node VMs — IMPORT ATTEMPT 2026-05-26 ABORTED, see beads code-anh3.
 #
-# NOTE: Nodes with iSCSI PVC disks (201, 203, 204) cannot be imported yet
-# due to telmate/proxmox provider bug: it constructs wrong volume references
-# for shared iSCSI disks on update, causing API 500 errors. These nodes will
-# be importable after migrating to the bpg/proxmox provider.
+# The telmate/proxmox v3.0.2-rc07 provider mangled proxmox-csi PVC disk
+# references on k8s-node2 + k8s-node3 during the import-apply: all
+# previously-attached `vm-9999-pvc-*` slots got rewritten to point at the
+# boot disk (vm-<vmid>-disk-0). VMs were saved by restoring
+# /etc/pve/qemu-server/<vmid>.conf from the 2026-05-24 nightly PVE config
+# backup; no reboots, no data loss, K8s CSI reconciled the attachment list.
+#
+# Same lesson as memory id=539: telmate trips on dynamically-attached
+# disks (iSCSI then, proxmox-csi now). The underlying defect is the same:
+# the provider's disks{} block cannot represent slots it didn't create,
+# and lifecycle.ignore_changes does NOT prevent it from re-writing the
+# disk strings on update.
+#
+# The 8 Linux VMs (101 pfsense + 300 Windows excluded) still have their
+# I/O caps applied live via qm set — see code-9v2j and the script at
+# /tmp/apply-mbps-caps.sh on devvm. Adoption into TF needs either:
+#   (a) switch to the bpg/proxmox provider (which models CSI-managed
+#       disks correctly), or
+#   (b) keep telmate but pre-detach all CSI disks before any update.
 # ---------------------------------------------------------------------------

 module "k8s-master" {
--- a/stacks/infra/terragrunt.hcl
+++ b/stacks/infra/terragrunt.hcl
@ -3,42 +3,30 @@ include "root" {
  path = find_in_parent_folders()
 }

-# Override provider generation to include proxmox + vault (k8s providers not needed)
-generate "providers" {
-  path      = "providers.tf"
-  if_exists = "overwrite"
+# The root's `k8s_providers` generate block now declares `telmate/proxmox`
+# in required_providers for every stack (harmless for non-infra stacks —
+# they just don't instantiate a `provider "proxmox" {}` block).
+#
+# Here we add the per-stack provider config + the tfvar variable for the
+# API URL. Credentials come from Vault `secret/viktor` (same pattern as
+# cloudflare_provider.tf at the root). The output file name is distinct
+# from `providers.tf` to avoid the same-path conflict that the old
+# `generate "providers"` block silently triggered under Terragrunt v0.77.
+generate "proxmox_provider" {
+  path      = "proxmox_provider.tf"
+  if_exists = "overwrite_terragrunt"
  contents  = <<EOF
-terraform {
-  required_providers {
-    vault = {
-      source  = "hashicorp/vault"
-      version = "~> 4.0"
-    }
-    proxmox = {
-      source  = "telmate/proxmox"
-      version = "3.0.2-rc07"
-    }
-  }
-}
-
-variable "kube_config_path" {
-  type    = string
-  default = "~/.kube/config"
-}
-
 variable "proxmox_pm_api_url" { type = string }
-variable "proxmox_pm_api_token_id" { type = string }
-variable "proxmox_pm_api_token_secret" { type = string }

-provider "vault" {
-  address          = "https://vault.viktorbarzin.me"
-  skip_child_token = true
+data "vault_kv_secret_v2" "proxmox_pm" {
+  mount = "secret"
+  name  = "viktor"
 }

 provider "proxmox" {
  pm_api_url          = var.proxmox_pm_api_url
-  pm_api_token_id     = var.proxmox_pm_api_token_id
-  pm_api_token_secret = var.proxmox_pm_api_token_secret
+  pm_api_token_id     = data.vault_kv_secret_v2.proxmox_pm.data["proxmox_pm_api_token_id"]
+  pm_api_token_secret = data.vault_kv_secret_v2.proxmox_pm.data["proxmox_pm_api_token_secret"]
  pm_tls_insecure     = true
 }
 EOF
--- a/state/stacks/infra/terraform.tfstate.enc
+++ b/state/stacks/infra/terraform.tfstate.enc
--- a/terragrunt.hcl
+++ b/terragrunt.hcl
@ -46,8 +46,14 @@ terraform {
  }
 }

-# Generate kubernetes + helm + cloudflare providers for all stacks.
-# The infra stack overrides this to add the proxmox provider.
+# Generate kubernetes + helm + cloudflare + proxmox providers for all stacks.
+# (Stacks that don't use proxmox simply omit any `provider "proxmox" {}` block;
+# the required_providers entry is harmless. The pre-2026-05-26 trick of the
+# infra stack overriding this block to add proxmox stopped working under
+# Terragrunt v0.77 — same-name generate blocks are now forbidden — so proxmox
+# is declared globally instead. The `provider "proxmox" {}` config lives in
+# stacks/infra/terragrunt.hcl, generated under a different filename so it
+# doesn't collide with this providers.tf.)
 generate "k8s_providers" {
  path      = "providers.tf"
  if_exists = "overwrite_terragrunt"
@ -73,6 +79,10 @@ terraform {
      source  = "gavinbunney/kubectl"
      version = "~> 1.14"
    }
+    proxmox = {
+      source  = "telmate/proxmox"
+      version = "3.0.2-rc07"
+    }
  }
 }