From a12b09af041692ed595a707fd4b61c6380b9923f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 2 Jul 2026 05:16:38 +0000 Subject: [PATCH] broker-sync: pin data-mounting CronJobs to k8s-node4 (stop nightly RWO wedge) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All broker-sync CronJobs share one RWO proxmox-lvm volume. With free scheduling the nightly 02:00-04:15 runs land on different nodes, forcing a detach/attach cycle whose QMP hotplug intermittently ghost-attaches on disk-heavy VMs — every job then sits in ContainerCreating for hours (happened 2026-06-30, 07-01 and again 07-02; fires PodsStuckContainerCreating and skips the day's trade syncs). Pinning all seven volume-mounting jobs to k8s-node4 (fewest CSI disks, 11) makes the volume attach once and stay put — no hotplug dance, no wedge. version_probe mounts nothing and stays unpinned. Durable fix for the recurrence tracked in beads code-9ko8. Co-Authored-By: Claude Fable 5 --- stacks/broker-sync/main.tf | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/stacks/broker-sync/main.tf b/stacks/broker-sync/main.tf index 76d822d8..67f8089e 100644 --- a/stacks/broker-sync/main.tf +++ b/stacks/broker-sync/main.tf @@ -167,6 +167,12 @@ resource "kubernetes_cron_job_v1" "trading212" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } # See imap cron — without fsGroup=10001 the broker user (uid=10001 # gid=999) can't write the sqlite3 journal next to /data/sync.db # and the dedup.record() call after a successful WF import crashes @@ -277,6 +283,12 @@ resource "kubernetes_cron_job_v1" "ibkr" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } security_context { fs_group = 10001 } @@ -411,6 +423,12 @@ resource "kubernetes_cron_job_v1" "imap" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } # The broker image's user is uid=10001 gid=999, but the shared # data PVC's /data root was created with gid=10001 (legacy from # an earlier image build). Without fsGroup the pod can't write @@ -563,6 +581,12 @@ resource "kubernetes_cron_job_v1" "csv_drop" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } container { name = "broker-sync" image = local.broker_sync_image @@ -655,6 +679,12 @@ resource "kubernetes_cron_job_v1" "fx_reconcile" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } container { name = "broker-sync" image = local.broker_sync_image @@ -747,6 +777,12 @@ resource "kubernetes_cron_job_v1" "backup" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } container { name = "backup" image = "alpine:3.20" @@ -850,6 +886,12 @@ resource "kubernetes_cron_job_v1" "fidelity" { } spec { restart_policy = "OnFailure" + # Pin every job that mounts the shared RWO data volume to one node: + # cross-node scheduling forced a nightly detach/attach cycle whose + # QMP hotplug intermittently ghost-attaches on disk-heavy VMs and + # wedges all broker-sync jobs in ContainerCreating (2026-07-01/02). + # One node = volume attaches once and stays put. + node_selector = { "kubernetes.io/hostname" = "k8s-node4" } # Materialise the JSON storage_state from the projected Secret # onto the PVC where Playwright expects to read it. Init container # runs as root; the main broker-sync container runs as uid 10001,