broker-sync: pin data-mounting CronJobs to k8s-node4 (stop nightly RWO wedge)
All checks were successful
ci/woodpecker/push/default Pipeline was successful

Every broker-sync CronJob that mounts data shares one RWO proxmox-lvm
volume. With free scheduling the nightly 02:00-04:15 runs land on
different nodes, forcing
a detach/attach cycle whose QMP hotplug intermittently ghost-attaches on
disk-heavy VMs — every job then sits in ContainerCreating for hours
(happened 2026-06-30, 07-01 and again 07-02; fires
PodsStuckContainerCreating and skips the day's trade syncs). Pinning all
seven volume-mounting jobs to k8s-node4 (fewest CSI disks, 11) makes the
volume attach once and stay put — no hotplug dance, no wedge.
version_probe mounts nothing and stays unpinned. Durable fix for the
recurrence tracked in beads code-9ko8.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-07-02 05:16:38 +00:00
parent 3c85af2dc2
commit a12b09af04

View file

@@ -167,6 +167,12 @@ resource "kubernetes_cron_job_v1" "trading212" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
# See imap cron: without fsGroup=10001 the broker user (uid=10001
# gid=999) can't write the sqlite3 journal next to /data/sync.db
# and the dedup.record() call after a successful WF import crashes
@@ -277,6 +283,12 @@ resource "kubernetes_cron_job_v1" "ibkr" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
security_context {
fs_group = 10001
}
@@ -411,6 +423,12 @@ resource "kubernetes_cron_job_v1" "imap" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
# The broker image's user is uid=10001 gid=999, but the shared
# data PVC's /data root was created with gid=10001 (legacy from
# an earlier image build). Without fsGroup the pod can't write
@@ -563,6 +581,12 @@ resource "kubernetes_cron_job_v1" "csv_drop" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
container {
name = "broker-sync"
image = local.broker_sync_image
@@ -655,6 +679,12 @@ resource "kubernetes_cron_job_v1" "fx_reconcile" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
container {
name = "broker-sync"
image = local.broker_sync_image
@@ -747,6 +777,12 @@ resource "kubernetes_cron_job_v1" "backup" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
container {
name = "backup"
image = "alpine:3.20"
@@ -850,6 +886,12 @@ resource "kubernetes_cron_job_v1" "fidelity" {
}
spec {
restart_policy = "OnFailure"
# Pin every job that mounts the shared RWO data volume to one node:
# cross-node scheduling forced a nightly detach/attach cycle whose
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
# One node = volume attaches once and stays put.
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
# Materialise the JSON storage_state from the projected Secret
# onto the PVC where Playwright expects to read it. Init container
# runs as root; the main broker-sync container runs as uid 10001,