broker-sync: pin data-mounting CronJobs to k8s-node4 (stop nightly RWO wedge)
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All broker-sync CronJobs share one RWO proxmox-lvm volume. With free scheduling, the nightly 02:00-04:15 runs land on different nodes, forcing a detach/attach cycle whose QMP hotplug intermittently ghost-attaches on disk-heavy VMs — every job then sits in ContainerCreating for hours (happened on 2026-06-30, 2026-07-01 and again on 2026-07-02; each occurrence fires PodsStuckContainerCreating and skips the day's trade syncs). Pinning all seven volume-mounting jobs to k8s-node4 (the node with the fewest CSI disks, 11) makes the volume attach once and stay put — no hotplug dance, no wedge. version_probe mounts nothing and stays unpinned. Durable fix for the recurrence tracked in beads code-9ko8. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
3c85af2dc2
commit
a12b09af04
1 changed file with 42 additions and 0 deletions
|
|
@ -167,6 +167,12 @@ resource "kubernetes_cron_job_v1" "trading212" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
# See imap cron — without fsGroup=10001 the broker user (uid=10001
|
||||
# gid=999) can't write the sqlite3 journal next to /data/sync.db
|
||||
# and the dedup.record() call after a successful WF import crashes
|
||||
|
|
@ -277,6 +283,12 @@ resource "kubernetes_cron_job_v1" "ibkr" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
security_context {
|
||||
fs_group = 10001
|
||||
}
|
||||
|
|
@ -411,6 +423,12 @@ resource "kubernetes_cron_job_v1" "imap" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
# The broker image's user is uid=10001 gid=999, but the shared
|
||||
# data PVC's /data root was created with gid=10001 (legacy from
|
||||
# an earlier image build). Without fsGroup the pod can't write
|
||||
|
|
@ -563,6 +581,12 @@ resource "kubernetes_cron_job_v1" "csv_drop" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
container {
|
||||
name = "broker-sync"
|
||||
image = local.broker_sync_image
|
||||
|
|
@ -655,6 +679,12 @@ resource "kubernetes_cron_job_v1" "fx_reconcile" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
container {
|
||||
name = "broker-sync"
|
||||
image = local.broker_sync_image
|
||||
|
|
@ -747,6 +777,12 @@ resource "kubernetes_cron_job_v1" "backup" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
container {
|
||||
name = "backup"
|
||||
image = "alpine:3.20"
|
||||
|
|
@ -850,6 +886,12 @@ resource "kubernetes_cron_job_v1" "fidelity" {
|
|||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
# Pin every job that mounts the shared RWO data volume to one node:
|
||||
# cross-node scheduling forced a nightly detach/attach cycle whose
|
||||
# QMP hotplug intermittently ghost-attaches on disk-heavy VMs and
|
||||
# wedges all broker-sync jobs in ContainerCreating (2026-07-01/02).
|
||||
# One node = volume attaches once and stays put.
|
||||
node_selector = { "kubernetes.io/hostname" = "k8s-node4" }
|
||||
# Materialise the JSON storage_state from the projected Secret
|
||||
# onto the PVC where Playwright expects to read it. Init container
|
||||
# runs as root; the main broker-sync container runs as uid 10001,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue