Add daily vaultwarden backup CronJob writing to NFS
Backs up the SQLite database via the SQLite Online Backup API and copies the RSA keys, attachments, sends, and config alongside it. Backups are rotated with 30-day retention. Pod affinity co-schedules the backup job with the vaultwarden pod so that it can mount the ReadWriteOnce (RWO) PVC.
This commit is contained in:
parent
3c622659d8
commit
9acbcc7718
127 changed files with 2521 additions and 413 deletions
|
|
@ -30,7 +30,7 @@ resource "helm_release" "kyverno" {
|
|||
reportsController = {
|
||||
resources = {
|
||||
limits = {
|
||||
memory = "128Mi"
|
||||
memory = "256Mi"
|
||||
}
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
|
|
|
|||
|
|
@ -790,57 +790,6 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
|
|||
}
|
||||
}
|
||||
|
||||
# --- GPU toleration for critical tiers ---
|
||||
# Allows pods in tier-0-core and tier-1-cluster namespaces to overflow onto the
|
||||
# GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
|
||||
# to APPEND the toleration without replacing existing tolerations.
|
||||
resource "kubernetes_manifest" "mutate_gpu_toleration" {
|
||||
manifest = {
|
||||
apiVersion = "kyverno.io/v1"
|
||||
kind = "ClusterPolicy"
|
||||
metadata = {
|
||||
name = "gpu-toleration-critical-tiers"
|
||||
annotations = {
|
||||
"policies.kyverno.io/title" = "GPU Toleration for Critical Tiers"
|
||||
"policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
|
||||
}
|
||||
}
|
||||
spec = {
|
||||
rules = [for tier in ["0-core", "1-cluster"] : {
|
||||
name = "add-gpu-toleration-tier-${split("-", tier)[0]}"
|
||||
match = {
|
||||
any = [
|
||||
{
|
||||
resources = {
|
||||
kinds = ["Pod"]
|
||||
operations = ["CREATE"]
|
||||
namespaceSelector = {
|
||||
matchLabels = {
|
||||
tier = tier
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
skipBackgroundRequests = true
|
||||
mutate = {
|
||||
patchesJson6902 = yamlencode([
|
||||
{
|
||||
op = "add"
|
||||
path = "/spec/tolerations/-"
|
||||
value = {
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
}
|
||||
])
|
||||
}
|
||||
}]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# --- ndots:2 injection ---
|
||||
# Kubernetes defaults to ndots:5, which causes 4 wasted NXDOMAIN queries per
|
||||
|
|
@ -909,90 +858,3 @@ resource "kubernetes_manifest" "mutate_ndots" {
|
|||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Layer 5: GPU Node Toleration for Critical Services (Kyverno Mutate)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Adds nvidia.com/gpu toleration to pods in tier-0 and tier-1 namespaces.
|
||||
# This allows critical infrastructure to overflow onto the GPU node (k8s-node1)
|
||||
# during N-1 scenarios, giving the scheduler ~14 GiB extra capacity.
|
||||
# GPU workloads won't be preempted — this just makes the node eligible.
|
||||
|
||||
resource "kubernetes_manifest" "mutate_gpu_toleration_critical" {
|
||||
manifest = {
|
||||
apiVersion = "kyverno.io/v1"
|
||||
kind = "ClusterPolicy"
|
||||
metadata = {
|
||||
name = "gpu-toleration-critical-tiers"
|
||||
annotations = {
|
||||
"policies.kyverno.io/title" = "GPU Toleration for Critical Tiers"
|
||||
"policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
|
||||
}
|
||||
}
|
||||
spec = {
|
||||
rules = [
|
||||
{
|
||||
name = "add-gpu-toleration-tier-0"
|
||||
match = {
|
||||
any = [
|
||||
{
|
||||
resources = {
|
||||
kinds = ["Pod"]
|
||||
operations = ["CREATE"]
|
||||
namespaceSelector = {
|
||||
matchLabels = {
|
||||
tier = "0-core"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
mutate = {
|
||||
patchStrategicMerge = {
|
||||
spec = {
|
||||
tolerations = [
|
||||
{
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
name = "add-gpu-toleration-tier-1"
|
||||
match = {
|
||||
any = [
|
||||
{
|
||||
resources = {
|
||||
kinds = ["Pod"]
|
||||
operations = ["CREATE"]
|
||||
namespaceSelector = {
|
||||
matchLabels = {
|
||||
tier = "1-cluster"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
mutate = {
|
||||
patchStrategicMerge = {
|
||||
spec = {
|
||||
tolerations = [
|
||||
{
|
||||
key = "nvidia.com/gpu"
|
||||
operator = "Exists"
|
||||
effect = "NoSchedule"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue