add vaultwarden daily backup CronJob to NFS
SQLite backup via Online Backup API + copy of RSA keys, attachments, sends, and config. 30-day retention with rotation. Pod affinity ensures co-scheduling with vaultwarden for RWO PVC access.
This commit is contained in:
parent
3c622659d8
commit
9acbcc7718
127 changed files with 2521 additions and 413 deletions
|
|
@ -35,8 +35,8 @@ resource "kubernetes_resource_quota" "dbaas" {
|
|||
spec {
|
||||
hard = {
|
||||
"requests.cpu" = "8"
|
||||
"requests.memory" = "12Gi"
|
||||
"limits.memory" = "12Gi"
|
||||
"requests.memory" = "16Gi"
|
||||
"limits.memory" = "16Gi"
|
||||
pods = "30"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,8 @@ resource "kubernetes_namespace" "iscsi_csi" {
|
|||
metadata {
|
||||
name = "iscsi-csi"
|
||||
labels = {
|
||||
tier = var.tier
|
||||
tier = var.tier
|
||||
"resource-governance/custom-quota" = "true"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ resource "helm_release" "kyverno" {
|
|||
reportsController = {
|
||||
resources = {
|
||||
limits = {
|
||||
memory = "128Mi"
|
||||
memory = "256Mi"
|
||||
}
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
|
|
|
|||
|
|
@ -790,57 +790,6 @@ resource "kubernetes_manifest" "mutate_priority_from_tier" {
|
|||
}
|
||||
}
|
||||
|
||||
# --- GPU toleration for critical tiers ---
|
||||
# Allows pods in tier-0-core and tier-1-cluster namespaces to overflow onto the
|
||||
# GPU node during N-1 failures. Uses patchesJson6902 (not patchStrategicMerge)
|
||||
# to APPEND the toleration without replacing existing tolerations.
|
||||
resource "kubernetes_manifest" "mutate_gpu_toleration" {
  # Kyverno ClusterPolicy with one mutate rule per critical tier. Each rule
  # matches Pod CREATE requests in namespaces labelled with that tier and
  # appends an nvidia.com/gpu NoSchedule toleration via a JSON patch.
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"

    metadata = {
      name = "gpu-toleration-critical-tiers"
      annotations = {
        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
      }
    }

    spec = {
      rules = [
        for tier_label in ["0-core", "1-cluster"] : {
          # The rule name carries only the numeric prefix of the tier label
          # ("0-core" -> "0", "1-cluster" -> "1").
          name = "add-gpu-toleration-tier-${element(split("-", tier_label), 0)}"

          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                  namespaceSelector = {
                    matchLabels = {
                      tier = tier_label
                    }
                  }
                }
              }
            ]
          }

          skipBackgroundRequests = true

          mutate = {
            # "/-" addresses the end of the array, so the toleration is
            # appended rather than overwriting what the pod already declares.
            patchesJson6902 = yamlencode([
              {
                op   = "add"
                path = "/spec/tolerations/-"
                value = {
                  key      = "nvidia.com/gpu"
                  operator = "Exists"
                  effect   = "NoSchedule"
                }
              }
            ])
          }
        }
      ]
    }
  }
}
|
||||
|
||||
# --- ndots:2 injection ---
|
||||
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
|
||||
|
|
@ -909,90 +858,3 @@ resource "kubernetes_manifest" "mutate_ndots" {
|
|||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Layer 5: GPU Node Toleration for Critical Services (Kyverno Mutate)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Adds nvidia.com/gpu toleration to pods in tier-0 and tier-1 namespaces.
|
||||
# This allows critical infrastructure to overflow onto the GPU node (k8s-node1)
|
||||
# during N-1 scenarios, giving the scheduler ~14 GiB extra capacity.
|
||||
# GPU workloads won't be preempted — this just makes the node eligible.
|
||||
|
||||
resource "kubernetes_manifest" "mutate_gpu_toleration_critical" {
  # Kyverno ClusterPolicy that adds an nvidia.com/gpu NoSchedule toleration to
  # pods created in namespaces labelled tier=0-core or tier=1-cluster.
  #
  # The toleration is appended with a JSON patch ("/spec/tolerations/-")
  # instead of patchStrategicMerge: a strategic merge on spec.tolerations
  # replaces the pod's existing tolerations rather than adding to them.
  #
  # NOTE(review): this policy uses the same metadata.name
  # ("gpu-toleration-critical-tiers") as the ClusterPolicy declared by
  # kubernetes_manifest.mutate_gpu_toleration. Two Terraform resources cannot
  # both own that object — confirm which one should remain.
  # NOTE(review): a JSON-patch "add" to "/spec/tolerations/-" presumably needs
  # the tolerations array to already exist on the incoming pod — verify
  # against the cluster's admission chain.
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"

    metadata = {
      name = "gpu-toleration-critical-tiers"
      annotations = {
        "policies.kyverno.io/title"       = "GPU Toleration for Critical Tiers"
        "policies.kyverno.io/description" = "Adds nvidia.com/gpu toleration to pods in tier-0-core and tier-1-cluster namespaces so they can overflow onto the GPU node during N-1 failures."
      }
    }

    spec = {
      # One rule per tier; the map key becomes the rule-name suffix and the
      # value is the namespace label to match. Map iteration is ordered by
      # key, so the rules come out as tier-0 then tier-1.
      rules = [
        for tier_number, tier_label in { "0" = "0-core", "1" = "1-cluster" } : {
          name = "add-gpu-toleration-tier-${tier_number}"

          match = {
            any = [
              {
                resources = {
                  kinds      = ["Pod"]
                  operations = ["CREATE"]
                  namespaceSelector = {
                    matchLabels = {
                      tier = tier_label
                    }
                  }
                }
              }
            ]
          }

          mutate = {
            patchesJson6902 = yamlencode([
              {
                op   = "add"
                path = "/spec/tolerations/-"
                value = {
                  key      = "nvidia.com/gpu"
                  operator = "Exists"
                  effect   = "NoSchedule"
                }
              }
            ])
          }
        }
      ]
    }
  }
}
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ variable "k8s_users" {
|
|||
# Binds to built-in cluster-admin ClusterRole
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "admin_users" {
|
||||
for_each = { for name, user in var.k8s_users : name => user if user.role == "admin" }
|
||||
for_each = nonsensitive({ for name, user in var.k8s_users : name => user if user.role == "admin" })
|
||||
|
||||
metadata {
|
||||
name = "oidc-admin-${each.key}"
|
||||
|
|
@ -109,7 +109,7 @@ resource "kubernetes_cluster_role" "power_user" {
|
|||
}
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "power_users" {
|
||||
for_each = { for name, user in var.k8s_users : name => user if user.role == "power-user" }
|
||||
for_each = nonsensitive({ for name, user in var.k8s_users : name => user if user.role == "power-user" })
|
||||
|
||||
metadata {
|
||||
name = "oidc-power-user-${each.key}"
|
||||
|
|
@ -146,7 +146,7 @@ locals {
|
|||
}
|
||||
|
||||
resource "kubernetes_role_binding" "namespace_owner" {
|
||||
for_each = { for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair }
|
||||
for_each = nonsensitive({ for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair })
|
||||
|
||||
metadata {
|
||||
name = "namespace-owner-${each.value.user_key}"
|
||||
|
|
@ -192,7 +192,7 @@ resource "kubernetes_cluster_role" "namespace_owner_readonly" {
|
|||
}
|
||||
|
||||
resource "kubernetes_cluster_role_binding" "namespace_owner_readonly" {
|
||||
for_each = { for name, user in var.k8s_users : name => user if user.role == "namespace-owner" }
|
||||
for_each = nonsensitive({ for name, user in var.k8s_users : name => user if user.role == "namespace-owner" })
|
||||
|
||||
metadata {
|
||||
name = "oidc-ns-owner-readonly-${each.key}"
|
||||
|
|
@ -213,7 +213,7 @@ resource "kubernetes_cluster_role_binding" "namespace_owner_readonly" {
|
|||
|
||||
# Resource quotas per user namespace
|
||||
resource "kubernetes_resource_quota" "user_namespace_quota" {
|
||||
for_each = { for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair }
|
||||
for_each = nonsensitive({ for pair in local.namespace_owner_pairs : "${pair.user_key}-${pair.namespace}" => pair })
|
||||
|
||||
metadata {
|
||||
name = "user-quota"
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ variable "tls_secret_name" {}
|
|||
variable "tier" { type = string }
|
||||
variable "smtp_password" {}
|
||||
variable "mail_host" { type = string }
|
||||
variable "nfs_server" { type = string }
|
||||
|
||||
resource "kubernetes_namespace" "vaultwarden" {
|
||||
metadata {
|
||||
|
|
@ -193,3 +194,101 @@ module "ingress" {
|
|||
"gethomepage.dev/pod-selector" = ""
|
||||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Backup — Daily SQLite + data files to NFS
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
module "nfs_vaultwarden_backup" {
  # NFS-backed volume that receives the vaultwarden backups; the backup
  # CronJob mounts it through this module's claim_name output.
  source = "../../../../modules/kubernetes/nfs_volume"

  name      = "vaultwarden-backup"
  namespace = kubernetes_namespace.vaultwarden.metadata[0].name

  # Export on the NFS server that holds the dated backup directories.
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/vaultwarden-backup"
}
|
||||
|
||||
# Daily vaultwarden backup: snapshots the SQLite database with sqlite3's
# ".backup" command and copies the RSA keys, attachments, sends and config.json
# into a timestamped directory on the NFS backup volume, then prunes backup
# directories older than 30 days.
resource "kubernetes_cron_job_v1" "vaultwarden-backup" {
  metadata {
    name      = "vaultwarden-backup"
    namespace = kubernetes_namespace.vaultwarden.metadata[0].name
  }

  spec {
    schedule           = "0 0 * * *"
    concurrency_policy = "Replace"

    # Allow a few minutes of slack: if the CronJob controller cannot start the
    # job within this window the run is counted as missed and skipped, which
    # for a once-a-day backup means losing the whole day. The previous value
    # (10) left no margin over the controller's own check interval.
    starting_deadline_seconds = 300

    successful_jobs_history_limit = 10
    failed_jobs_history_limit     = 5

    job_template {
      metadata {}
      spec {
        backoff_limit = 3

        # NOTE(review): finished Jobs are removed after 10 seconds, which
        # appears to make the history limits above ineffective and discards
        # the logs of failed runs — confirm this is intended.
        ttl_seconds_after_finished = 10

        template {
          metadata {}
          spec {
            # Schedule onto the same node as the vaultwarden pod (matched by
            # its app label) so the data PVC can be mounted here as well.
            affinity {
              pod_affinity {
                required_during_scheduling_ignored_during_execution {
                  label_selector {
                    match_labels = {
                      app = "vaultwarden"
                    }
                  }
                  topology_key = "kubernetes.io/hostname"
                }
              }
            }

            container {
              name  = "vaultwarden-backup"
              image = "alpine"
              command = ["/bin/sh", "-c", <<-EOT
                set -euxo pipefail
                apk add --no-cache sqlite
                now=$(date +"%Y_%m_%d_%H_%M")
                dest="/backup/$now"
                mkdir -p "$dest"
                # Safe SQLite backup (handles WAL/locks)
                sqlite3 /data/db.sqlite3 ".backup $dest/db.sqlite3"
                # Copy RSA keys, attachments, sends, config.
                # Paths that do not exist are skipped, but a failed copy of an
                # existing path aborts the job instead of being silently ignored.
                for item in rsa_key.pem rsa_key.pub.pem attachments sends config.json; do
                  if [ -e "/data/$item" ]; then
                    cp -a "/data/$item" "$dest/"
                  fi
                done
                # Rotate — 30 day retention
                find /backup -maxdepth 1 -mindepth 1 -type d -mtime +30 -exec rm -rf {} +
                echo "Backup complete: $now"
              EOT
              ]

              # Live vaultwarden data, mounted read-only.
              volume_mount {
                name       = "data"
                mount_path = "/data"
                read_only  = true
              }

              # NFS destination for the dated backup directories.
              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }
            }

            volume {
              name = "data"
              persistent_volume_claim {
                claim_name = kubernetes_persistent_volume_claim.vaultwarden_data.metadata[0].name
              }
            }

            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_vaultwarden_backup.claim_name
              }
            }

            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue