state(dbaas): update encrypted state

This commit is contained in:
Viktor Barzin 2026-03-19 20:23:59 +00:00
parent 67d1ce453c
commit 21bb3036af
34 changed files with 381 additions and 186 deletions

View file

@ -49,6 +49,9 @@ agent:
- name: whitelist
configMap:
name: crowdsec-whitelist
podAnnotations:
dependency.kyverno.io/wait-for: "mysql.dbaas:3306"
lapi:
resources:
requests:

View file

@ -101,8 +101,8 @@ resource "kubernetes_cron_job_v1" "backup-etcd" {
container {
name = "backup-etcd"
image = "registry.k8s.io/etcd:3.5.21-0"
command = ["etcdctl"]
args = ["--endpoints=https://127.0.0.1:2379", "--cacert=/etc/kubernetes/pki/etcd/ca.crt", "--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt", "--key=/etc/kubernetes/pki/etcd/healthcheck-client.key", "snapshot", "save", "/backup/etcd-snapshot-latest.db"]
command = ["/bin/sh", "-c"]
args = ["ETCDCTL_API=3 etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"]
env {
name = "ETCDCTL_API"
value = "3"

View file

@ -160,6 +160,97 @@ resource "kubernetes_manifest" "policy_restrict_capabilities" {
depends_on = [helm_release.kyverno]
}
# =============================================================================
# Image Pull Policy Governance
# =============================================================================
# Mutates imagePullPolicy on every container AND init container of a Pod:
#   - pinned reference (explicit non-:latest tag, or a digest) -> IfNotPresent,
#     so pods do not get stuck in ImagePullBackOff when the pull-through cache
#     at 10.0.20.10 has transient failures;
#   - :latest or untagged reference -> Always, so stale images don't persist.
#
# "Untagged" is decided on the LAST path segment of the image reference only.
# Checking the whole reference for ':' would misclassify an untagged image that
# sits behind a registry port (e.g. "10.0.20.10:5000/app") as pinned.
#
# Init containers are handled by dedicated rules guarded by a rule-level
# precondition, because spec.initContainers is absent on most Pods and the
# foreach list must exist for the rule to evaluate cleanly.
resource "kubernetes_manifest" "policy_set_image_pull_policy" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "set-image-pull-policy"
      annotations = {
        "policies.kyverno.io/title"       = "Set Image Pull Policy"
        "policies.kyverno.io/category"    = "Best Practices"
        "policies.kyverno.io/severity"    = "medium"
        "policies.kyverno.io/description" = "Set imagePullPolicy to IfNotPresent for pinned tags and Always for :latest to prevent ImagePullBackOff from transient cache failures."
      }
    }
    spec = {
      # Mutation only happens at admission time; no background scan.
      background = false
      rules = [
        # --- spec.containers ------------------------------------------------
        {
          name = "set-ifnotpresent-for-pinned-tags"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  # true  => floating reference (:latest or no tag/digest)
                  # false => pinned reference
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = false
                }]
              }
              patchStrategicMerge = {
                spec = {
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "IfNotPresent"
                  }]
                }
              }
            }]
          }
        },
        {
          name = "set-always-for-latest"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = true
                }]
              }
              patchStrategicMerge = {
                spec = {
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "Always"
                  }]
                }
              }
            }]
          }
        },
        # --- spec.initContainers --------------------------------------------
        {
          name = "set-ifnotpresent-for-pinned-tags-init"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          # Skip Pods that have no init containers at all.
          preconditions = {
            all = [{
              key      = "{{ request.object.spec.initContainers[] || `[]` | length(@) }}"
              operator = "GreaterThanOrEquals"
              value    = 1
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.initContainers"
              preconditions = {
                all = [{
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = false
                }]
              }
              patchStrategicMerge = {
                spec = {
                  initContainers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "IfNotPresent"
                  }]
                }
              }
            }]
          }
        },
        {
          name = "set-always-for-latest-init"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          # Skip Pods that have no init containers at all.
          preconditions = {
            all = [{
              key      = "{{ request.object.spec.initContainers[] || `[]` | length(@) }}"
              operator = "GreaterThanOrEquals"
              value    = 1
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.initContainers"
              preconditions = {
                all = [{
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = true
                }]
              }
              patchStrategicMerge = {
                spec = {
                  initContainers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "Always"
                  }]
                }
              }
            }]
          }
        }
      ]
    }
  }
  # The ClusterPolicy CRD only exists once the Kyverno chart is installed.
  depends_on = [helm_release.kyverno]
}
resource "kubernetes_manifest" "policy_require_trusted_registries" {
manifest = {
apiVersion = "kyverno.io/v1"

View file

@ -500,6 +500,90 @@ serverFiles:
severity: critical
annotations:
summary: "etcd backup CronJob has never completed successfully"
# Backup freshness alerts, one pair per CronJob:
#   *Stale          - last successful run is older than the threshold.
#   *NeverSucceeded - the CronJob has existed longer than the same threshold
#                     and has no successful run recorded at all.
# kube-state-metrics does not export kube_cronjob_status_last_successful_time
# until a CronJob has succeeded at least once, so a "== 0" comparison never
# matches (and the *Stale rules stay silent too). The *NeverSucceeded rules
# therefore detect the ABSENCE of the series relative to kube_cronjob_created.
- alert: PostgreSQLBackupStale
  expr: (time() - kube_cronjob_status_last_successful_time{cronjob="postgresql-backup", namespace="dbaas"}) > 129600
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: "PostgreSQL backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: PostgreSQLBackupNeverSucceeded
  expr: >-
    ((time() - kube_cronjob_created{cronjob="postgresql-backup", namespace="dbaas"}) > 129600)
    unless on (namespace, cronjob)
    kube_cronjob_status_last_successful_time{cronjob="postgresql-backup", namespace="dbaas"}
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: "PostgreSQL backup CronJob has never completed successfully"
- alert: MySQLBackupStale
  expr: (time() - kube_cronjob_status_last_successful_time{cronjob="mysql-backup", namespace="dbaas"}) > 129600
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: "MySQL backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: MySQLBackupNeverSucceeded
  expr: >-
    ((time() - kube_cronjob_created{cronjob="mysql-backup", namespace="dbaas"}) > 129600)
    unless on (namespace, cronjob)
    kube_cronjob_status_last_successful_time{cronjob="mysql-backup", namespace="dbaas"}
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: "MySQL backup CronJob has never completed successfully"
- alert: VaultBackupStale
  expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"}) > 129600
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: "Vault backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: VaultBackupNeverSucceeded
  expr: >-
    ((time() - kube_cronjob_created{cronjob="vault-raft-backup", namespace="vault"}) > 129600)
    unless on (namespace, cronjob)
    kube_cronjob_status_last_successful_time{cronjob="vault-raft-backup", namespace="vault"}
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: "Vault backup CronJob has never completed successfully"
- alert: VaultwardenBackupStale
  expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 129600
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: "Vaultwarden backup is {{ $value | humanizeDuration }} old (threshold: 36h)"
- alert: VaultwardenBackupNeverSucceeded
  expr: >-
    ((time() - kube_cronjob_created{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 129600)
    unless on (namespace, cronjob)
    kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: "Vaultwarden backup CronJob has never completed successfully"
# Redis uses a 4h (14400s) threshold instead of 36h.
- alert: RedisBackupStale
  expr: (time() - kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}) > 14400
  for: 30m
  labels:
    severity: critical
  annotations:
    summary: "Redis backup is {{ $value | humanizeDuration }} old (threshold: 4h)"
- alert: RedisBackupNeverSucceeded
  expr: >-
    ((time() - kube_cronjob_created{cronjob="redis-backup", namespace="redis"}) > 14400)
    unless on (namespace, cronjob)
    kube_cronjob_status_last_successful_time{cronjob="redis-backup", namespace="redis"}
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: "Redis backup CronJob has never completed successfully"
# Critical once a container in the nfs-csi or iscsi-csi namespace has reported
# the CrashLoopBackOff waiting reason continuously for 10 minutes.
- alert: CSIDriverCrashLoop
  expr: >-
    kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"nfs-csi|iscsi-csi"} > 0
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: 'CSI driver CrashLoopBackOff in {{ $labels.namespace }}/{{ $labels.pod }} — storage-layer failure risk'
# Warning once any Job whose name contains "backup" has reported a non-zero
# kube_job_status_failed value continuously for 15 minutes.
- alert: BackupCronJobFailed
  expr: >-
    kube_job_status_failed{job_name=~".*backup.*"} > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}'
- alert: NewTailscaleClient
expr: irate(headscale_machine_registrations_total{action="reauth"}[5m]) > 0
for: 5m

View file

@ -283,12 +283,15 @@ resource "kubernetes_cron_job_v1" "redis-backup" {
image = "redis:7-alpine"
command = ["/bin/sh", "-c", <<-EOT
set -eux
TIMESTAMP=$(date +%Y%m%d-%H%M)
# Trigger a fresh RDB save on the master
redis-cli -h redis.redis BGSAVE
sleep 5
# Copy the RDB via redis-cli --rdb
redis-cli -h redis.redis --rdb /backup/dump.rdb
echo "Backup complete: $(ls -lh /backup/dump.rdb)"
redis-cli -h redis.redis --rdb /backup/redis-$TIMESTAMP.rdb
# Rotate 7-day retention
find /backup -name 'redis-*.rdb' -type f -mtime +7 -delete
echo "Backup complete: redis-$TIMESTAMP.rdb"
EOT
]
volume_mount {