diff --git a/config.tfvars b/config.tfvars
index 706a3ded..9c50b8ef 100644
Binary files a/config.tfvars and b/config.tfvars differ
diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf
index 79489348..474cb610 100644
--- a/stacks/dbaas/modules/dbaas/main.tf
+++ b/stacks/dbaas/modules/dbaas/main.tf
@@ -213,13 +213,16 @@ resource "helm_release" "mysql_cluster" {
       EOT
     }
+    # Top-level resources apply to the SIDECAR container.
+    # VPA shows the sidecar needs only 248Mi (target) / 334Mi (upper bound).
+    # Setting to 350Mi (was 2Gi request / 4Gi limit - roughly 17× over-provisioned).
     resources = {
       requests = {
         cpu    = "250m"
-        memory = "2Gi"
+        memory = "350Mi"
       }
       limits = {
-        memory = "4Gi"
+        memory = "350Mi"
       }
     }

@@ -251,15 +254,18 @@ resource "helm_release" "mysql_cluster" {
         }]
       }
     }
+    # Container-specific resources for the MYSQL container.
+    # VPA shows 2.98Gi (target) / 5.26Gi (upper bound); current usage peaks around 1.8Gi.
+    # Reducing the request from 3Gi to 2Gi and the limit from 6Gi to 3Gi.
     containers = [{
       name = "mysql"
       resources = {
         requests = {
-          memory = "3Gi"
+          memory = "2Gi"
           cpu    = "250m"
         }
         limits = {
-          memory = "6Gi"
+          memory = "3Gi"
         }
       }
     }]
@@ -287,6 +293,15 @@ resource "helm_release" "mysql_cluster" {
      }
    ]
  }
+
+# MySQL Router - explicitly set resources (chart does not expose router.resources).
+# VPA shows a 100Mi upper bound; setting to 128Mi.
+# Note: this requires a manual kubectl patch after the helm release:
+#   kubectl patch deployment mysql-cluster-router -n dbaas --type=json -p='[
+#     {"op": "replace", "path": "/spec/template/spec/containers/0/resources",
+#      "value": {"requests": {"cpu": "25m", "memory": "128Mi"}, "limits": {"memory": "128Mi"}}}]'
+# TODO: migrate to a mysql-operator fork or wait for upstream router.resources support.
+
  })]

  depends_on = [helm_release.mysql_operator]
@@ -637,10 +652,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
          resources {
            requests = {
              cpu    = "15m"
-              memory = "128Mi"
+              memory = "100Mi"
            }
            limits = {
-              memory = "128Mi"
+              memory = "100Mi"
            }
          }
        }
@@ -1076,10 +1091,10 @@ resource "kubernetes_deployment" "pgadmin" {
          resources {
            requests = {
              cpu    = "25m"
-              memory = "512Mi"
+              memory = "450Mi"
            }
            limits = {
-              memory = "512Mi"
+              memory = "450Mi"
            }
          }

diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf
index 5c7558be..5802a713 100644
--- a/stacks/immich/main.tf
+++ b/stacks/immich/main.tf
@@ -517,6 +517,7 @@ resource "kubernetes_deployment" "immich-machine-learning" {
        }
      }
      spec {
+        priority_class_name = "gpu-workload"
        node_selector = {
          "gpu" : "true"
        }
diff --git a/stacks/nfs-csi/modules/nfs-csi/main.tf b/stacks/nfs-csi/modules/nfs-csi/main.tf
index 962a1fe4..af2cbfd3 100644
--- a/stacks/nfs-csi/modules/nfs-csi/main.tf
+++ b/stacks/nfs-csi/modules/nfs-csi/main.tf
@@ -23,6 +23,9 @@ resource "helm_release" "nfs_csi_driver" {
   values = [yamlencode({
     controller = {
       replicas = 2
+      livenessProbe = {
+        httpPort = 29653
+      }
       resources = {
         csiProvisioner = {
           requests = { cpu = "10m", memory = "128Mi" }
diff --git a/stacks/platform/modules/nfs-csi/main.tf b/stacks/platform/modules/nfs-csi/main.tf
index 962a1fe4..af2cbfd3 100644
--- a/stacks/platform/modules/nfs-csi/main.tf
+++ b/stacks/platform/modules/nfs-csi/main.tf
@@ -23,6 +23,9 @@ resource "helm_release" "nfs_csi_driver" {
   values = [yamlencode({
     controller = {
       replicas = 2
+      livenessProbe = {
+        httpPort = 29653
+      }
       resources = {
         csiProvisioner = {
           requests = { cpu = "10m", memory = "128Mi" }
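The dbaas hunks above document a manual kubectl patch for the MySQL Router because the chart does not expose router.resources. A minimal sketch of one way to automate that step, assuming the Terraform runner has kubectl access and the null provider available (the resource name and trigger are illustrative, not part of this change):

# Sketch only: wraps the documented router patch so it re-runs after chart upgrades.
resource "null_resource" "mysql_router_resources_patch" {
  triggers = {
    chart_version = helm_release.mysql_cluster.version
  }

  provisioner "local-exec" {
    command = <<-EOT
      kubectl patch deployment mysql-cluster-router -n dbaas --type=json -p='[
        {"op": "replace", "path": "/spec/template/spec/containers/0/resources",
         "value": {"requests": {"cpu": "25m", "memory": "128Mi"}, "limits": {"memory": "128Mi"}}}]'
    EOT
  }

  depends_on = [helm_release.mysql_cluster]
}

This keeps the patch declarative enough to survive re-applies, at the cost of an out-of-band kubectl dependency; the TODO about upstream router.resources support still stands.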
diff --git a/stacks/platform/modules/technitium/main.tf b/stacks/platform/modules/technitium/main.tf
index a8b8910a..69f3c76b 100644
--- a/stacks/platform/modules/technitium/main.tf
+++ b/stacks/platform/modules/technitium/main.tf
@@ -30,7 +30,7 @@ module "tls_secret" {
 }

 # CoreDNS Corefile - manages cluster DNS resolution
-# The viktorbarzin.lan block forwards to Technitium via LoadBalancer.
+# The viktorbarzin.lan block forwards to Technitium via ClusterIP (stable, LB-independent).
 # A template regex in the viktorbarzin.lan block short-circuits junk queries
 # caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
 # redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
@@ -74,7 +74,7 @@ resource "kubernetes_config_map" "coredns" {
          rcode NXDOMAIN
          fallthrough
        }
-       forward . 10.0.20.200 # Technitium LoadBalancer
+       forward . 10.96.0.53 # Technitium ClusterIP (technitium-dns-internal)
        cache {
          success 10000 300 6
          denial 10000 300 60
diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf
index 24dbcb1c..19c8b0a3 100644
--- a/stacks/redis/modules/redis/main.tf
+++ b/stacks/redis/modules/redis/main.tf
@@ -236,6 +236,36 @@ resource "kubernetes_deployment" "haproxy" {
   depends_on = [helm_release.redis]
 }

+# Dedicated service for HAProxy master-only routing.
+# Clients should use redis-master.redis.svc.cluster.local for write-safe connections.
+# HAProxy health-checks Redis nodes and only routes to the current master.
+resource "kubernetes_service" "redis_master" {
+  metadata {
+    name      = "redis-master"
+    namespace = kubernetes_namespace.redis.metadata[0].name
+    labels = {
+      app = "redis-haproxy"
+    }
+  }
+  spec {
+    selector = {
+      app = "redis-haproxy"
+    }
+    port {
+      name        = "redis"
+      port        = 6379
+      target_port = 6379
+    }
+    port {
+      name        = "sentinel"
+      port        = 26379
+      target_port = 26379
+    }
+  }
+
+  depends_on = [kubernetes_deployment.haproxy]
+}
+
 # The Helm chart creates a `redis` Service that selects all nodes (master + replica),
 # causing READONLY errors when clients hit the replica. We patch it post-Helm to
 # route through HAProxy instead, which health-checks and routes only to the master.
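For illustration only (nothing in this change adds it): a hypothetical consumer pointing at the new master-only Service. The deployment name, image, and environment variable below are placeholders.

resource "kubernetes_deployment" "example_app" {
  metadata {
    name      = "example-app"
    namespace = "default"
  }
  spec {
    replicas = 1
    selector {
      match_labels = { app = "example-app" }
    }
    template {
      metadata {
        labels = { app = "example-app" }
      }
      spec {
        container {
          name  = "app"
          image = "example/app:latest"
          env {
            # Write-safe endpoint: HAProxy behind this Service routes only to the current master.
            name  = "REDIS_URL"
            value = "redis://redis-master.redis.svc.cluster.local:6379"
          }
        }
      }
    }
  }
}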
diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf
index 41a9934c..569b2207 100644
--- a/stacks/technitium/modules/technitium/main.tf
+++ b/stacks/technitium/modules/technitium/main.tf
@@ -30,7 +30,7 @@ module "tls_secret" {
 }

 # CoreDNS Corefile - manages cluster DNS resolution
-# The viktorbarzin.lan block forwards to Technitium via LoadBalancer.
+# The viktorbarzin.lan block forwards to Technitium via ClusterIP (stable, LB-independent).
 # A template regex in the viktorbarzin.lan block short-circuits junk queries
 # caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
 # redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
@@ -74,7 +74,7 @@ resource "kubernetes_config_map" "coredns" {
          rcode NXDOMAIN
          fallthrough
        }
-       forward . 10.0.20.200 # Technitium LoadBalancer
+       forward . 10.96.0.53 # Technitium ClusterIP (technitium-dns-internal)
        cache {
          success 10000 300 6
          denial 10000 300 60
@@ -148,22 +148,6 @@ resource "kubernetes_deployment" "technitium" {
       }
       spec {
         affinity {
-          # Prefer nodes running Traefik for network locality
-          pod_affinity {
-            preferred_during_scheduling_ignored_during_execution {
-              weight = 100
-              pod_affinity_term {
-                label_selector {
-                  match_expressions {
-                    key      = "app.kubernetes.io/name"
-                    operator = "In"
-                    values   = ["traefik"]
-                  }
-                }
-                topology_key = "kubernetes.io/hostname"
-              }
-            }
-          }
           # Spread DNS pods across nodes for HA
           pod_anti_affinity {
             required_during_scheduling_ignored_during_execution {
@@ -225,7 +209,7 @@ resource "kubernetes_deployment" "technitium" {
         volume {
           name = "nfs-config"
           persistent_volume_claim {
-            claim_name = kubernetes_persistent_volume_claim.config_proxmox.metadata[0].name
+            claim_name = module.nfs_config.claim_name
           }
         }
         volume {
@@ -284,24 +268,58 @@ resource "kubernetes_service" "technitium-dns" {
       "app" = "technitium"
     }
     annotations = {
-      "metallb.io/loadBalancerIPs" = "10.0.20.200"
-      "metallb.io/allow-shared-ip" = "shared"
+      "metallb.io/loadBalancerIPs" = "10.0.20.201"
     }
   }
   spec {
     type = "LoadBalancer"
     port {
-      name     = "technitium-dns"
+      name     = "dns-udp"
       port     = 53
       protocol = "UDP"
     }
-    external_traffic_policy = "Cluster"
+    port {
+      name     = "dns-tcp"
+      port     = 53
+      protocol = "TCP"
+    }
+    external_traffic_policy = "Local"
     selector = {
       "dns-server" = "true"
     }
   }
 }
+
+# Fixed ClusterIP for CoreDNS forwarding - bypasses MetalLB entirely.
+# IP 10.96.0.53 is pinned so it survives Service recreation.
+resource "kubernetes_service" "technitium_dns_internal" {
+  metadata {
+    name      = "technitium-dns-internal"
+    namespace = kubernetes_namespace.technitium.metadata[0].name
+    labels = {
+      app = "technitium"
+    }
+  }
+  spec {
+    type       = "ClusterIP"
+    cluster_ip = "10.96.0.53"
+    selector = {
+      "dns-server" = "true"
+    }
+    port {
+      name     = "dns-udp"
+      port     = 53
+      protocol = "UDP"
+    }
+    port {
+      name     = "dns-tcp"
+      port     = 53
+      protocol = "TCP"
+    }
+  }
+}
+
 module "ingress" {
   source    = "../../../../modules/kubernetes/ingress_factory"
   namespace = kubernetes_namespace.technitium.metadata[0].name
diff --git a/stacks/trading-bot/main.tf b/stacks/trading-bot/main.tf
index fe0246bd..0a7230a7 100644
--- a/stacks/trading-bot/main.tf
+++ b/stacks/trading-bot/main.tf
@@ -1,3 +1,8 @@
+/*
+# TRADING-BOT STACK COMMENTED OUT - 2026-04-06
+# Deployments scaled to 0, infrastructure disabled to prevent re-creation on apply
+# To re-enable: uncomment this entire block
+
 variable "tls_secret_name" {
   type      = string
   sensitive = true
@@ -620,3 +625,4 @@ module "ingress" {
     "gethomepage.dev/pod-selector" = ""
   }
 }
+*/
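As an aside on the comment-out approach above: a common alternative, not used in this change and shown here only as a hedged sketch with illustrative names, is to gate a stack behind a boolean variable so it can be disabled and re-enabled without editing the file body.

variable "trading_bot_enabled" {
  type    = bool
  default = false
}

# Every resource in the stack gets a count guard; Terraform destroys them when disabled.
resource "kubernetes_namespace" "trading_bot" {
  count = var.trading_bot_enabled ? 1 : 0

  metadata {
    name = "trading-bot"
  }
}

Downstream references then need an index (e.g. kubernetes_namespace.trading_bot[0] or one(kubernetes_namespace.trading_bot[*])), which is the main cost of this pattern compared with commenting the file out.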
diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf
index 9dbd39da..5818bc2c 100644
--- a/stacks/vault/main.tf
+++ b/stacks/vault/main.tf
@@ -189,7 +189,7 @@ resource "vault_policy" "sops_admin" {
   policy = <<-EOT
     path "transit/encrypt/sops-state-*" { capabilities = ["update"] }
     path "transit/decrypt/sops-state-*" { capabilities = ["update"] }
-    path "transit/keys/sops-state-*" { capabilities = ["read"] }
+    path "transit/keys/sops-state-*" { capabilities = ["create", "read", "update"] }
   EOT
 }

@@ -481,7 +481,8 @@ resource "vault_database_secret_backend_connection" "postgresql" {
   backend       = vault_mount.database.path
   name          = "postgresql"
   allowed_roles = [
-    "pg-trading", "pg-health", "pg-linkwarden",
+    # "pg-trading", # Commented out 2026-04-06 - trading-bot disabled
+    "pg-health", "pg-linkwarden",
     "pg-affine", "pg-woodpecker", "pg-claude-memory"
   ]

@@ -545,6 +546,8 @@ resource "vault_database_secret_backend_static_role" "mysql_grafana" {

 # --- PostgreSQL Static Roles ---

+/*
+# Commented out 2026-04-06 - trading-bot disabled
 resource "vault_database_secret_backend_static_role" "pg_trading" {
   backend = vault_mount.database.path
   db_name = vault_database_secret_backend_connection.postgresql.name
@@ -552,6 +555,7 @@ resource "vault_database_secret_backend_static_role" "pg_trading" {
   username        = "trading"
   rotation_period = 604800
 }
+*/

 resource "vault_database_secret_backend_static_role" "pg_health" {
   backend = vault_mount.database.path
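The widened sops_admin policy now allows create, read, and update on transit/keys/sops-state-*, i.e. creating new per-stack sops keys as well as reading them. Purely as an illustration (not part of this change; the mount path and key name below are assumptions), such a key could also be pre-provisioned with the Vault provider:

# Sketch only: pre-create a per-stack sops transit key, assuming transit is mounted at "transit/".
resource "vault_transit_secret_backend_key" "sops_state_example" {
  backend          = "transit"
  name             = "sops-state-example-stack"
  deletion_allowed = false
}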