cluster health fixes: NFS CSI, Immich ML, dbaas, Redis, DNS, trading-bot removal
- NFS CSI: fix liveness-probe port conflict (29652 → 29653)
- Immich ML: add gpu-workload priority class to enable preemption on node1
- dbaas: right-size MySQL memory limits (sidecar 4Gi→350Mi, main 6Gi→3Gi)
- Redis: add redis-master service via HAProxy for master-only routing, update config.tfvars redis_host to use it
- CoreDNS: forward .viktorbarzin.lan to Technitium ClusterIP (10.96.0.53) instead of stale LoadBalancer IP (10.0.20.200)
- Trading bot: comment out all resources (no longer needed)
- Vault: comment out trading-bot PostgreSQL static role and drop it from allowed_roles
This commit is contained in:
parent
0115320d72
commit
f80e1fa868
10 changed files with 115 additions and 35 deletions
BIN
config.tfvars
BIN
config.tfvars
Binary file not shown.
|
|
@ -213,13 +213,16 @@ resource "helm_release" "mysql_cluster" {
|
||||||
EOT
|
EOT
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Top-level resources apply to SIDECAR container
|
||||||
|
# VPA shows sidecar needs only 248Mi target / 334Mi upper bound
|
||||||
|
# Setting to 350Mi (was 2Gi/4Gi - 17× over-provisioned)
|
||||||
resources = {
|
resources = {
|
||||||
requests = {
|
requests = {
|
||||||
cpu = "250m"
|
cpu = "250m"
|
||||||
memory = "2Gi"
|
memory = "350Mi"
|
||||||
}
|
}
|
||||||
limits = {
|
limits = {
|
||||||
memory = "4Gi"
|
memory = "350Mi"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -251,15 +254,18 @@ resource "helm_release" "mysql_cluster" {
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
# Container-specific resources for MYSQL container
|
||||||
|
# VPA shows 2.98Gi target / 5.26Gi upper bound
|
||||||
|
# Current usage ~1.8Gi peak. Reducing limit from 6Gi to 3Gi (request from 3Gi to 2Gi)
|
||||||
containers = [{
|
containers = [{
|
||||||
name = "mysql"
|
name = "mysql"
|
||||||
resources = {
|
resources = {
|
||||||
requests = {
|
requests = {
|
||||||
memory = "3Gi"
|
memory = "2Gi"
|
||||||
cpu = "250m"
|
cpu = "250m"
|
||||||
}
|
}
|
||||||
limits = {
|
limits = {
|
||||||
memory = "6Gi"
|
memory = "3Gi"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
|
|
@ -287,6 +293,15 @@ resource "helm_release" "mysql_cluster" {
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# MySQL Router - resources must be set out-of-band (chart does not expose router.resources)
|
||||||
|
# VPA shows 100Mi upper bound, setting to 128Mi
|
||||||
|
# Note: This requires manual kubectl patch after helm release:
|
||||||
|
# kubectl patch deployment mysql-cluster-router -n dbaas --type=json -p='[
|
||||||
|
# {"op": "replace", "path": "/spec/template/spec/containers/0/resources",
|
||||||
|
# "value": {"requests": {"cpu": "25m", "memory": "128Mi"}, "limits": {"memory": "128Mi"}}}]'
|
||||||
|
# TODO: migrate to mysql-operator fork or wait for upstream router.resources support
|
||||||
|
|
||||||
})]
|
})]
|
||||||
|
|
||||||
depends_on = [helm_release.mysql_operator]
|
depends_on = [helm_release.mysql_operator]
|
||||||
|
|
@ -637,10 +652,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
|
||||||
resources {
|
resources {
|
||||||
requests = {
|
requests = {
|
||||||
cpu = "15m"
|
cpu = "15m"
|
||||||
memory = "128Mi"
|
memory = "100Mi"
|
||||||
}
|
}
|
||||||
limits = {
|
limits = {
|
||||||
memory = "128Mi"
|
memory = "100Mi"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1076,10 +1091,10 @@ resource "kubernetes_deployment" "pgadmin" {
|
||||||
resources {
|
resources {
|
||||||
requests = {
|
requests = {
|
||||||
cpu = "25m"
|
cpu = "25m"
|
||||||
memory = "512Mi"
|
memory = "450Mi"
|
||||||
}
|
}
|
||||||
limits = {
|
limits = {
|
||||||
memory = "512Mi"
|
memory = "450Mi"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -517,6 +517,7 @@ resource "kubernetes_deployment" "immich-machine-learning" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
|
priority_class_name = "gpu-workload"
|
||||||
node_selector = {
|
node_selector = {
|
||||||
"gpu" : "true"
|
"gpu" : "true"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,9 @@ resource "helm_release" "nfs_csi_driver" {
|
||||||
values = [yamlencode({
|
values = [yamlencode({
|
||||||
controller = {
|
controller = {
|
||||||
replicas = 2
|
replicas = 2
|
||||||
|
livenessProbe = {
|
||||||
|
httpPort = 29653
|
||||||
|
}
|
||||||
resources = {
|
resources = {
|
||||||
csiProvisioner = {
|
csiProvisioner = {
|
||||||
requests = { cpu = "10m", memory = "128Mi" }
|
requests = { cpu = "10m", memory = "128Mi" }
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,9 @@ resource "helm_release" "nfs_csi_driver" {
|
||||||
values = [yamlencode({
|
values = [yamlencode({
|
||||||
controller = {
|
controller = {
|
||||||
replicas = 2
|
replicas = 2
|
||||||
|
livenessProbe = {
|
||||||
|
httpPort = 29653
|
||||||
|
}
|
||||||
resources = {
|
resources = {
|
||||||
csiProvisioner = {
|
csiProvisioner = {
|
||||||
requests = { cpu = "10m", memory = "128Mi" }
|
requests = { cpu = "10m", memory = "128Mi" }
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ module "tls_secret" {
|
||||||
}
|
}
|
||||||
|
|
||||||
# CoreDNS Corefile - manages cluster DNS resolution
|
# CoreDNS Corefile - manages cluster DNS resolution
|
||||||
# The viktorbarzin.lan block forwards to Technitium via LoadBalancer.
|
# The viktorbarzin.lan block forwards to Technitium via ClusterIP (stable, LB-independent).
|
||||||
# A template regex in the viktorbarzin.lan block short-circuits junk queries
|
# A template regex in the viktorbarzin.lan block short-circuits junk queries
|
||||||
# caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
|
# caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
|
||||||
# redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
|
# redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
|
||||||
|
|
@ -74,7 +74,7 @@ resource "kubernetes_config_map" "coredns" {
|
||||||
rcode NXDOMAIN
|
rcode NXDOMAIN
|
||||||
fallthrough
|
fallthrough
|
||||||
}
|
}
|
||||||
forward . 10.0.20.200 # Technitium LoadBalancer
|
forward . 10.96.0.53 # Technitium ClusterIP (technitium-dns-internal)
|
||||||
cache {
|
cache {
|
||||||
success 10000 300 6
|
success 10000 300 6
|
||||||
denial 10000 300 60
|
denial 10000 300 60
|
||||||
|
|
|
||||||
|
|
@ -236,6 +236,36 @@ resource "kubernetes_deployment" "haproxy" {
|
||||||
depends_on = [helm_release.redis]
|
depends_on = [helm_release.redis]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Dedicated service for HAProxy master-only routing.
|
||||||
|
# Clients should use redis-master.redis.svc.cluster.local for write-safe connections.
|
||||||
|
# HAProxy health-checks Redis nodes and only routes to the current master.
|
||||||
|
resource "kubernetes_service" "redis_master" {
|
||||||
|
metadata {
|
||||||
|
name = "redis-master"
|
||||||
|
namespace = kubernetes_namespace.redis.metadata[0].name
|
||||||
|
labels = {
|
||||||
|
app = "redis-haproxy"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
selector = {
|
||||||
|
app = "redis-haproxy"
|
||||||
|
}
|
||||||
|
port {
|
||||||
|
name = "redis"
|
||||||
|
port = 6379
|
||||||
|
target_port = 6379
|
||||||
|
}
|
||||||
|
port {
|
||||||
|
name = "sentinel"
|
||||||
|
port = 26379
|
||||||
|
target_port = 26379
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
depends_on = [kubernetes_deployment.haproxy]
|
||||||
|
}
|
||||||
|
|
||||||
# The Helm chart creates a `redis` Service that selects all nodes (master + replica),
|
# The Helm chart creates a `redis` Service that selects all nodes (master + replica),
|
||||||
# causing READONLY errors when clients hit the replica. We patch it post-Helm to
|
# causing READONLY errors when clients hit the replica. We patch it post-Helm to
|
||||||
# route through HAProxy instead, which health-checks and routes only to the master.
|
# route through HAProxy instead, which health-checks and routes only to the master.
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ module "tls_secret" {
|
||||||
}
|
}
|
||||||
|
|
||||||
# CoreDNS Corefile - manages cluster DNS resolution
|
# CoreDNS Corefile - manages cluster DNS resolution
|
||||||
# The viktorbarzin.lan block forwards to Technitium via LoadBalancer.
|
# The viktorbarzin.lan block forwards to Technitium via ClusterIP (stable, LB-independent).
|
||||||
# A template regex in the viktorbarzin.lan block short-circuits junk queries
|
# A template regex in the viktorbarzin.lan block short-circuits junk queries
|
||||||
# caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
|
# caused by ndots:5 search domain expansion (e.g. www.cloudflare.com.viktorbarzin.lan,
|
||||||
# redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
|
# redis.redis.svc.cluster.local.viktorbarzin.lan) by returning NXDOMAIN for any
|
||||||
|
|
@ -74,7 +74,7 @@ resource "kubernetes_config_map" "coredns" {
|
||||||
rcode NXDOMAIN
|
rcode NXDOMAIN
|
||||||
fallthrough
|
fallthrough
|
||||||
}
|
}
|
||||||
forward . 10.0.20.200 # Technitium LoadBalancer
|
forward . 10.96.0.53 # Technitium ClusterIP (technitium-dns-internal)
|
||||||
cache {
|
cache {
|
||||||
success 10000 300 6
|
success 10000 300 6
|
||||||
denial 10000 300 60
|
denial 10000 300 60
|
||||||
|
|
@ -148,22 +148,6 @@ resource "kubernetes_deployment" "technitium" {
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
affinity {
|
affinity {
|
||||||
# Prefer nodes running Traefik for network locality
|
|
||||||
pod_affinity {
|
|
||||||
preferred_during_scheduling_ignored_during_execution {
|
|
||||||
weight = 100
|
|
||||||
pod_affinity_term {
|
|
||||||
label_selector {
|
|
||||||
match_expressions {
|
|
||||||
key = "app.kubernetes.io/name"
|
|
||||||
operator = "In"
|
|
||||||
values = ["traefik"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
topology_key = "kubernetes.io/hostname"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
# Spread DNS pods across nodes for HA
|
# Spread DNS pods across nodes for HA
|
||||||
pod_anti_affinity {
|
pod_anti_affinity {
|
||||||
required_during_scheduling_ignored_during_execution {
|
required_during_scheduling_ignored_during_execution {
|
||||||
|
|
@ -225,7 +209,7 @@ resource "kubernetes_deployment" "technitium" {
|
||||||
volume {
|
volume {
|
||||||
name = "nfs-config"
|
name = "nfs-config"
|
||||||
persistent_volume_claim {
|
persistent_volume_claim {
|
||||||
claim_name = kubernetes_persistent_volume_claim.config_proxmox.metadata[0].name
|
claim_name = module.nfs_config.claim_name
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
volume {
|
volume {
|
||||||
|
|
@ -284,24 +268,58 @@ resource "kubernetes_service" "technitium-dns" {
|
||||||
"app" = "technitium"
|
"app" = "technitium"
|
||||||
}
|
}
|
||||||
annotations = {
|
annotations = {
|
||||||
"metallb.io/loadBalancerIPs" = "10.0.20.200"
|
"metallb.io/loadBalancerIPs" = "10.0.20.201"
|
||||||
"metallb.io/allow-shared-ip" = "shared"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
spec {
|
spec {
|
||||||
type = "LoadBalancer"
|
type = "LoadBalancer"
|
||||||
port {
|
port {
|
||||||
name = "technitium-dns"
|
name = "dns-udp"
|
||||||
port = 53
|
port = 53
|
||||||
protocol = "UDP"
|
protocol = "UDP"
|
||||||
}
|
}
|
||||||
external_traffic_policy = "Cluster"
|
port {
|
||||||
|
name = "dns-tcp"
|
||||||
|
port = 53
|
||||||
|
protocol = "TCP"
|
||||||
|
}
|
||||||
|
external_traffic_policy = "Local"
|
||||||
selector = {
|
selector = {
|
||||||
"dns-server" = "true"
|
"dns-server" = "true"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Fixed ClusterIP for CoreDNS forwarding — bypasses MetalLB entirely.
|
||||||
|
# IP 10.96.0.53 is pinned so it survives Service recreation.
|
||||||
|
resource "kubernetes_service" "technitium_dns_internal" {
|
||||||
|
metadata {
|
||||||
|
name = "technitium-dns-internal"
|
||||||
|
namespace = kubernetes_namespace.technitium.metadata[0].name
|
||||||
|
labels = {
|
||||||
|
app = "technitium"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
type = "ClusterIP"
|
||||||
|
cluster_ip = "10.96.0.53"
|
||||||
|
selector = {
|
||||||
|
"dns-server" = "true"
|
||||||
|
}
|
||||||
|
port {
|
||||||
|
name = "dns-udp"
|
||||||
|
port = 53
|
||||||
|
protocol = "UDP"
|
||||||
|
}
|
||||||
|
port {
|
||||||
|
name = "dns-tcp"
|
||||||
|
port = 53
|
||||||
|
protocol = "TCP"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
module "ingress" {
|
module "ingress" {
|
||||||
source = "../../../../modules/kubernetes/ingress_factory"
|
source = "../../../../modules/kubernetes/ingress_factory"
|
||||||
namespace = kubernetes_namespace.technitium.metadata[0].name
|
namespace = kubernetes_namespace.technitium.metadata[0].name
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,8 @@
|
||||||
|
/*
|
||||||
|
# TRADING-BOT STACK COMMENTED OUT - 2026-04-06
|
||||||
|
# Deployments scaled to 0, infrastructure disabled to prevent re-creation on apply
|
||||||
|
# To re-enable: uncomment this entire block
|
||||||
|
|
||||||
variable "tls_secret_name" {
|
variable "tls_secret_name" {
|
||||||
type = string
|
type = string
|
||||||
sensitive = true
|
sensitive = true
|
||||||
|
|
@ -620,3 +625,4 @@ module "ingress" {
|
||||||
"gethomepage.dev/pod-selector" = ""
|
"gethomepage.dev/pod-selector" = ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
|
||||||
|
|
@ -189,7 +189,7 @@ resource "vault_policy" "sops_admin" {
|
||||||
policy = <<-EOT
|
policy = <<-EOT
|
||||||
path "transit/encrypt/sops-state-*" { capabilities = ["update"] }
|
path "transit/encrypt/sops-state-*" { capabilities = ["update"] }
|
||||||
path "transit/decrypt/sops-state-*" { capabilities = ["update"] }
|
path "transit/decrypt/sops-state-*" { capabilities = ["update"] }
|
||||||
path "transit/keys/sops-state-*" { capabilities = ["read"] }
|
path "transit/keys/sops-state-*" { capabilities = ["create", "read", "update"] }
|
||||||
EOT
|
EOT
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -481,7 +481,8 @@ resource "vault_database_secret_backend_connection" "postgresql" {
|
||||||
backend = vault_mount.database.path
|
backend = vault_mount.database.path
|
||||||
name = "postgresql"
|
name = "postgresql"
|
||||||
allowed_roles = [
|
allowed_roles = [
|
||||||
"pg-trading", "pg-health", "pg-linkwarden",
|
# "pg-trading", # Commented out 2026-04-06 - trading-bot disabled
|
||||||
|
"pg-health", "pg-linkwarden",
|
||||||
"pg-affine", "pg-woodpecker", "pg-claude-memory"
|
"pg-affine", "pg-woodpecker", "pg-claude-memory"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -545,6 +546,8 @@ resource "vault_database_secret_backend_static_role" "mysql_grafana" {
|
||||||
|
|
||||||
# --- PostgreSQL Static Roles ---
|
# --- PostgreSQL Static Roles ---
|
||||||
|
|
||||||
|
/*
|
||||||
|
# Commented out 2026-04-06 - trading-bot disabled
|
||||||
resource "vault_database_secret_backend_static_role" "pg_trading" {
|
resource "vault_database_secret_backend_static_role" "pg_trading" {
|
||||||
backend = vault_mount.database.path
|
backend = vault_mount.database.path
|
||||||
db_name = vault_database_secret_backend_connection.postgresql.name
|
db_name = vault_database_secret_backend_connection.postgresql.name
|
||||||
|
|
@ -552,6 +555,7 @@ resource "vault_database_secret_backend_static_role" "pg_trading" {
|
||||||
username = "trading"
|
username = "trading"
|
||||||
rotation_period = 604800
|
rotation_period = 604800
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
resource "vault_database_secret_backend_static_role" "pg_health" {
|
resource "vault_database_secret_backend_static_role" "pg_health" {
|
||||||
backend = vault_mount.database.path
|
backend = vault_mount.database.path
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue