extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip]

Phase 2 of platform stack split. 5 more modules extracted into
independent stacks. All applied successfully with zero destroys.
Cloudflared now reads k8s_users from Vault directly to compute
user_domains. Woodpecker pipeline runs all 8 extracted stacks
in parallel. Memory bumped to 6Gi for 9 concurrent TF processes.
Platform reduced from 27 to 19 modules.
This commit is contained in:
Viktor Barzin 2026-03-17 21:34:11 +00:00
parent 3c804aedf8
commit ae36dc253b
73 changed files with 166093 additions and 96 deletions

View file

@ -32,9 +32,9 @@ steps:
kubernetes:
resources:
requests:
memory: 2Gi
memory: 3Gi
limits:
memory: 4Gi
memory: 6Gi
commands:
- "apk update && apk add curl unzip git openssh-client"
# Install Terraform
@ -45,10 +45,15 @@ steps:
- "chmod 755 /usr/local/bin/terragrunt"
# Source Vault token
- "source .vault-env"
# Apply extracted stacks in parallel (slow modules)
# Apply extracted stacks in parallel
# Each "cd <dir> && terragrunt ... &" runs in a backgrounded subshell, so the
# parent shell's working directory is unchanged between these commands.
- "cd stacks/dbaas && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/authentik && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/crowdsec && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/monitoring && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/nvidia && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/mailserver && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/cloudflared && terragrunt apply --non-interactive -auto-approve &"
- "cd stacks/kyverno && terragrunt apply --non-interactive -auto-approve &"
# Apply platform stack (remaining core infrastructure services)
# NOTE(review): this runs in the foreground while the 8 background jobs are
# still in flight (it is the 9th concurrent terraform process).
- "cd stacks/platform && terragrunt apply --non-interactive -auto-approve"
# NOTE(review): POSIX "wait" with no operands always exits 0, so a failure in
# any backgrounded apply does NOT fail this pipeline step. Capture each job's
# PID and "wait $pid" per job (or collect statuses) if failures should be
# propagated — confirm the intended failure semantics.
- "wait"

View file

@ -0,0 +1,42 @@
# =============================================================================
# Cloudflared Stack — Cloudflare tunnel + DNS records
# =============================================================================
variable "tls_secret_name" { type = string }
variable "cloudflare_email" { type = string }
variable "cloudflare_account_id" { type = string }
variable "cloudflare_zone_id" { type = string }
variable "cloudflare_tunnel_id" { type = string }
variable "public_ip" { type = string }
variable "cloudflare_proxied_names" {}
variable "cloudflare_non_proxied_names" {}
# Shared platform secrets from Vault. This stack reads the same
# "secret/platform" KV entry the platform stack uses (k8s_users,
# cloudflare_api_key, cloudflare_tunnel_token).
data "vault_kv_secret_v2" "secrets" {
mount = "secret"
name = "platform"
}
locals {
# k8s_users is stored as a JSON-encoded map in Vault; decode it here.
k8s_users = jsondecode(data.vault_kv_secret_v2.secrets.data["k8s_users"])
# Collect the "domains" list of every namespace-owner user.
# NOTE(review): `user.role` errors if a user entry has no "role" key —
# confirm every Vault entry carries one, or guard with lookup(user, "role", "").
user_domains = flatten([
for name, user in local.k8s_users : lookup(user, "domains", [])
if user.role == "namespace-owner"
])
}
# Cloudflare tunnel + DNS records, deployed as a core-tier service.
# API key and tunnel token come from Vault rather than from stack variables.
module "cloudflared" {
source = "./modules/cloudflared"
tier = local.tiers.core
tls_secret_name = var.tls_secret_name
cloudflare_api_key = data.vault_kv_secret_v2.secrets.data["cloudflare_api_key"]
cloudflare_email = var.cloudflare_email
cloudflare_account_id = var.cloudflare_account_id
cloudflare_zone_id = var.cloudflare_zone_id
cloudflare_tunnel_id = var.cloudflare_tunnel_id
public_ip = var.public_ip
# user_domains derives from Vault data and is therefore sensitive;
# nonsensitive() unwraps it — presumably so the module can use the list in
# for_each keys. NOTE(review): confirm these domain names are safe to show
# in plan output.
cloudflare_proxied_names = concat(var.cloudflare_proxied_names, nonsensitive(local.user_domains))
cloudflare_non_proxied_names = var.cloudflare_non_proxied_names
cloudflare_tunnel_token = data.vault_kv_secret_v2.secrets.data["cloudflare_tunnel_token"]
}

View file

@ -0,0 +1,159 @@
# Contents for cloudflare account
# Cloudflare credentials.
# The global API key is a secret — mark it sensitive so it is redacted from
# plan/apply output (account_id and tunnel_id below are already marked).
variable "cloudflare_api_key" {
  sensitive = true
}
variable "cloudflare_email" {}
# Hostnames served through the Cloudflare tunnel (proxied CNAME records).
variable "cloudflare_proxied_names" { type = list(string) }
# Hostnames resolved directly to public_ip (non-proxied A records).
variable "cloudflare_non_proxied_names" { type = list(string) }
variable "cloudflare_zone_id" {
  description = "Zone ID for your domain"
  type        = string
}
variable "cloudflare_account_id" {
  type      = string
  sensitive = true
}
variable "cloudflare_tunnel_id" {
  type      = string
  sensitive = true
}
# Static public IP used as the target of the non-proxied A records.
variable "public_ip" {
  type = string
}
# Pin the provider to the v4 major series — resource schemas (e.g. the
# cloudflare_record "content" attribute) differ across major versions.
terraform {
required_providers {
cloudflare = {
source = "cloudflare/cloudflare"
version = "~> 4"
}
}
}
# Global API key authentication.
# NOTE(review): the global key has full-account access; a scoped API token
# would be safer if the permission issues can be resolved.
provider "cloudflare" {
api_key = var.cloudflare_api_key # I gave up on getting the permissions on the token...
email = var.cloudflare_email
}
locals {
  # Identity maps (hostname => hostname) so the DNS record resources can use
  # for_each with stable, human-readable keys.
  cloudflare_proxied_names_map     = { for hostname in var.cloudflare_proxied_names : hostname => hostname }
  cloudflare_non_proxied_names_map = { for hostname in var.cloudflare_non_proxied_names : hostname => hostname }
}
# Cloudflare tunnel ingress configuration: every proxied hostname is routed
# to the on-prem ingress at 10.0.20.202:443.
resource "cloudflare_zero_trust_tunnel_cloudflared_config" "sof" {
account_id = var.cloudflare_account_id
tunnel_id = var.cloudflare_tunnel_id
config {
warp_routing {
enabled = true
}
dynamic "ingress_rule" {
for_each = toset(var.cloudflare_proxied_names)
content {
# The apex appears in the list as "viktorbarzin.me" itself; every other
# entry is a bare subdomain label that gets the zone suffix appended.
hostname = ingress_rule.value == "viktorbarzin.me" ? ingress_rule.value : "${ingress_rule.value}.viktorbarzin.me"
path = "/"
# Internal ingress; its certificate is not publicly trusted, hence
# no_tls_verify below.
service = "https://10.0.20.202:443"
origin_request {
no_tls_verify = true
}
}
}
# Catch-all: anything not matched above returns 404.
ingress_rule {
service = "http_status:404"
}
}
}
# Proxied CNAME records: each hostname points at the tunnel endpoint.
resource "cloudflare_record" "dns_record" {
# count = length(var.cloudflare_proxied_names)
# name = var.cloudflare_proxied_names[count.index]
for_each = local.cloudflare_proxied_names_map
name = each.key
content = "${var.cloudflare_tunnel_id}.cfargotunnel.com"
proxied = true
ttl = 1 # ttl = 1 means "automatic" in Cloudflare
type = "CNAME"
zone_id = var.cloudflare_zone_id
}
# Non-proxied A records resolving straight to the public IP (no tunnel).
resource "cloudflare_record" "non_proxied_dns_record" {
# count = length(var.cloudflare_non_proxied_names)
# name = var.cloudflare_non_proxied_names[count.index]
for_each = local.cloudflare_non_proxied_names_map
name = each.key
# content = var.non_proxied_names[count.index].ip
content = var.public_ip
proxied = false
ttl = 1 # automatic
type = "A"
zone_id = var.cloudflare_zone_id
}
# MX: route mail for the apex domain to the self-hosted mail server.
resource "cloudflare_record" "mail" {
content = "mail.viktorbarzin.me"
name = "viktorbarzin.me"
proxied = false
ttl = 1
type = "MX"
priority = 1
zone_id = var.cloudflare_zone_id
}
# DKIM public key for the "s1" selector.
# NOTE(review): "priority" is only meaningful for MX/SRV records; on the
# TXT/A records below it is ignored — harmless, but it can be dropped.
resource "cloudflare_record" "mail_domainkey" {
content = "\"k=rsa; p=MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDIDLB8mhAHNqs1s6GeZMQHOxWweoNKIrqo5tqRM3yFilgfPUX34aTIXNZg9xAmlK+2S/xXO1ymt127ZGMjnoFKOEP8/uZ54iHTCnioHaPZWMfJ7o6TYIXjr+9ShKfoJxZLv7lHJ2wKQK3yOw4lg4cvja5nxQ6fNoGRwo+mQ/mgJQIDAQAB\""
name = "s1._domainkey.viktorbarzin.me"
proxied = false
ttl = 1
type = "TXT"
priority = 1
zone_id = var.cloudflare_zone_id
}
# SPF: only Mailgun may send on behalf of the domain (softfail otherwise).
resource "cloudflare_record" "mail_spf" {
content = "\"v=spf1 include:mailgun.org ~all\""
name = "viktorbarzin.me"
proxied = false
ttl = 1
type = "TXT"
priority = 1
zone_id = var.cloudflare_zone_id
}
# DMARC: quarantine failures; aggregate (rua) and forensic (ruf) reports go
# to Mailgun / OnDMARC / postmaster.
resource "cloudflare_record" "mail_dmarc" {
content = "\"v=DMARC1; p=quarantine; pct=100; fo=1; ri=3600; sp=quarantine; adkim=r; aspf=r; rua=mailto:e21c0ff8@dmarc.mailgun.org,mailto:adb84997@inbox.ondmarc.com; ruf=mailto:e21c0ff8@dmarc.mailgun.org,mailto:adb84997@inbox.ondmarc.com,mailto:postmaster@viktorbarzin.me;\""
name = "_dmarc.viktorbarzin.me"
proxied = false
ttl = 1
type = "TXT"
priority = 1
zone_id = var.cloudflare_zone_id
}
# Keyserver hosted on an external VPS — not behind the tunnel.
resource "cloudflare_record" "keyserver" {
content = "130.162.165.220" # Oracle VPS
name = "keyserver.viktorbarzin.me"
proxied = false
ttl = 3600
type = "A"
priority = 1
zone_id = var.cloudflare_zone_id
}
# Enable HTTP/3 (QUIC) for Cloudflare-proxied domains
resource "cloudflare_zone_settings_override" "http3" {
zone_id = var.cloudflare_zone_id
settings {
http3 = "on"
}
}

View file

@ -0,0 +1,130 @@
# Contents for cloudflare tunnel
# Name of the wildcard TLS secret mirrored into this namespace.
variable "tls_secret_name" { type = string }
# Tunnel token cloudflared uses to authenticate to Cloudflare. It is injected
# into the pod as the TUNNEL_TOKEN env var below; mark it sensitive so it is
# redacted from plan/apply output.
variable "cloudflare_tunnel_token" {
  type      = string
  sensitive = true
}
# Dedicated namespace; the tier label feeds the Kyverno governance policies.
resource "kubernetes_namespace" "cloudflared" {
metadata {
name = "cloudflared"
labels = {
tier = var.tier
}
}
}
# Governance tier for this namespace (e.g. "0-core").
variable "tier" { type = string }
# Mirror the cluster-wide TLS secret into this namespace.
module "tls_secret" {
source = "../../../../modules/kubernetes/setup_tls_secret"
namespace = kubernetes_namespace.cloudflared.metadata[0].name
tls_secret_name = var.tls_secret_name
}
# cloudflared tunnel connector: 3 replicas spread across nodes for HA.
resource "kubernetes_deployment" "cloudflared" {
metadata {
name = "cloudflared"
namespace = kubernetes_namespace.cloudflared.metadata[0].name
labels = {
app = "cloudflared"
tier = var.tier
}
annotations = {
# Let stakater/reloader restart pods when referenced config changes.
"reloader.stakater.com/search" = "true"
}
}
spec {
replicas = 3
strategy {
type = "RollingUpdate"
}
selector {
match_labels = {
app = "cloudflared"
}
}
template {
metadata {
labels = {
app = "cloudflared"
}
}
spec {
# Prefer one replica per node, but still schedule when impossible.
topology_spread_constraint {
max_skew = 1
topology_key = "kubernetes.io/hostname"
when_unsatisfiable = "ScheduleAnyway"
label_selector {
match_labels = {
app = "cloudflared"
}
}
}
container {
# image = "wisdomsky/cloudflared-web:latest"
# NOTE(review): image tag is unpinned (implicit :latest) — pin a version
# for reproducible rollouts.
image = "cloudflare/cloudflared"
name = "cloudflared"
command = ["cloudflared", "tunnel", "run"]
# NOTE(review): the token is set as a plain env var from TF state;
# consider a Kubernetes Secret + value_from instead.
env {
name = "TUNNEL_TOKEN"
value = var.cloudflare_tunnel_token
}
# NOTE(review): purpose of 14333 is not evident from this file — the
# tunnel itself makes outbound connections only; confirm something
# actually listens on this port (see the Service below).
port {
container_port = 14333
}
resources {
requests = {
cpu = "15m"
memory = "128Mi"
}
limits = {
memory = "128Mi"
}
}
}
# ndots:2 avoids wasteful search-domain expansion for external lookups.
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
# Allow at most one cloudflared replica to be voluntarily disrupted at a time,
# so the tunnel keeps quorum during node drains.
resource "kubernetes_pod_disruption_budget_v1" "cloudflared" {
metadata {
name = "cloudflared"
namespace = kubernetes_namespace.cloudflared.metadata[0].name
}
spec {
max_unavailable = "1"
selector {
match_labels = {
app = "cloudflared"
}
}
}
}
# ClusterIP service in front of the cloudflared pods, port 80 -> 14333.
# NOTE(review): confirm cloudflared actually serves traffic on 14333 (see the
# container-port note on the Deployment) — otherwise this Service routes to a
# closed port.
resource "kubernetes_service" "cloudflared" {
metadata {
name = "cloudflared"
namespace = kubernetes_namespace.cloudflared.metadata[0].name
labels = {
"app" = "cloudflared"
}
}
spec {
selector = {
app = "cloudflared"
}
port {
name = "http"
target_port = 14333
port = 80
protocol = "TCP"
}
}
}

1
stacks/cloudflared/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Inherit the shared root Terragrunt configuration (backend, provider setup).
include "root" {
path = find_in_parent_folders()
}
# Ordering-only dependency: ensures the infra stack is applied first without
# consuming any of its outputs.
dependency "infra" {
config_path = "../infra"
skip_outputs = true
}

View file

@ -0,0 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
# Tier label values shared by the stacks. NOTE(review): this file is
# generated — keep the source template in sync with the governance_tiers
# list in the kyverno module rather than editing this copy.
locals {
tiers = {
core = "0-core"
cluster = "1-cluster"
gpu = "2-gpu"
edge = "3-edge"
aux = "4-aux"
}
}

7
stacks/kyverno/main.tf Normal file
View file

@ -0,0 +1,7 @@
# =============================================================================
# Kyverno Stack — Policy engine
# =============================================================================
# Kyverno policy engine plus the cluster-wide governance policies.
module "kyverno" {
source = "./modules/kyverno"
}

View file

@ -0,0 +1,72 @@
# =============================================================================
# Pod Dependency Init Container Injection
# =============================================================================
# Reads the annotation dependency.kyverno.io/wait-for from pods and injects
# init containers that wait for each listed dependency to be reachable.
#
# Usage:
# annotations:
# dependency.kyverno.io/wait-for: "postgresql.dbaas:5432,redis.redis:6379"
#
# Each comma-separated entry becomes a busybox init container that runs
# `nc -z <host> <port>` in a loop until the dependency is reachable.
# Existing init containers are preserved — Kyverno appends to the array.
# ClusterPolicy: for each host:port listed in the wait-for annotation, inject
# a busybox init container that blocks until the dependency is reachable.
resource "kubernetes_manifest" "inject_dependency_init_containers" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "inject-dependency-init-containers"
annotations = {
"policies.kyverno.io/title" = "Inject Dependency Init Containers"
"policies.kyverno.io/description" = "Injects wait-for init containers based on dependency.kyverno.io/wait-for pod annotation. Each comma-separated host:port entry becomes a busybox init container that blocks until the dependency is reachable via nc -z."
}
}
spec = {
rules = [
{
name = "wait-for-dependencies"
match = {
any = [
{
resources = {
kinds = ["Pod"]
operations = ["CREATE"]
}
}
]
}
# Only act on pods that carry a non-empty wait-for annotation.
preconditions = {
all = [
{
key = "{{ request.object.metadata.annotations.\"dependency.kyverno.io/wait-for\" || '' }}"
operator = "NotEquals"
value = ""
}
]
}
mutate = {
foreach = [
{
# NOTE(review): split(@, ',') does not trim whitespace — entries
# written as "a:1, b:2" would yield a host of " b". Annotation
# values must be comma-separated with no spaces.
list = "request.object.metadata.annotations.\"dependency.kyverno.io/wait-for\" | split(@, ',')"
patchStrategicMerge = {
spec = {
initContainers = [
{
# Container name derives from the host with dots replaced by
# dashes; hosts must otherwise be valid DNS-1123 label chars.
name = "wait-for-{{ element | split(@, ':') | [0] | replace_all(@, '.', '-') }}"
image = "busybox:1.37"
command = ["sh", "-c", "until nc -z {{ element | split(@, ':') | [0] }} {{ element | split(@, ':') | [1] }}; do echo waiting for {{ element }}; sleep 2; done"]
}
]
}
}
}
]
}
}
]
}
}
}

View file

@ -0,0 +1,216 @@
# Kyverno's own namespace. Istio sidecar injection is disabled here —
# NOTE(review): presumably to keep the admission webhooks off the mesh;
# confirm before changing.
resource "kubernetes_namespace" "kyverno" {
metadata {
name = "kyverno"
labels = {
"istio-injection" : "disabled"
}
}
}
resource "helm_release" "kyverno" {
namespace = kubernetes_namespace.kyverno.metadata[0].name
create_namespace = false
name = "kyverno"
atomic = true
repository = "https://kyverno.github.io/kyverno/"
chart = "kyverno"
version = "3.6.1"
values = [yamlencode({
# When Kyverno is unavailable, allow pod creation to proceed without
# mutation/validation rather than blocking all admissions cluster-wide.
features = {
forceFailurePolicyIgnore = {
enabled = true
}
policyReports = {
enabled = false
}
}
reportsController = {
resources = {
limits = {
memory = "512Mi"
}
requests = {
cpu = "100m"
memory = "384Mi"
}
}
}
backgroundController = {
resources = {
limits = {
memory = "384Mi"
}
requests = {
cpu = "100m"
memory = "384Mi"
}
}
}
cleanupController = {
resources = {
limits = {
memory = "192Mi"
}
requests = {
cpu = "100m"
memory = "192Mi"
}
}
}
admissionController = {
replicas = 2
updateStrategy = {
type = "RollingUpdate"
rollingUpdate = {
maxSurge = 0
maxUnavailable = 1
}
}
container = {
resources = {
limits = {
memory = "256Mi"
}
requests = {
cpu = "100m"
memory = "256Mi"
}
}
}
# More tolerant liveness probe — API server slowness shouldn't kill the pod
livenessProbe = {
httpGet = {
path = "/health/liveness"
port = 9443
scheme = "HTTPS"
}
initialDelaySeconds = 15
periodSeconds = 30
timeoutSeconds = 5
failureThreshold = 4
successThreshold = 1
}
# Spread replicas across nodes for HA
topologySpreadConstraints = [
{
maxSkew = 1
topologyKey = "kubernetes.io/hostname"
whenUnsatisfiable = "DoNotSchedule"
labelSelector = {
matchLabels = {
"app.kubernetes.io/component" = "admission-controller"
"app.kubernetes.io/instance" = "kyverno"
}
}
}
]
}
})]
}
# To unlabel all:
# kubectl label deployment,statefulset,daemonset --all-namespaces -l tier tier-
#
# Uses namespaceSelector to match tiers — no API call needed.
# One rule per tier so Kyverno resolves the tier value from its informer cache.
# ClusterPolicy: copy the namespace's "tier" label onto workload controllers
# in that namespace. One rule per tier value (see comment above).
resource "kubernetes_manifest" "mutate_tier_from_namespace" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "sync-tier-label-from-namespace"
}
spec = {
rules = [for tier in local.governance_tiers : {
name = "sync-tier-${tier}"
match = {
any = [
{
resources = {
kinds = ["Deployment", "StatefulSet", "DaemonSet"]
# Matching on the namespace label lets Kyverno resolve the tier
# from its informer cache — no API round-trip.
namespaceSelector = {
matchLabels = {
tier = tier
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
namespaces = ["kube-system", "metallb-system", "n8n"]
}
}
]
}
mutate = {
patchStrategicMerge = {
metadata = {
labels = {
# "+(tier)" is a Kyverno add-if-not-present anchor: a workload
# that already sets an explicit tier label is left untouched.
"+(tier)" = tier
}
}
}
}
}]
}
}
}
# resource "kubernetes_manifest" "enforce_pod_tier_label" {
# manifest = {
# apiVersion = "kyverno.io/v1"
# kind = "ClusterPolicy"
# metadata = {
# name = "enforce-pod-tier-label"
# annotations = {
# "policies.kyverno.io/description" = "Rejects any pod that does not have a tier label."
# }
# }
# spec = {
# # 'Enforce' blocks the creation. 'Audit' just reports it.
# validationFailureAction = "Enforce"
# background = true
# rules = [
# {
# name = "check-for-tier-label"
# match = {
# any = [
# {
# resources = {
# kinds = ["Pod"]
# }
# }
# ]
# }
# validate = {
# message = "The label 'tier' is required for all pods in this cluster."
# pattern = {
# metadata = {
# labels = {
# "tier" = "?*" # The "?*" syntax means the value must not be empty
# }
# }
# }
# }
# }
# ]
# }
# }
# }

View file

@ -0,0 +1,950 @@
# =============================================================================
# Tier-Based Resource Governance
# =============================================================================
# default (limit) = defaultRequest (request) to give Guaranteed QoS and prevent
# memory overcommit. Changed 2026-03-14 after node2 OOM crash caused by 250%
# memory overcommit (61GB limits on 24GB node).
#
# Four layers of protection against noisy neighbor issues:
# 1. PriorityClasses - critical services survive resource pressure
# 2. LimitRange defaults (Kyverno generate) - auto-inject defaults for containers without resources
# 3. ResourceQuotas (Kyverno generate) - hard ceiling on namespace resource consumption
# 4. Priority injection (Kyverno mutate) - set priorityClassName based on namespace tier label
locals {
# Tier label values, ordered from highest to lowest priority.
governance_tiers = ["0-core", "1-cluster", "2-gpu", "3-edge", "4-aux"]
# System namespaces the governance mutation policies must never touch.
excluded_namespaces = ["kube-system", "metallb-system", "kyverno", "calico-system", "calico-apiserver"]
}
# -----------------------------------------------------------------------------
# Layer 1: PriorityClasses
# -----------------------------------------------------------------------------
# Values stay well below system-cluster-critical (2,000,000,000)
# Highest user tier (1,000,000) — still below system-cluster-critical.
resource "kubernetes_priority_class" "tier_0_core" {
metadata {
name = "tier-0-core"
}
value = 1000000
global_default = false
preemption_policy = "PreemptLowerPriority"
description = "Critical infrastructure: ingress, DNS, VPN, auth, monitoring"
}
resource "kubernetes_priority_class" "tier_1_cluster" {
metadata {
name = "tier-1-cluster"
}
value = 800000
global_default = false
preemption_policy = "PreemptLowerPriority"
description = "Cluster services: Redis, metrics, security"
}
resource "kubernetes_priority_class" "tier_2_gpu" {
metadata {
name = "tier-2-gpu"
}
value = 600000
global_default = false
preemption_policy = "PreemptLowerPriority"
description = "GPU workloads: Immich, Ollama, Frigate"
}
# 1,200,000 — deliberately above tier-0-core so GPU-pinned pods can preempt
# anything else on the GPU node (injected by the Layer 5 policy below).
resource "kubernetes_priority_class" "gpu_workload" {
metadata {
name = "gpu-workload"
}
value = 1200000
global_default = false
preemption_policy = "PreemptLowerPriority"
description = "GPU-pinned workloads. Higher than all user tiers. Auto-injected by Kyverno on pods requesting nvidia.com/gpu."
}
resource "kubernetes_priority_class" "tier_3_edge" {
metadata {
name = "tier-3-edge"
}
value = 400000
global_default = false
preemption_policy = "PreemptLowerPriority"
description = "User-facing services: mail, file sync, dashboards"
}
# Lowest tier; preemption_policy = "Never" so aux pods never evict each other.
resource "kubernetes_priority_class" "tier_4_aux" {
metadata {
name = "tier-4-aux"
}
value = 200000
global_default = false
preemption_policy = "Never"
description = "Optional services: blogs, tools, experiments. Will not preempt other aux services."
}
# -----------------------------------------------------------------------------
# Layer 2: LimitRange Defaults (Kyverno Generate)
# -----------------------------------------------------------------------------
# Creates a LimitRange in each namespace based on its tier label.
# Only affects containers WITHOUT explicit resource requests/limits.
resource "kubernetes_manifest" "generate_limitrange_by_tier" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "generate-limitrange-by-tier"
annotations = {
"policies.kyverno.io/title" = "Generate LimitRange by Tier"
"policies.kyverno.io/description" = "Creates tier-appropriate LimitRange defaults in namespaces based on their tier label. Only affects containers without explicit resource specifications. Excludes namespaces with resource-governance/custom-limitrange label."
}
}
spec = {
generateExisting = true
rules = [
# Tier 0-core
{
name = "limitrange-tier-0-core"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "0-core"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-limitrange" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "256Mi"
}
defaultRequest = {
cpu = "100m"
memory = "256Mi"
}
max = {
memory = "8Gi"
}
}
]
}
}
}
},
# Tier 1-cluster
{
name = "limitrange-tier-1-cluster"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "1-cluster"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-limitrange" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "256Mi"
}
defaultRequest = {
cpu = "100m"
memory = "256Mi"
}
max = {
memory = "4Gi"
}
}
]
}
}
}
},
# Tier 2-gpu
{
name = "limitrange-tier-2-gpu"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "2-gpu"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-limitrange" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "1Gi"
}
defaultRequest = {
cpu = "200m"
memory = "1Gi"
}
max = {
memory = "16Gi"
}
}
]
}
}
}
},
# Tier 3-edge — Burstable QoS: request < limit to reduce scheduler pressure
{
name = "limitrange-tier-3-edge"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "3-edge"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-limitrange" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "192Mi"
}
defaultRequest = {
cpu = "50m"
memory = "96Mi"
}
max = {
memory = "4Gi"
}
}
]
}
}
}
},
# Tier 4-aux — Burstable QoS: request < limit to reduce scheduler pressure
{
name = "limitrange-tier-4-aux"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "4-aux"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-limitrange" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "256Mi"
}
defaultRequest = {
cpu = "50m"
memory = "64Mi"
}
max = {
memory = "4Gi"
}
}
]
}
}
}
},
# Fallback: namespaces without a tier label get aux-level defaults
# requests = limits to prevent memory overcommit (2026-03-14 node2 OOM incident)
{
name = "limitrange-no-tier-fallback"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchExpressions = [
{
key = "tier"
operator = "Exists"
}
]
}
}
},
{
resources = {
namespaces = ["kube-system", "metallb-system", "kyverno", "calico-system", "calico-apiserver"]
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "LimitRange"
name = "tier-defaults"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
limits = [
{
type = "Container"
default = {
memory = "128Mi"
}
defaultRequest = {
cpu = "50m"
memory = "128Mi"
}
max = {
memory = "2Gi"
}
}
]
}
}
}
},
]
}
}
}
# -----------------------------------------------------------------------------
# Layer 3: ResourceQuotas (Kyverno Generate)
# -----------------------------------------------------------------------------
# Creates a ResourceQuota in each namespace based on its tier label.
# Sets hard ceiling on total namespace resource consumption.
# Namespaces with label resource-governance/custom-quota=true are excluded.
#
# IMPORTANT: LimitRange (Layer 2) must exist before ResourceQuota takes effect,
# because ResourceQuota requires all pods to have resource requests set.
resource "kubernetes_manifest" "generate_resourcequota_by_tier" {
depends_on = [kubernetes_manifest.generate_limitrange_by_tier]
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "generate-resourcequota-by-tier"
annotations = {
"policies.kyverno.io/title" = "Generate ResourceQuota by Tier"
"policies.kyverno.io/description" = "Creates tier-appropriate ResourceQuota in namespaces based on their tier label. Excludes namespaces with resource-governance/custom-quota label."
}
}
spec = {
generateExisting = true
rules = [
# Tier 0-core
{
name = "quota-tier-0-core"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "0-core"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-quota" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "ResourceQuota"
name = "tier-quota"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
hard = {
"requests.cpu" = "8"
"requests.memory" = "8Gi"
"limits.memory" = "64Gi"
pods = "100"
}
}
}
}
},
# Tier 1-cluster
{
name = "quota-tier-1-cluster"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "1-cluster"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-quota" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "ResourceQuota"
name = "tier-quota"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
hard = {
"requests.cpu" = "4"
"requests.memory" = "4Gi"
"limits.memory" = "32Gi"
pods = "30"
}
}
}
}
},
# Tier 2-gpu
{
name = "quota-tier-2-gpu"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "2-gpu"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-quota" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "ResourceQuota"
name = "tier-quota"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
hard = {
"requests.cpu" = "8"
"requests.memory" = "8Gi"
"limits.memory" = "32Gi"
pods = "40"
}
}
}
}
},
# Tier 3-edge
{
name = "quota-tier-3-edge"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "3-edge"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-quota" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "ResourceQuota"
name = "tier-quota"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
hard = {
"requests.cpu" = "4"
"requests.memory" = "4Gi"
"limits.memory" = "32Gi"
pods = "30"
}
}
}
}
},
# Tier 4-aux
{
name = "quota-tier-4-aux"
match = {
any = [
{
resources = {
kinds = ["Namespace"]
selector = {
matchLabels = {
tier = "4-aux"
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
selector = {
matchLabels = {
"resource-governance/custom-quota" = "true"
}
}
}
}
]
}
generate = {
synchronize = true
apiVersion = "v1"
kind = "ResourceQuota"
name = "tier-quota"
namespace = "{{request.object.metadata.name}}"
data = {
spec = {
hard = {
"requests.cpu" = "2"
"requests.memory" = "2Gi"
"limits.memory" = "16Gi"
pods = "20"
}
}
}
}
},
]
}
}
}
# -----------------------------------------------------------------------------
# Layer 4: PriorityClassName Injection (Kyverno Mutate)
# -----------------------------------------------------------------------------
# Automatically sets priorityClassName on Pods based on their namespace's tier label.
# Skips pods that already have a priorityClassName set.
# Uses namespaceSelector instead of API calls — no round-trip to the API server.
# ClusterPolicy: inject priorityClassName on pods from the namespace's tier
# label. One rule per tier (namespaceSelector, resolved from informer cache).
resource "kubernetes_manifest" "mutate_priority_from_tier" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "inject-priority-class-from-tier"
annotations = {
"policies.kyverno.io/title" = "Inject PriorityClass from Tier"
"policies.kyverno.io/description" = "Sets priorityClassName on Pods based on the namespace tier label. Skips pods that already have a priorityClassName."
}
}
spec = {
rules = [for tier in local.governance_tiers : {
name = "inject-priority-${tier}"
match = {
any = [
{
resources = {
kinds = ["Pod"]
operations = ["CREATE"]
namespaceSelector = {
matchLabels = {
tier = tier
}
}
}
}
]
}
exclude = {
any = [
{
resources = {
namespaces = local.excluded_namespaces
}
}
]
}
# Only touch pods that have no priorityClassName of their own.
preconditions = {
all = [
{
key = "{{request.object.spec.priorityClassName || ''}}"
operator = "Equals"
value = ""
}
]
}
mutate = {
# Drop any pre-resolved priority/preemptionPolicy so the admission
# chain re-resolves them from the injected class.
# NOTE(review): RFC 6902 "remove" fails when the target path does not
# exist, and pods at CREATE often have no /spec/priority yet — confirm
# Kyverno tolerates missing paths here, otherwise these two ops can
# make the mutation fail.
patchesJson6902 = yamlencode([
{
op = "remove"
path = "/spec/priority"
},
{
op = "remove"
path = "/spec/preemptionPolicy"
},
{
op = "add"
path = "/spec/priorityClassName"
value = "tier-${tier}"
}
])
}
}]
}
}
}
# --- ndots:2 injection ---
# Kubernetes defaults to ndots:5, which causes 4 wasted NxDomain queries per
# external DNS lookup (search domain expansion). This policy injects ndots:2
# on all pods to reduce NxDomain flood while still allowing short-name service
# resolution (e.g. "redis.redis" has 1 dot, so it still expands).
resource "kubernetes_manifest" "mutate_ndots" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "inject-ndots"
annotations = {
"policies.kyverno.io/title" = "Inject ndots:2 DNS Config"
"policies.kyverno.io/description" = "Sets ndots:2 on all Pods to reduce NxDomain query flood from search domain expansion. Skips pods that already have ndots configured."
}
}
spec = {
rules = [
{
name = "inject-ndots-2"
match = {
any = [
{
resources = {
kinds = ["Pod"]
}
}
]
}
exclude = {
any = [
{
resources = {
namespaces = ["kube-system", "metallb-system", "kyverno", "calico-system", "calico-apiserver"]
}
}
]
}
preconditions = {
all = [
{
key = "{{ request.object.spec.dnsConfig.options || `[]` | [?name == 'ndots'] | length(@) }}"
operator = "Equals"
value = "0"
}
]
}
mutate = {
patchStrategicMerge = {
spec = {
dnsConfig = {
options = [
{
name = "ndots"
value = "2"
}
]
}
}
}
}
}
]
}
}
}
# -----------------------------------------------------------------------------
# Layer 5: GPU Workload Priority Override (Kyverno Mutate)
# -----------------------------------------------------------------------------
# Overrides the tier-based priorityClassName with gpu-workload for pods that
# actually request nvidia.com/gpu resources. This ensures GPU pods can preempt
# non-GPU pods on the GPU node, regardless of namespace tier.
# Runs after Layer 4 (tier injection), so it overrides the tier-based priority.
# ClusterPolicy: pods that request/limit nvidia.com/gpu get the gpu-workload
# priority class (1,200,000), overriding the tier-based injection above.
resource "kubernetes_manifest" "mutate_gpu_priority" {
manifest = {
apiVersion = "kyverno.io/v1"
kind = "ClusterPolicy"
metadata = {
name = "inject-gpu-workload-priority"
annotations = {
"policies.kyverno.io/title" = "Inject GPU Workload Priority"
"policies.kyverno.io/description" = "Overrides priorityClassName to gpu-workload for pods requesting nvidia.com/gpu resources. Runs after tier-based injection."
}
}
spec = {
rules = [
{
name = "gpu-priority-override"
match = {
any = [
{
resources = {
kinds = ["Pod"]
operations = ["CREATE"]
}
}
]
}
exclude = {
any = [
{
resources = {
namespaces = local.excluded_namespaces
}
}
]
}
# Fire when ANY container requests or limits nvidia.com/gpu.
preconditions = {
any = [
{
key = "{{ request.object.spec.containers[].resources.requests.\"nvidia.com/gpu\" || '' }}"
operator = "NotEquals"
value = ""
},
{
key = "{{ request.object.spec.containers[].resources.limits.\"nvidia.com/gpu\" || '' }}"
operator = "NotEquals"
value = ""
}
]
}
mutate = {
# NOTE(review): RFC 6902 "replace" requires the target path to exist.
# If the tier policy did not run (namespace without a tier label, or
# an excluded tier), /spec/priorityClassName may be absent and these
# ops can fail — confirm Kyverno's behavior or use "add" instead.
patchesJson6902 = yamlencode([
{
op = "replace"
path = "/spec/priorityClassName"
value = "gpu-workload"
},
{
op = "replace"
path = "/spec/priority"
value = 1200000
},
{
op = "replace"
path = "/spec/preemptionPolicy"
value = "PreemptLowerPriority"
}
])
}
}
]
}
}
}

View file

@ -0,0 +1,294 @@
# =============================================================================
# Pod Security Policies (Audit Mode)
# =============================================================================
# Kyverno validate policies for pod security standards.
# All policies start in Audit mode - violations are logged but not blocked.
# Kyverno validate policy: flag privileged containers cluster-wide.
# validationFailureAction = "Audit" means violations are only reported
# (PolicyReports), never blocked; background = true also scans existing pods.
resource "kubernetes_manifest" "policy_deny_privileged" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "deny-privileged-containers"
      annotations = {
        "policies.kyverno.io/title"       = "Deny Privileged Containers"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "Privileged containers have full host access. Deny unless explicitly exempted."
      }
    }
    spec = {
      validationFailureAction = "Audit"
      background              = true
      rules = [{
        name = "deny-privileged"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        # Namespaces that legitimately run privileged workloads are exempt.
        exclude = {
          any = [{
            resources = {
              namespaces = ["frigate", "nvidia", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Privileged containers are not allowed. Use specific capabilities instead."
          # "=(...)" anchors make the check conditional: it only applies when
          # the field is present, so containers without a securityContext pass.
          # NOTE(review): ephemeralContainers are not covered — confirm intended.
          pattern = {
            spec = {
              containers = [{
                "=(securityContext)" = {
                  "=(privileged)" = false
                }
              }]
              "=(initContainers)" = [{
                "=(securityContext)" = {
                  "=(privileged)" = false
                }
              }]
            }
          }
        }
      }]
    }
  }
  depends_on = [helm_release.kyverno]
}
# Kyverno validate policy: flag pods sharing host namespaces
# (hostNetwork/hostPID/hostIPC). Audit mode only — report, don't block.
resource "kubernetes_manifest" "policy_deny_host_namespaces" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "deny-host-namespaces"
      annotations = {
        "policies.kyverno.io/title"       = "Deny Host Namespaces"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "Sharing host namespaces enables container escapes. Deny hostNetwork, hostPID, hostIPC."
      }
    }
    spec = {
      validationFailureAction = "Audit"
      background              = true
      rules = [{
        name = "deny-host-namespaces"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        # frigate/monitoring are exempt (they use host-level access).
        exclude = {
          any = [{
            resources = {
              namespaces = ["frigate", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Host namespaces (hostNetwork, hostPID, hostIPC) are not allowed."
          # Conditional anchors: each field only has to equal false when set;
          # absent fields (the Kubernetes default) pass.
          pattern = {
            spec = {
              "=(hostNetwork)" = false
              "=(hostPID)"     = false
              "=(hostIPC)"     = false
            }
          }
        }
      }]
    }
  }
  depends_on = [helm_release.kyverno]
}
# Kyverno validate policy: flag containers adding the SYS_ADMIN capability.
# Uses a deny/conditions rule (not a pattern) because it inspects a list of
# added capabilities across all containers. Audit mode only.
resource "kubernetes_manifest" "policy_restrict_capabilities" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "restrict-sys-admin"
      annotations = {
        "policies.kyverno.io/title"       = "Restrict SYS_ADMIN Capability"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "SYS_ADMIN is nearly equivalent to root. Restrict to explicitly exempted namespaces."
      }
    }
    spec = {
      validationFailureAction = "Audit"
      background              = true
      rules = [{
        name = "restrict-sys-admin"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        exclude = {
          any = [{
            resources = {
              namespaces = ["nvidia", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Adding SYS_ADMIN capability is not allowed."
          deny = {
            conditions = {
              # Flattens capabilities.add across containers; || `[]` keeps the
              # JMESPath expression valid when no container adds capabilities.
              # NOTE(review): initContainers are not inspected — confirm intended.
              any = [{
                key      = "{{ request.object.spec.containers[].securityContext.capabilities.add[] || `[]` }}"
                operator = "AnyIn"
                value    = ["SYS_ADMIN"]
              }]
            }
          }
        }
      }]
    }
  }
  depends_on = [helm_release.kyverno]
}
# =============================================================================
# Image Pull Policy Governance
# =============================================================================
# Mutate imagePullPolicy to IfNotPresent for all containers with pinned tags
# (non-:latest). This prevents pods from getting stuck in ImagePullBackOff
# when the pull-through cache at 10.0.20.10 has transient failures.
# For :latest or untagged images, set to Always so stale images don't persist.
# Kyverno mutate policy: normalize imagePullPolicy per container.
# Pinned tags -> IfNotPresent (survive transient pull-through-cache outages);
# :latest or untagged -> Always (avoid serving stale images).
# background = false because mutation only makes sense at admission time.
resource "kubernetes_manifest" "policy_set_image_pull_policy" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "set-image-pull-policy"
      annotations = {
        "policies.kyverno.io/title"       = "Set Image Pull Policy"
        "policies.kyverno.io/category"    = "Best Practices"
        "policies.kyverno.io/severity"    = "medium"
        "policies.kyverno.io/description" = "Set imagePullPolicy to IfNotPresent for pinned tags and Always for :latest to prevent ImagePullBackOff from transient cache failures."
      }
    }
    spec = {
      background = false
      rules = [
        {
          name = "set-ifnotpresent-for-pinned-tags"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            # foreach iterates over containers; per-element preconditions pick
            # only pinned images (not :latest and containing a ':').
            # NOTE(review): an image like "registry:5000/app" (port colon, no
            # tag) contains ':' and is treated as pinned — confirm acceptable.
            # NOTE(review): initContainers are not mutated — confirm intended.
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  key      = "{{ ends_with(element.image, ':latest') || !contains(element.image, ':') }}"
                  operator = "Equals"
                  value    = false
                }]
              }
              # Strategic-merge keyed by container name patches just this element.
              patchStrategicMerge = {
                spec = {
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "IfNotPresent"
                  }]
                }
              }
            }]
          }
        },
        {
          # Mirror rule: the same predicate inverted selects :latest/untagged.
          name = "set-always-for-latest"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  key      = "{{ ends_with(element.image, ':latest') || !contains(element.image, ':') }}"
                  operator = "Equals"
                  value    = true
                }]
              }
              patchStrategicMerge = {
                spec = {
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "Always"
                  }]
                }
              }
            }]
          }
        }
      ]
    }
  }
  depends_on = [helm_release.kyverno]
}
# Kyverno validate policy: audit images not pulled from the trusted registry
# list. Audit mode only — violations are reported, never blocked.
resource "kubernetes_manifest" "policy_require_trusted_registries" {
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "require-trusted-registries"
      annotations = {
        "policies.kyverno.io/title"       = "Require Trusted Image Registries"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "medium"
        "policies.kyverno.io/description" = "Images must come from trusted registries to prevent supply chain attacks."
      }
    }
    spec = {
      validationFailureAction = "Audit"
      background              = true
      rules = [{
        name = "validate-registries"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        validate = {
          message = "Images must be from trusted registries (docker.io, ghcr.io, quay.io, registry.k8s.io, or local cache)."
          # NOTE(review): the trailing "*/*" alternative matches ANY image that
          # contains a slash (e.g. "evil.io/x" or Docker Hub shorthand
          # "user/image"), so in practice only bare single-segment images like
          # "nginx" are flagged. Confirm whether this is a deliberate
          # transitional allowance or should be tightened.
          pattern = {
            spec = {
              containers = [{
                image = "docker.io/* | ghcr.io/* | quay.io/* | registry.k8s.io/* | 10.0.20.10* | */*"
              }]
            }
          }
        }
      }]
    }
  }
  depends_on = [helm_release.kyverno]
}

1
stacks/kyverno/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Inherit the shared root Terragrunt configuration (remote state, providers).
include "root" {
  path = find_in_parent_folders()
}
# Ordering-only dependency on the infra stack: skip_outputs = true means no
# outputs are read — this just sequences applies after ../infra.
dependency "infra" {
  config_path = "../infra"
  skip_outputs = true
}

10
stacks/kyverno/tiers.tf Normal file
View file

@ -0,0 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
# NOTE: generated file — do not edit by hand; changes will be overwritten.
locals {
  # Tier label values shared across stacks; the numeric prefix appears to
  # encode ordering (0 = core ... 4 = aux) — confirm against the generator.
  tiers = {
    core    = "0-core"
    cluster = "1-cluster"
    gpu     = "2-gpu"
    edge    = "3-edge"
    aux     = "4-aux"
  }
}

32
stacks/mailserver/main.tf Normal file
View file

@ -0,0 +1,32 @@
# =============================================================================
# Mailserver Stack — docker-mailserver
# =============================================================================
variable "tls_secret_name" {
  type = string
}

variable "nfs_server" {
  type = string
}

variable "mysql_host" {
  type = string
}

# All mailserver credentials live under the shared "platform" KV v2 path.
data "vault_kv_secret_v2" "secrets" {
  mount = "secret"
  name  = "platform"
}

locals {
  # These Vault keys hold JSON-encoded structures, decoded once here.
  mailserver_accounts     = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_accounts"])
  mailserver_aliases      = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_aliases"])
  mailserver_opendkim_key = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_opendkim_key"])
  mailserver_sasl_passwd  = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_sasl_passwd"])
}

# docker-mailserver deployment plus its Roundcube webmail frontend.
module "mailserver" {
  source = "./modules/mailserver"

  tier            = local.tiers.edge
  tls_secret_name = var.tls_secret_name
  nfs_server      = var.nfs_server
  mysql_host      = var.mysql_host

  mailserver_accounts     = local.mailserver_accounts
  postfix_account_aliases = local.mailserver_aliases
  opendkim_key            = local.mailserver_opendkim_key
  sasl_passwd             = local.mailserver_sasl_passwd
  roundcube_db_password   = data.vault_kv_secret_v2.secrets.data["mailserver_roundcubemail_db_password"]
}

View file

@ -0,0 +1,5 @@
firmly-gerardo-generated@viktorbarzin.me me@viktorbarzin.me
closely-keith-generated@viktorbarzin.me vbarzin@gmail.com
literally-paolo-generated@viktorbarzin.me viktorbarzin@fb.com
hastily-stefanie-generated@viktorbarzin.me elliestamenova@gmail.com
vaultwarden@viktorbarzin.me me@viktorbarzin.me

View file

@ -0,0 +1,510 @@
# Inputs for the docker-mailserver module. Several are intentionally untyped
# because callers pass structured values (maps) as well as strings.
variable "tls_secret_name" {
}

variable "tier" {
  type = string
}

variable "mailserver_accounts" {
}

variable "postfix_account_aliases" {
}

variable "opendkim_key" {
}

# Credentials for the upstream relayhost (sendgrid/mailgun).
variable "sasl_passwd" {
}

variable "nfs_server" {
  type = string
}
# Dedicated namespace; the tier label feeds the cluster-wide priority/tiering
# policies.
resource "kubernetes_namespace" "mailserver" {
  metadata {
    name = "mailserver"
    labels = {
      tier = var.tier
    }
    # connecting via localhost does not seem to work?
    # labels = {
    #   "istio-injection" : "enabled"
    # }
  }
}
# Copies the shared TLS certificate secret into this namespace so the
# mailserver pods can mount it.
module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.mailserver.metadata[0].name
  tls_secret_name = var.tls_secret_name
}
# Environment variables for docker-mailserver, injected via env_from.
# The reloader annotation lets Stakater Reloader restart the deployment when
# this map changes.
resource "kubernetes_config_map" "mailserver_env_config" {
  metadata {
    name      = "mailserver.env.config"
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      app = "mailserver"
    }
    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }
  data = {
    DMS_DEBUG = "0"
    # LOG_LEVEL = "debug"
    # Spam/AV stack: rspamd only — clamav, amavis, spamassassin, opendkim and
    # opendmarc are all disabled (rspamd handles DKIM signing, see
    # dkim_signing.conf in mailserver.config).
    ENABLE_CLAMAV       = "0"
    ENABLE_AMAVIS       = "0"
    ENABLE_FAIL2BAN     = "0"
    ENABLE_FETCHMAIL    = "0"
    ENABLE_POSTGREY     = "0"
    ENABLE_SASLAUTHD    = "0"
    ENABLE_SPAMASSASSIN = "0"
    ENABLE_RSPAMD       = "1"
    ENABLE_OPENDKIM     = "0"
    ENABLE_OPENDMARC    = "0"
    RSPAMD_LEARN        = "1"
    ENABLE_SRS          = "1"
    FETCHMAIL_POLL      = "120"
    ONE_DIR             = "1"
    OVERRIDE_HOSTNAME   = "mail.viktorbarzin.me"
    POSTFIX_MESSAGE_SIZE_LIMIT             = 1024 * 1024 * 200 # 200 MB
    POSTFIX_REJECT_UNKNOWN_CLIENT_HOSTNAME = "1"
    # TLS_LEVEL = "intermediate"
    # DEFAULT_RELAY_HOST = "[smtp.sendgrid.net]:587"
    # Outbound relay; must stay in sync with relayhost in postfix-main.cf.
    DEFAULT_RELAY_HOST = "[smtp.eu.mailgun.org]:587"
    SPOOF_PROTECTION   = "1"
    # Certificates are provided manually from the mounted TLS secret.
    SSL_TYPE      = "manual"
    SSL_CERT_PATH = "/tmp/ssl/tls.crt"
    SSL_KEY_PATH  = "/tmp/ssl/tls.key"
  }
}
# Configuration files for docker-mailserver, mounted one-by-one into the
# container via sub_path volume mounts.
resource "kubernetes_config_map" "mailserver_config" {
  metadata {
    name      = "mailserver.config"
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      app = "mailserver"
    }
    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }
  data = {
    # Actual mail settings
    # Account lines are "user|bcrypt-hash". bcrypt() generates a fresh salt on
    # every plan, hence the lifecycle/ignore_changes below.
    "postfix-accounts.cf" = join("\n", [for user, pass in var.mailserver_accounts : "${user}|${bcrypt(pass, 6)}"])
    "postfix-main.cf"     = var.postfix_cf
    # Vault-provided aliases concatenated with the static extra/aliases.txt.
    "postfix-virtual.cf" = format("%s%s", var.postfix_account_aliases, file("${path.module}/extra/aliases.txt"))
    # Legacy OpenDKIM tables (OpenDKIM itself is disabled; rspamd signs).
    KeyTable     = "mail._domainkey.viktorbarzin.me viktorbarzin.me:mail:/etc/opendkim/keys/viktorbarzin.me-mail.key\n"
    SigningTable = "*@viktorbarzin.me mail._domainkey.viktorbarzin.me\n"
    TrustedHosts = "127.0.0.1\nlocalhost\n"
    "sasl_passwd" = var.sasl_passwd
    # Rspamd DKIM signing configuration
    "dkim_signing.conf" = <<-EOF
    enabled = true;
    sign_authenticated = true;
    sign_local = true;
    use_domain = "header";
    use_redis = false;
    use_esld = true;
    selector = "mail";
    path = "/tmp/docker-mailserver/rspamd/dkim/viktorbarzin.me/mail.private";
    domain {
        viktorbarzin.me {
            path = "/tmp/docker-mailserver/rspamd/dkim/viktorbarzin.me/mail.private";
            selector = "mail";
        }
    }
    EOF
    fail2ban_conf = <<-EOF
    [DEFAULT]
    #logtarget = /var/log/fail2ban.log
    logtarget = SYSOUT
    EOF
  }
  # Password hashes are different each time and avoid changing secret constantly.
  # Either 1.Create consistent hashes or 2.Find a way to ignore_changes on per password
  lifecycle {
    ignore_changes = [data["postfix-accounts.cf"]]
  }
}
# resource "kubernetes_config_map" "user_patches" {
# metadata {
# name = "user-patches"
# namespace = kubernetes_namespace.mailserver.metadata[0].name
# labels = {
# "app" = "mailserver"
# }
# }
# data = {
# user_patches = <<EOF
# #!/bin/bash
# cp -f /tmp/dovecot.key /etc/dovecot/ssl/dovecot.key
# cp -f /tmp/dovecot.crt /etc/dovecot/ssl/dovecot.pem
# EOF
# }
# }
# DKIM private key for viktorbarzin.me, mounted into the mailserver pod both
# for the legacy OpenDKIM layout and as rspamd's signing key.
resource "kubernetes_secret" "opendkim_key" {
  metadata {
    name      = "mailserver.opendkim.key"
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      "app" = "mailserver"
    }
  }
  type = "Opaque"
  data = {
    "viktorbarzin.me-mail.key" = var.opendkim_key
  }
}
# NFS-backed PV/PVC for mail data, state and logs (sub_paths on one share).
module "nfs_data" {
  source    = "../../../../modules/kubernetes/nfs_volume"
  name      = "mailserver-data"
  namespace = kubernetes_namespace.mailserver.metadata[0].name

  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/mailserver"
}
# docker-mailserver deployment: single replica (mail state is not safely
# shareable), Recreate strategy so old/new pods never run concurrently, plus a
# dovecot metrics-exporter sidecar sharing the dovecot stats socket.
resource "kubernetes_deployment" "mailserver" {
  metadata {
    name      = "mailserver"
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      "app" = "mailserver"
      tier  = var.tier
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = "1"
    strategy {
      type = "Recreate"
    }
    selector {
      match_labels = {
        "app" = "mailserver"
      }
    }
    template {
      metadata {
        annotations = {
          # "diun.enable" = "true"
        }
        labels = {
          "app"  = "mailserver"
          "role" = "mail"
        }
      }
      spec {
        container {
          name              = "docker-mailserver"
          image             = "docker.io/mailserver/docker-mailserver:15.0.0"
          image_pull_policy = "IfNotPresent"
          # NET_ADMIN is the only extra capability granted.
          security_context {
            capabilities {
              add = ["NET_ADMIN"]
            }
          }
          # Compile the mounted sasl_passwd into postfix's hash db after start.
          lifecycle {
            post_start {
              exec {
                command = [
                  "postmap",
                  "/etc/postfix/sasl/passwd"
                  # "/bin/sh",
                  # "-c",
                  # "cp -f /tmp/user-patches.sh /tmp/docker-mailserver/user-patches.sh && chown root:root /var/log/mail && chmod 755 /var/log/mail",
                ]
              }
            }
          }
          # TLS cert/key from the namespace TLS secret.
          volume_mount {
            name       = "config-tls"
            mount_path = "/tmp/ssl/tls.key"
            sub_path   = "tls.key"
            read_only  = true
          }
          volume_mount {
            name       = "config-tls"
            mount_path = "/tmp/ssl/tls.crt"
            sub_path   = "tls.crt"
            read_only  = true
          }
          # Individual config files projected from the mailserver.config map.
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/postfix-accounts.cf"
            sub_path   = "postfix-accounts.cf"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/postfix-main.cf"
            sub_path   = "postfix-main.cf"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/postfix-virtual.cf"
            sub_path   = "postfix-virtual.cf"
            read_only  = true
          }
          # NOTE(review): fetchmail.cf has no matching key in
          # mailserver.config above — confirm this mount is still needed.
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/fetchmail.cf"
            sub_path   = "fetchmail.cf"
            read_only  = true
          }
          # volume_mount {
          #   name       = "config"
          #   mount_path = "/tmp/docker-mailserver/dovecot.cf"
          #   sub_path   = "dovecot.cf"
          #   read_only  = true
          # }
          # volume_mount {
          #   name       = "user-patches"
          #   mount_path = "/tmp/user-patches.sh"
          #   sub_path   = "user-patches.sh"
          #   read_only  = true
          # }
          # Legacy OpenDKIM layout (OpenDKIM disabled in env config).
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/opendkim/SigningTable"
            sub_path   = "SigningTable"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/opendkim/KeyTable"
            sub_path   = "KeyTable"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/opendkim/TrustedHosts"
            sub_path   = "TrustedHosts"
            read_only  = true
          }
          volume_mount {
            name       = "opendkim-key"
            mount_path = "/tmp/docker-mailserver/opendkim/keys"
            read_only  = true
          }
          # Same private key reused as rspamd's DKIM signing key.
          volume_mount {
            name       = "opendkim-key"
            mount_path = "/tmp/docker-mailserver/rspamd/dkim/viktorbarzin.me/mail.private"
            sub_path   = "viktorbarzin.me-mail.key"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/tmp/docker-mailserver/rspamd/override.d/dkim_signing.conf"
            sub_path   = "dkim_signing.conf"
            read_only  = true
          }
          # Mail data/state/logs on the NFS-backed PVC.
          volume_mount {
            name       = "data"
            mount_path = "/var/mail"
            sub_path   = "data"
          }
          volume_mount {
            name       = "data"
            mount_path = "/var/mail-state"
            sub_path   = "state"
          }
          volume_mount {
            name       = "data"
            mount_path = "/var/log/mail"
            sub_path   = "log"
          }
          # Shared with the dovecot-exporter sidecar (stats socket).
          volume_mount {
            name       = "var-run-dovecot"
            mount_path = "/var/run/dovecot"
          }
          volume_mount {
            name       = "config"
            mount_path = "/etc/postfix/sasl/passwd"
            sub_path   = "sasl_passwd"
            read_only  = true
          }
          volume_mount {
            name       = "config"
            mount_path = "/etc/fail2ban/fail2ban.local"
            sub_path   = "fail2ban_conf"
            read_only  = true
          }
          port {
            name           = "smtp"
            container_port = 25
            protocol       = "TCP"
          }
          port {
            name           = "smtp-secure"
            container_port = 465
            protocol       = "TCP"
          }
          port {
            name           = "smtp-auth"
            container_port = 587
            protocol       = "TCP"
          }
          port {
            name           = "imap-secure"
            container_port = 993
            protocol       = "TCP"
          }
          env_from {
            config_map_ref {
              name = "mailserver.env.config"
            }
          }
          resources {
            requests = {
              cpu    = "25m"
              memory = "512Mi"
            }
            limits {
              memory = "512Mi"
            }
          }
        }
        # Sidecar exporting dovecot metrics (port 9166) via the shared socket.
        container {
          name  = "dovecot-exporter"
          image = "viktorbarzin/dovecot_exporter:latest"
          command = [
            "/dovecot_exporter/exporter",
            "--dovecot.socket-path=/var/run/dovecot/stats-reader"
          ]
          image_pull_policy = "IfNotPresent"
          port {
            name           = "dovecotexporter"
            container_port = 9166
            protocol       = "TCP"
          }
          volume_mount {
            name       = "var-run-dovecot"
            mount_path = "/var/run/dovecot"
          }
          resources {
            requests = {
              cpu    = "10m"
              memory = "32Mi"
            }
            limits {
              memory = "32Mi"
            }
          }
        }
        volume {
          name = "config"
          config_map {
            name = "mailserver.config"
          }
        }
        volume {
          name = "config-tls"
          secret {
            secret_name = var.tls_secret_name
          }
        }
        volume {
          name = "opendkim-key"
          secret {
            secret_name = "mailserver.opendkim.key"
          }
        }
        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = module.nfs_data.claim_name
          }
          # iscsi {
          #   target_portal = "iscsi.viktorbarzin.lan:3260"
          #   iqn           = "iqn.2020-12.lan.viktorbarzin:storage:mailserver"
          #   lun           = 0
          #   fs_type       = "ext4"
          # }
        }
        # volume {
        #   name = "user-patches"
        #   config_map {
        #     name = "user-patches"
        #   }
        # }
        volume {
          name = "var-run-dovecot"
          empty_dir {}
        }
        # Lower ndots so external hostnames resolve without cluster search
        # suffix expansion.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# LoadBalancer (MetalLB) exposing SMTP/IMAP. allow-shared-ip lets other
# services share the same external IP. externalTrafficPolicy Local preserves
# client source IPs, which matters for SMTP reputation/filtering.
resource "kubernetes_service" "mailserver" {
  metadata {
    name      = "mailserver"
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      app = "mailserver"
    }
    annotations = {
      "metallb.universe.tf/allow-shared-ip" = "shared"
    }
  }
  spec {
    type = "LoadBalancer"
    # external_traffic_policy = "Cluster"
    external_traffic_policy = "Local"
    selector = {
      app = "mailserver"
    }
    port {
      name        = "smtp"
      protocol    = "TCP"
      port        = 25
      target_port = "smtp"
    }
    port {
      name        = "smtp-secure"
      protocol    = "TCP"
      port        = 465
      target_port = "smtp-secure"
    }
    port {
      name        = "smtp-auth"
      protocol    = "TCP"
      port        = 587
      target_port = "smtp-auth"
    }
    port {
      name        = "imap-secure"
      protocol    = "TCP"
      port        = 993
      target_port = "imap-secure"
    }
    # NOTE(review): this selects app=mailserver pods, which expose no port 80,
    # and Roundcube already has its own service — this port looks vestigial;
    # confirm before removing.
    port {
      name     = "roundcube"
      protocol = "TCP"
      port     = 80
    }
  }
}

View file

@ -0,0 +1,237 @@
variable "mysql_host" {
  type = string
}

# MySQL password for the "roundcubemail" DB user; marked sensitive so
# Terraform redacts it from plan/apply output.
variable "roundcube_db_password" {
  type      = string
  sensitive = true
}
# NFS-backed PV/PVC for Roundcube's webroot (plugins installed at runtime
# persist here).
module "nfs_roundcube_html" {
  source     = "../../../../modules/kubernetes/nfs_volume"
  name       = "roundcubemail-html"
  namespace  = kubernetes_namespace.mailserver.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/roundcubemail/html"
}
# NFS-backed PV/PVC for the enigma (PGP) plugin's key storage.
module "nfs_roundcube_enigma" {
  source     = "../../../../modules/kubernetes/nfs_volume"
  name       = "roundcubemail-enigma"
  namespace  = kubernetes_namespace.mailserver.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/mnt/main/roundcubemail/enigma"
}
# If you want to override settings mount this in /var/roundcube/config
# more info in https://github.com/roundcube/roundcubemail-docker?tab=readme-ov-file
# resource "kubernetes_config_map" "roundcubemail_config" {
# metadata {
# name = "roundcubemail.config"
# namespace = "mailserver"
# labels = {
# app = "mailserver"
# }
# annotations = {
# "reloader.stakater.com/match" = "true"
# }
# }
# data = {
# # if you want to override things see https://github.com/roundcube/roundcubemail/blob/master/config/defaults.inc.php
# "imap.php" = <<-EOF
# <?php
# $config['imap_host'] = 'ssl://mail.viktorbarzin.me:993';
# ?>
# EOF
# }
# }
# Roundcube webmail frontend. Talks to the mailserver over the public
# mail.viktorbarzin.me endpoints (the TLS certificate must be valid) and
# stores its data in MySQL.
resource "kubernetes_deployment" "roundcubemail" {
  metadata {
    name = "roundcubemail"
    # Reference the namespace resource (not the literal "mailserver") so
    # Terraform orders this after namespace creation, consistent with the
    # mailserver deployment and the NFS modules in this module.
    namespace = kubernetes_namespace.mailserver.metadata[0].name
    labels = {
      "app" = "roundcubemail"
      tier  = var.tier
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = "1"
    strategy {
      type = "RollingUpdate"
    }
    selector {
      match_labels = {
        "app" = "roundcubemail"
      }
    }
    template {
      metadata {
        labels = {
          "app" = "roundcubemail"
        }
      }
      spec {
        container {
          name  = "roundcube"
          image = "roundcube/roundcubemail:1.6.13-apache"
          # Uncomment me to mount additional settings
          # volume_mount {
          #   name       = "imap-config"
          #   mount_path = "/var/roundcube/config/imap.php"
          #   sub_path   = "imap.php"
          # }
          # IMAP backend.
          env {
            name  = "ROUNDCUBEMAIL_DEFAULT_HOST"
            value = "ssl://mail.viktorbarzin.me" # tls cert must be valid!
          }
          env {
            name  = "ROUNDCUBEMAIL_DEFAULT_PORT"
            value = "993"
          }
          # SMTP submission.
          env {
            name  = "ROUNDCUBEMAIL_SMTP_SERVER"
            value = "tls://mail.viktorbarzin.me" # tls cert must be valid!
          }
          env {
            name = "ROUNDCUBEMAIL_SMTP_PORT"
            # String for consistency with the other env values (the k8s API
            # requires env values to be strings anyway).
            value = "587"
          }
          # DB Settings
          env {
            name  = "ROUNDCUBEMAIL_DB_TYPE"
            value = "mysql"
          }
          env {
            name  = "ROUNDCUBEMAIL_DB_HOST"
            value = var.mysql_host
          }
          env {
            name  = "ROUNDCUBEMAIL_DB_USER"
            value = "roundcubemail"
          }
          env {
            name  = "ROUNDCUBEMAIL_DB_PASSWORD"
            value = var.roundcube_db_password
          }
          # Plugins: composer coordinates to install, then the enabled list.
          env {
            name  = "ROUNDCUBEMAIL_COMPOSER_PLUGINS"
            value = "mmvi/twofactor_webauthn,texxasrulez/persistent_login,dsoares/rcguard"
          }
          env {
            name  = "ROUNDCUBEMAIL_PLUGINS"
            value = "attachment_reminder,database_attachments,enigma,twofactor_webauthn,persistent_login,rcguard"
          }
          env {
            name  = "ROUNDCUBEMAIL_SMTP_DEBUG"
            value = "false"
          }
          env {
            name  = "ROUNDCUBEMAIL_DEBUG_LEVEL"
            value = "1"
          }
          env {
            name = "ROUNDCUBEMAIL_LOG_DRIVER"
            # value = "file"
            value = "syslog"
          }
          port {
            name           = "web"
            container_port = 80
            protocol       = "TCP"
          }
          volume_mount {
            name       = "html"
            mount_path = "/var/www/html"
          }
          volume_mount {
            name       = "enigma"
            mount_path = "/var/roundcube/enigma"
          }
          resources {
            requests = {
              cpu    = "25m"
              memory = "192Mi"
            }
            limits {
              memory = "192Mi"
            }
          }
        }
        # volume {
        #   name = "imap-config"
        #   config_map {
        #     name = "roundcubemail.config"
        #   }
        # }
        volume {
          name = "html"
          persistent_volume_claim {
            claim_name = module.nfs_roundcube_html.claim_name
          }
        }
        volume {
          name = "enigma"
          persistent_volume_claim {
            claim_name = module.nfs_roundcube_enigma.claim_name
          }
        }
        # Lower ndots so the public mail hostname resolves without cluster
        # search suffix expansion.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# ClusterIP service fronting Roundcube on port 80 (target_port defaults to
# the port value, matching the container's "web" port). Exposed externally
# through the ingress module below in this file.
resource "kubernetes_service" "roundcubemail" {
  metadata {
    name      = "roundcubemail"
    namespace = "mailserver"
    labels = {
      app = "roundcubemail"
    }
  }
  spec {
    selector = {
      app = "roundcubemail"
    }
    port {
      name     = "roundcube"
      protocol = "TCP"
      port     = 80
    }
  }
}
# Ingress for mail.<domain> pointing at the Roundcube service, with homepage
# dashboard annotations and Rybbit analytics injection.
module "ingress" {
  source          = "../../../../modules/kubernetes/ingress_factory"
  namespace       = "mailserver"
  name            = "mail"
  service_name    = "roundcubemail"
  tls_secret_name = var.tls_secret_name
  rybbit_site_id  = "082f164faa7d"
  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "Roundcube Mail"
    "gethomepage.dev/description"  = "Webmail client"
    "gethomepage.dev/icon"         = "roundcube.png"
    "gethomepage.dev/group"        = "Other"
    "gethomepage.dev/pod-selector" = ""
  }
}

View file

@ -0,0 +1,163 @@
# this is appended and merged to the main postfix.cf
# see defaults - https://github.com/docker-mailserver/docker-mailserver/blob/master/target/postfix/main.cf
# this is appended and merged to the main postfix.cf
# see defaults - https://github.com/docker-mailserver/docker-mailserver/blob/master/target/postfix/main.cf
# Covers: authenticated relay via mailgun, TLS cert/key from the mounted
# secret, and basic per-client rate limiting. Must stay in sync with
# DEFAULT_RELAY_HOST in the env config map.
variable "postfix_cf" {
  default = <<EOT
#relayhost = [smtp.sendgrid.net]:587
relayhost = [smtp.eu.mailgun.org]:587
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl/passwd
smtp_sasl_security_options = noanonymous
smtp_sasl_tls_security_options = noanonymous
smtp_tls_security_level = encrypt
smtpd_tls_cert_file=/tmp/ssl/tls.crt
smtpd_tls_key_file=/tmp/ssl/tls.key
smtpd_use_tls=yes
header_size_limit = 4096000
# Debug mail tls
smtpd_tls_loglevel = 1
#smtpd_tls_ciphers = TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:DHE-RSA-AES128-SHA256:DHE-RSA-AES256-SHA256:!aNULL:!SEED:!CAMELLIA:!RSA+AES:!SHA1
#tls_medium_cipherlist = ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:DHE-RSA-AES128-SHA256:DHE-RSA-AES256-SHA256:!aNULL:!SEED:!CAMELLIA:!RSA+AES:!SHA1
# Rate limiting (brute-force protection)
smtpd_client_connection_rate_limit = 10
smtpd_client_message_rate_limit = 30
anvil_rate_time_unit = 60s
EOT
}
# Reference-only copy of a full postfix main.cf kept for documentation; not
# wired into any resource. Do not use directly (as the name says).
variable "postfix_cf_reference_DO_NOT_USE" {
  default = <<EOT
# See /usr/share/postfix/main.cf.dist for a commented, more complete version
smtpd_banner = $myhostname ESMTP $mail_name (Debian)
biff = no
append_dot_mydomain = no
readme_directory = no
# Basic configuration
# myhostname =
alias_maps = hash:/etc/aliases
alias_database = hash:/etc/aliases
mydestination = $myhostname, localhost.$mydomain, localhost
mynetworks = 127.0.0.0/8 [::1]/128 [fe80::]/64
mailbox_size_limit = 0
recipient_delimiter = +
inet_interfaces = all
inet_protocols = ipv4
# TLS parameters
smtpd_tls_cert_file=/tmp/ssl/tls.crt
smtpd_tls_key_file=/tmp/ssl/tls.key
#smtpd_tls_CAfile=
#smtp_tls_CAfile=
smtpd_tls_security_level = may
smtpd_use_tls=yes
smtpd_tls_loglevel = 1
smtp_tls_loglevel = 1
tls_ssl_options = NO_COMPRESSION
tls_high_cipherlist = ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS
tls_preempt_cipherlist = yes
smtpd_tls_protocols = !SSLv2,!SSLv3
smtp_tls_protocols = !SSLv2,!SSLv3
smtpd_tls_mandatory_ciphers = high
smtpd_tls_mandatory_protocols = !SSLv2,!SSLv3
smtpd_tls_exclude_ciphers = aNULL, LOW, EXP, MEDIUM, ADH, AECDH, MD5, DSS, ECDSA, CAMELLIA128, 3DES, CAMELLIA256, RSA+AES, eNULL
smtpd_tls_dh1024_param_file = /etc/postfix/dhparams.pem
smtpd_tls_CApath = /etc/ssl/certs
smtp_tls_CApath = /etc/ssl/certs
# Settings to prevent SPAM early
smtpd_helo_required = yes
smtpd_delay_reject = yes
smtpd_helo_restrictions = permit_mynetworks, reject_invalid_helo_hostname, permit
#smtpd_relay_restrictions = permit_mynetworks permit_sasl_authenticated defer_unauth_destination
#smtpd_relay_restrictions = reject_sender_login_mismatch permit_sasl_authenticated permit_mynetworks defer_unauth_destination
smtpd_relay_restrictions = reject_sender_login_mismatch permit_sasl_authenticated permit_mynetworks defer_unauth_destination
smtpd_recipient_restrictions = permit_sasl_authenticated, reject_unauth_destination, reject_unauth_pipelining, reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_recipient_domain, reject_rbl_client bl.spamcop.net, permit_mynetworks
smtpd_client_restrictions = permit_mynetworks, permit_sasl_authenticated, reject_unauth_destination, reject_unauth_pipelining
#smtpd_sender_restrictions = reject_sender_login_mismatch, permit_sasl_authenticated, permit_mynetworks, reject_unknown_sender_domain
smtpd_sender_restrictions = reject_sender_login_mismatch, reject_authenticated_sender_login_mismatch, reject_unknown_sender_domain, permit_sasl_authenticated, permit_mynetworks
disable_vrfy_command = yes
# Postscreen settings to drop zombies/open relays/spam early
#postscreen_dnsbl_action = enforce
postscreen_dnsbl_action = ignore
postscreen_dnsbl_sites = zen.spamhaus.org*2
    bl.mailspike.net
    b.barracudacentral.org*2
    bl.spameatingmonkey.net
    bl.spamcop.net
    dnsbl.sorbs.net
    psbl.surriel.com
    list.dnswl.org=127.0.[0..255].0*-2
    list.dnswl.org=127.0.[0..255].1*-3
    list.dnswl.org=127.0.[0..255].[2..3]*-4
postscreen_dnsbl_threshold = 3
postscreen_dnsbl_whitelist_threshold = -1
postscreen_greet_action = enforce
postscreen_bare_newline_action = enforce
# SASL
smtpd_sasl_auth_enable = no
#smtpd_sasl_auth_enable = yes
##smtpd_sasl_path = /var/spool/postfix/private/auth
#smtpd_sasl_path = /var/spool/postfix/private/smtpd
##smtpd_sasl_type = dovecot
#smtpd_sasl_type = dovecot
##smtpd_sasl_security_options = noanonymous
#smtpd_sasl_security_options = noanonymous
##smtpd_sasl_local_domain = $mydomain
##broken_sasl_auth_clients = yes
#broken_sasl_auth_clients = yes
# SMTP configuration
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl/passwd
smtp_sasl_security_options = noanonymous
smtp_sasl_tls_security_options = noanonymous
smtp_tls_security_level = encrypt
header_size_limit = 4096000
relayhost = [smtp.sendgrid.net]:587
# Mail directory
virtual_transport = lmtp:unix:/var/run/dovecot/lmtp
virtual_mailbox_domains = /etc/postfix/vhost
virtual_mailbox_maps = texthash:/etc/postfix/vmailbox
virtual_alias_maps = texthash:/etc/postfix/virtual
# Additional option for filtering
content_filter = smtp-amavis:[127.0.0.1]:10024
# Milters used by DKIM
milter_protocol = 6
milter_default_action = accept
dkim_milter = inet:localhost:8891
dmarc_milter = inet:localhost:8893
smtpd_milters = $dkim_milter,$dmarc_milter
non_smtpd_milters = $dkim_milter
# SPF policy settings
policyd-spf_time_limit = 3600
# Header checks for content inspection on receiving
header_checks = pcre:/etc/postfix/maps/header_checks.pcre
# Remove unwanted headers that reveail our privacy
smtp_header_checks = pcre:/etc/postfix/maps/sender_header_filter.pcre
myhostname = mail.viktorbarzin.me
mydomain = viktorbarzin.me
smtputf8_enable = no
message_size_limit = 20480000
sender_canonical_maps = tcp:localhost:10001
sender_canonical_classes = envelope_sender
recipient_canonical_maps = tcp:localhost:10002
recipient_canonical_classes = envelope_recipient,header_recipient
compatibility_level = 2
# enable_original_recipient = no # b4 uncommenting see https://serverfault.com/questions/661615/how-to-drop-orig-to-using-postfix-virtual-domains
always_add_missing_headers = yes
anvil_status_update_time = 5s
EOT
}

1
stacks/mailserver/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Inherit the shared root Terragrunt configuration (remote state, providers).
include "root" {
  path = find_in_parent_folders()
}
# Ordering-only dependency on the infra stack: skip_outputs = true means no
# outputs are read — this just sequences applies after ../infra.
dependency "infra" {
  config_path = "../infra"
  skip_outputs = true
}

View file

@ -0,0 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
# NOTE: generated file — do not edit by hand; changes will be overwritten.
locals {
  # Tier label values shared across stacks; the numeric prefix appears to
  # encode ordering (0 = core ... 4 = aux) — confirm against the generator.
  tiers = {
    core    = "0-core"
    cluster = "1-cluster"
    gpu     = "2-gpu"
    edge    = "3-edge"
    aux     = "4-aux"
  }
}

29
stacks/monitoring/main.tf Normal file
View file

@ -0,0 +1,29 @@
# =============================================================================
# Monitoring Stack — Prometheus / Grafana / Loki
# =============================================================================
variable "tls_secret_name" {
  type = string
}

variable "nfs_server" {
  type = string
}

variable "mysql_host" {
  type = string
}

variable "monitoring_idrac_username" {
  type = string
}

# All monitoring credentials live under the shared "platform" KV v2 path.
data "vault_kv_secret_v2" "secrets" {
  mount = "secret"
  name  = "platform"
}

# Prometheus / Grafana / Loki plus the exporters they rely on.
module "monitoring" {
  source = "./modules/monitoring"

  tier            = local.tiers.cluster
  tls_secret_name = var.tls_secret_name
  nfs_server      = var.nfs_server
  mysql_host      = var.mysql_host

  # Secrets are read straight from Vault rather than threaded through
  # variables.
  idrac_username                = var.monitoring_idrac_username
  idrac_password                = data.vault_kv_secret_v2.secrets.data["monitoring_idrac_password"]
  alertmanager_account_password = data.vault_kv_secret_v2.secrets.data["alertmanager_account_password"]
  alertmanager_slack_api_url    = data.vault_kv_secret_v2.secrets.data["alertmanager_slack_api_url"]
  tiny_tuya_service_secret      = data.vault_kv_secret_v2.secrets.data["tiny_tuya_service_secret"]
  haos_api_token                = data.vault_kv_secret_v2.secrets.data["haos_api_token"]
  pve_password                  = data.vault_kv_secret_v2.secrets.data["pve_password"]
  grafana_admin_password        = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
}

View file

@ -0,0 +1,27 @@
# dockerhub: viktorbarzin/redfish-exporter
# repo: https://pkg.go.dev/github.com/jenningsloy318/redfish_exporter#section-readme
# Two-stage build: compile the exporter from source, then copy the single
# binary into a fresh image.
FROM golang:rc-bullseye AS builder
LABEL maintainer="Viktor Barzin <me@viktorbarzin.me>"
ARG ARCH=amd64
# Modern `ENV key=value` form; the legacy space-separated form is deprecated
# and triggers BuildKit warnings.
ENV GOROOT=/usr/local/go
ENV GOPATH=/go
ENV PATH="$GOROOT/bin:$GOPATH/bin:$PATH"
# NOTE(review): GO_VERSION is not referenced anywhere in this Dockerfile — the
# toolchain comes from the golang:rc-bullseye base image; confirm before
# relying on this value.
ENV GO_VERSION=1.15.2
ENV GO111MODULE=on
# Build dependencies
RUN mkdir -p /go/src/github.com/ && \
    git clone https://github.com/jenningsloy318/redfish_exporter /go/src/github.com/jenningsloy318/redfish_exporter && \
    cd /go/src/github.com/jenningsloy318/redfish_exporter && \
    make build
FROM golang:rc-bullseye
COPY --from=builder /go/src/github.com/jenningsloy318/redfish_exporter/build/redfish_exporter /usr/local/bin/redfish_exporter
RUN mkdir /etc/prometheus
# config file mounted at runtime
CMD ["/usr/local/bin/redfish_exporter", "--config.file", "/etc/prometheus/redfish_exporter.yml"]

View file

@ -0,0 +1,207 @@
# Grafana Alloy Helm values.
# Alloy tails pod logs, the systemd journal, and the Kubernetes audit log on
# each node and pushes them to Loki in the monitoring namespace.
alloy:
configMap:
# The Alloy configuration below is shipped verbatim to the agent via this
# ConfigMap; comments inside it use Alloy's own "//" syntax.
content: |-
// Write your Alloy config here:
logging {
level = "info"
format = "logfmt"
}
loki.write "default" {
endpoint {
url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
}
}
// discovery.kubernetes allows you to find scrape targets from Kubernetes resources.
// It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.
discovery.kubernetes "pod" {
role = "pod"
}
// discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
// If no rules are defined, then the input targets are exported as-is.
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pod.targets
// Label creation - "namespace" field from "__meta_kubernetes_namespace"
rule {
source_labels = ["__meta_kubernetes_namespace"]
action = "replace"
target_label = "namespace"
}
// Label creation - "pod" field from "__meta_kubernetes_pod_name"
rule {
source_labels = ["__meta_kubernetes_pod_name"]
action = "replace"
target_label = "pod"
}
// Label creation - "container" field from "__meta_kubernetes_pod_container_name"
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "container"
}
// Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
action = "replace"
target_label = "app"
}
// Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"
// Concatenate values __meta_kubernetes_namespace/__meta_kubernetes_pod_container_name
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "job"
separator = "/"
replacement = "$1"
}
// Label creation - "container" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"
// Concatenate values __meta_kubernetes_pod_uid/__meta_kubernetes_pod_container_name.log
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "__path__"
separator = "/"
replacement = "/var/log/pods/*$1/*.log"
}
// Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"
rule {
source_labels = ["__meta_kubernetes_pod_container_id"]
action = "replace"
target_label = "container_runtime"
regex = "^(\\S+):\\/\\/.+$"
replacement = "$1"
}
}
// loki.source.kubernetes tails logs from Kubernetes containers using the Kubernetes API.
loki.source.kubernetes "pod_logs" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.process.pod_logs.receiver]
}
// loki.process receives log entries from other Loki components, applies one or more processing stages,
// and forwards the results to the list of receivers in the component's arguments.
loki.process "pod_logs" {
stage.static_labels {
values = {
cluster = "default",
}
}
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
loki.relabel "journal" {
forward_to = []
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
loki.process "journal" {
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
// Requires alloy.mounts.varlog=true to mount /var/log from the host
local.file_match "audit_logs" {
path_targets = [{
__path__ = "/var/log/kubernetes/audit.log",
job = "kubernetes-audit",
node = env("HOSTNAME"),
}]
}
loki.source.file "audit_logs" {
targets = local.file_match.audit_logs.targets
forward_to = [loki.write.default.receiver]
}
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
# /etc/machine-id is needed so journal readers can locate the host's own
# journal files (journal directories are keyed by machine id).
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
# Host volumes backing the extra mounts declared above (DaemonSet pod spec).
controller:
volumes:
extra:
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Resource limits for DaemonSet pods
# Alloy tails logs from all containers on the node via K8s API and batches
# them to Loki. Memory scales with number of active log streams (~30-50 per node).
# 128Mi was OOMKilled; steady-state usage is ~400-450Mi per pod.
resources:
requests:
cpu: 50m
memory: 512Mi
limits:
memory: 1Gi

View file

@ -0,0 +1,62 @@
# Caretta: eBPF-based service-map collector (by groundcover). The chart's
# bundled Grafana and VictoriaMetrics are disabled — metrics are exposed via
# the caretta-metrics Service defined below instead.
resource "helm_release" "caretta" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "caretta"
repository = "https://helm.groundcover.com/"
chart = "caretta"
version = "0.0.16"
values = [yamlencode({
grafana = {
enabled = false
}
# NOTE(review): unquoted hyphenated key relies on HCL identifiers
# permitting "-"; quoting it ("victoria-metrics-single") would be the
# more conventional form — confirm terraform parses this as intended.
victoria-metrics-single = {
enabled = false
}
# Tolerate control-plane and GPU taints so the DaemonSet runs on every
# node and the service map covers the whole cluster.
tolerations = [
{
key = "node-role.kubernetes.io/control-plane"
operator = "Exists"
effect = "NoSchedule"
},
{
key = "nvidia.com/gpu"
operator = "Exists"
effect = "NoSchedule"
}
]
# Requests == limits on memory pins a fixed footprint per node.
resources = {
requests = {
cpu = "10m"
memory = "600Mi"
}
limits = {
memory = "600Mi"
}
}
})]
}
# ClusterIP service exposing Caretta's metrics endpoint (port 7117) on pods
# labelled app=caretta — presumably scraped by Prometheus; confirm a scrape
# config or ServiceMonitor targets it.
resource "kubernetes_service" "caretta_metrics" {
metadata {
name = "caretta-metrics"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
app = "caretta"
}
}
spec {
selector = {
app = "caretta"
}
port {
name = "metrics"
port = 7117
target_port = 7117
protocol = "TCP"
}
}
}
# Caretta dashboard is now loaded via the grafana_dashboards for_each in grafana.tf

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,861 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"gridPos": {
"h": 28,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"interval": "15s",
"options": {
"nodes": {
"mainStatUnit": ""
},
"edges": {
"mainStatUnit": ""
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "increase((sum by (id, title, subTitle, detail__kind, color) (label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"server_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"server_namespace\", \"(.*)\"), \"title\", \"$1\", \"server_name\", \"(.*)\"), \"id\", \"$1\", \"server_id\", \"(.*)\") or label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"client_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"client_namespace\", \"(.*)\"), \"title\", \"$1\", \"client_name\", \"(.*)\"), \"id\", \"$1\", \"client_id\", \"(.*)\")), \"color\", \"#8F8F8F\", \"subTitle\", \"(.*)\"), \"color\", \"#F2495C\", \"subTitle\", \"^external$\"), \"color\", \"#8AB8FF\", \"title\", \"^10\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^192\\\\.168\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^172\\\\.(1[6-9]|2[0-9]|3[01])\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^(0\\\\.0\\\\.0\\\\.0|localhost)$\"), \"color\", \"#8AB8FF\", \"subTitle\", \"^node$\"), \"color\", \"#FF9830\", \"subTitle\", \"^traefik$\"), \"color\", \"#5794F2\", \"subTitle\", \"^monitoring$\"), \"color\", \"#73BF69\", \"subTitle\", \"^dbaas$\"), \"color\", \"#B877D9\", \"subTitle\", 
\"^authentik$\"), \"color\", \"#FF7383\", \"subTitle\", \"^crowdsec$\"), \"color\", \"#FADE2A\", \"subTitle\", \"^uptime-kuma$\"), \"color\", \"#56A64B\", \"subTitle\", \"^immich$\"), \"color\", \"#C0D8FF\", \"subTitle\", \"^technitium$\"), \"color\", \"#FF6600\", \"subTitle\", \"^kyverno$\"), \"color\", \"#76B900\", \"subTitle\", \"^nvidia$\")))[$__range:$__interval]) > 0",
"format": "table",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "nodes"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "increase((sum by (id, source, target, mainStat) ((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"id\", \"$1\", \"link_id\", \"(.*)\"), \"source\", \"$1\", \"client_id\", \"(.*)\"), \"target\", \"$1\", \"server_id\", \"(.*)\"), \"mainStat\", \"$1\", \"server_port\", \"(.*)\"))))[$__range:$__interval]) > 0",
"format": "table",
"hide": false,
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "edges"
}
],
"title": "Service Map",
"type": "nodeGraph",
"fieldConfig": {
"defaults": {},
"overrides": []
}
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "blue",
"mode": "fixed"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"links": [],
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 28
},
"id": 4,
"options": {
"displayLabels": [
"name"
],
"legend": {
"displayMode": "list",
"placement": "right",
"showLegend": false
},
"pieType": "donut",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (server_port) (increase((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"})[$__range:$__interval])) > 0",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Active Ports",
"type": "piechart"
},
{
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"gridPos": {
"h": 4,
"w": 3,
"x": 21,
"y": 36
},
"id": 10,
"options": {
"code": {
"language": "plaintext",
"showLineNumbers": false,
"showMiniMap": false
},
"content": "<table style=\"width:100%; height:100%;border:0px solid black;\">\n <td style=\"text-align: center;vertical-align: middle;border:0px solid black; \">\n<div style=\"text-align: center\">\n<p align=\"center\">\n <img src=\"https://raw.githubusercontent.com/groundcover-com/caretta/main/images/logo.svg\" width=\"75%\" alt=\"caretta\" title=\"caretta\" />\n <h4>by <a href=\"https://www.groundcover.com\">groundcover</h4>\n\n \n [![slack](https://img.shields.io/badge/slack-groundcover-yellowgreen.svg?logo=slack)](http://www.groundcover.com/join-slack)\n \n</div>\n</p>\n</div>\n</td>\n</table>\n",
"mode": "markdown"
},
"pluginVersion": "10.1.2",
"type": "text"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "purple",
"mode": "continuous-blues"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 15,
"y": 28
},
"id": 8,
"options": {
"displayMode": "gradient",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "10.1.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "topk(8, sum by (client_name) ((rate(caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"}[$__range:$__interval]))))",
"format": "time_series",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "A"
}
],
"title": "Top Throughput Workloads",
"type": "bargauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-blues"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 6,
"y": 28
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.1.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "topk(7, sum by (client_name, server_name) ( rate( (caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\", client_kind!~\"(node|external)\",} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\", server_kind!~\"(node|external)\"})[$__range:$__interval]) ) )",
"format": "time_series",
"instant": true,
"legendFormat": "{{client_name}} \u2b82 {{server_name}}",
"range": false,
"refId": "A"
}
],
"title": "Top Throughput Connections",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 40
},
"id": 11,
"title": "Network Flows (GoFlow2 / pfSense NetFlow)",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "flows/s",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 41
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_process_nf_flowset_records_total{type=\"DataFlowSet\"}[5m])",
"legendFormat": "Flows/s ({{router}})",
"range": true,
"refId": "A"
}
],
"title": "NetFlow Ingestion Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 41
},
"id": 13,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_traffic_bytes_total[5m])",
"legendFormat": "Bytes/s from {{remote_ip}}",
"range": true,
"refId": "A"
}
],
"title": "NetFlow Traffic Volume",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10000
},
{
"color": "red",
"value": 100000
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 49
},
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_process_nf_flowset_records_total{type=\"DataFlowSet\"}",
"legendFormat": "Total Flows",
"instant": true,
"refId": "A"
}
],
"title": "Total Flows Processed",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 4,
"y": 49
},
"id": 15,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_process_nf_total",
"legendFormat": "Messages",
"instant": true,
"refId": "A"
}
],
"title": "NetFlow Messages",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": null
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 8,
"y": 49
},
"id": 16,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_traffic_bytes_total",
"legendFormat": "Bytes",
"instant": true,
"refId": "A"
}
],
"title": "Total NetFlow Bytes",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 12,
"y": 49
},
"id": 17,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_process_nf_delay_seconds_sum[5m]) / rate(goflow2_flow_process_nf_delay_seconds_count[5m])",
"legendFormat": "Avg Delay",
"range": true,
"refId": "A"
}
],
"title": "Flow Processing Delay",
"type": "timeseries"
}
],
"refresh": "1h",
"schemaVersion": 38,
"style": "dark",
"tags": [
"network",
"caretta",
"goflow2"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "default",
"value": "default"
},
"hide": 0,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "/.*_namespace=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "kind",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "/.*_kind=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"label": "workload",
"multi": true,
"name": "workload",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "/.*_name=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(server_port)",
"hide": 0,
"includeAll": true,
"label": "server port",
"multi": true,
"name": "port",
"options": [],
"query": {
"query": "label_values(server_port)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Network Observability",
"uid": "network-observability",
"version": 2,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,204 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Kubernetes API server audit logs from Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 0,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Recent Activity",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "Recent Kubernetes API actions from audit logs",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 1 },
"id": 1,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "{job=\"kubernetes-audit\"} | json | line_format \"{{.user.username}} {{.verb}} {{.objectRef.resource}} {{.objectRef.namespace}}\"",
"legendFormat": "",
"queryType": "range",
"refId": "A"
}
],
"title": "Recent Actions",
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"id": 101,
"panels": [],
"title": "Request Rates",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API request count by user over time",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
"id": 2,
"options": {
"legend": { "calcs": ["sum", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "sum by (user_username) (count_over_time({job=\"kubernetes-audit\"} | json [5m]))",
"legendFormat": "{{user_username}}",
"queryType": "range",
"refId": "A"
}
],
"title": "Request Count by User",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
"id": 102,
"panels": [],
"title": "Denied Requests",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API requests denied with HTTP 403+ status codes",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 403 }
]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 },
"id": 3,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "{job=\"kubernetes-audit\"} | json | responseStatus_code >= 403",
"legendFormat": "",
"queryType": "range",
"refId": "A"
}
],
"title": "Denied Requests (403+)",
"type": "table"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": ["kubernetes", "audit", "security"],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Kubernetes Audit Logs",
"uid": "k8s-audit",
"version": 1
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,288 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Logs collected from Kubernetes, stored in Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 15141,
"graphTooltip": 0,
"id": 25,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "sum(count_over_time({namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\" [$__interval]))",
"instant": false,
"legendFormat": "Log count",
"queryType": "range",
"range": true,
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "Logs from services running in Kubernetes",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "{namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\"",
"queryType": "range",
"refId": "A"
}
],
"type": "logs"
}
],
"refresh": "5s",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "",
"value": ""
},
"description": "String to search for",
"hide": 0,
"label": "Search Query",
"name": "query",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"skipUrlSync": false,
"type": "textbox"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"dbaas"
],
"value": [
"dbaas"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(stream)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "stream",
"options": [],
"query": "label_values(stream)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(container)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "container",
"options": [],
"query": "label_values(container)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Loki Kubernetes Logs",
"uid": "o6-BGgnnk",
"version": 2,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,816 @@
{
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 0
},
"id": 12,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"instant": false,
"interval": "",
"legendFormat": "GPU 0",
"refId": "A"
}
],
"title": "GPU Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 14,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Current Temp",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Power Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 16,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE)",
"instant": true,
"interval": "",
"legendFormat": "",
"range": false,
"refId": "A"
}
],
"title": "GPU Power Total",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decmbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 18,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Framebuffer Mem Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 2,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_SM_CLOCK * 1000000",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU SM Clocks",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 19,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum by (namespace) (gpu_pod_memory_used_bytes)",
"instant": false,
"legendFormat": "{{namespace}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Memory per Application",
"type": "timeseries"
}
],
"preload": false,
"refresh": "auto",
"schemaVersion": 42,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-12h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "Oxed_c6Wz",
"version": 9
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,488 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Technitium DNS query logs from MySQL",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"title": "Total Queries",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT COUNT(*) as total_queries FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Cached %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.3 },
{ "color": "green", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) / COUNT(*) as cached_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Blocked %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 0.3 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) / COUNT(*) as blocked_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "NxDomain %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.2 },
{ "color": "red", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END) / COUNT(*) as nxdomain_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Avg Response Time",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "ms",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "red", "value": 200 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT AVG(response_rtt) as avg_rtt_ms FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries by Protocol",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN protocol = 0 THEN 1 ELSE 0 END) as UDP, SUM(CASE WHEN protocol = 1 THEN 1 ELSE 0 END) as TCP, SUM(CASE WHEN protocol = 3 THEN 1 ELSE 0 END) as DoH, SUM(CASE WHEN protocol = 4 THEN 1 ELSE 0 END) as DoT FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries Over Time",
"type": "timeseries",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["sum"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as Authoritative, SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as Recursive, SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as Cached, SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as Blocked, SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as Dropped FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Response Codes",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 0 THEN 1 ELSE 0 END) as NOERROR, SUM(CASE WHEN rcode = 2 THEN 1 ELSE 0 END) as SERVFAIL, SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END) as NXDOMAIN, SUM(CASE WHEN rcode = 5 THEN 1 ELSE 0 END) as REFUSED, SUM(CASE WHEN rcode NOT IN (0,2,3,5) THEN 1 ELSE 0 END) as Other FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Response Types",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "Cached" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Blocked" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Recursive" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Authoritative" }, "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as Authoritative, SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as Recursive, SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as Cached, SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as Blocked, SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as Dropped FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Query Types",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN qtype = 1 THEN 1 ELSE 0 END) as A, SUM(CASE WHEN qtype = 28 THEN 1 ELSE 0 END) as AAAA, SUM(CASE WHEN qtype = 5 THEN 1 ELSE 0 END) as CNAME, SUM(CASE WHEN qtype = 15 THEN 1 ELSE 0 END) as MX, SUM(CASE WHEN qtype = 16 THEN 1 ELSE 0 END) as TXT, SUM(CASE WHEN qtype = 33 THEN 1 ELSE 0 END) as SRV, SUM(CASE WHEN qtype = 12 THEN 1 ELSE 0 END) as PTR, SUM(CASE WHEN qtype = 6 THEN 1 ELSE 0 END) as SOA, SUM(CASE WHEN qtype = 2 THEN 1 ELSE 0 END) as NS, SUM(CASE WHEN qtype = 65 THEN 1 ELSE 0 END) as HTTPS, SUM(CASE WHEN qtype NOT IN (1,2,5,6,12,15,16,28,33,65) THEN 1 ELSE 0 END) as Other FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Queried Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Clients",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT client_ip, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY client_ip ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Average Response Time Over Time",
"type": "timeseries",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "ms",
"custom": {
"axisBorderShow": false,
"axisLabel": "Response Time (ms)",
"axisPlacement": "auto",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, AVG(response_rtt) as avg_rtt, MAX(response_rtt) as max_rtt FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Top 20 NxDomain Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND rcode = 3 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Blocked Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND response_type = 4 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
}
],
"refresh": "5m",
"schemaVersion": 39,
"tags": ["dns", "technitium", "mysql"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Technitium DNS",
"uid": "technitium-dns",
"version": 1
}

View file

@ -0,0 +1,303 @@
# HELP snmpEnableAuthenTraps Indicates whether the SNMP entity is permitted to generate authenticationFailure traps - 1.3.6.1.2.1.11.30
# TYPE snmpEnableAuthenTraps gauge
snmpEnableAuthenTraps 2
# HELP snmpInASNParseErrs The total number of ASN.1 or BER errors encountered by the SNMP entity when decoding received SNMP messages. - 1.3.6.1.2.1.11.6
# TYPE snmpInASNParseErrs counter
snmpInASNParseErrs 0
# HELP snmpInBadCommunityNames The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which used an SNMP community name not known to said entity - 1.3.6.1.2.1.11.4
# TYPE snmpInBadCommunityNames counter
snmpInBadCommunityNames 184
# HELP snmpInBadCommunityUses The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which represented an SNMP operation that was not allowed for the SNMP community named in the message - 1.3.6.1.2.1.11.5
# TYPE snmpInBadCommunityUses counter
snmpInBadCommunityUses 0
# HELP snmpInBadValues The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.10
# TYPE snmpInBadValues counter
snmpInBadValues 0
# HELP snmpInBadVersions The total number of SNMP messages which were delivered to the SNMP entity and were for an unsupported SNMP version. - 1.3.6.1.2.1.11.3
# TYPE snmpInBadVersions counter
snmpInBadVersions 0
# HELP snmpInGenErrs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.12
# TYPE snmpInGenErrs counter
snmpInGenErrs 0
# HELP snmpInGetNexts The total number of SNMP Get-Next PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.16
# TYPE snmpInGetNexts counter
snmpInGetNexts 2940
# HELP snmpInGetRequests The total number of SNMP Get-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.15
# TYPE snmpInGetRequests counter
snmpInGetRequests 9
# HELP snmpInGetResponses The total number of SNMP Get-Response PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.18
# TYPE snmpInGetResponses counter
snmpInGetResponses 0
# HELP snmpInNoSuchNames The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `noSuchName'. - 1.3.6.1.2.1.11.9
# TYPE snmpInNoSuchNames counter
snmpInNoSuchNames 0
# HELP snmpInPkts The total number of messages delivered to the SNMP entity from the transport service. - 1.3.6.1.2.1.11.1
# TYPE snmpInPkts counter
snmpInPkts 5928
# HELP snmpInReadOnlys The total number valid SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `readOnly' - 1.3.6.1.2.1.11.11
# TYPE snmpInReadOnlys counter
snmpInReadOnlys 0
# HELP snmpInSetRequests The total number of SNMP Set-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.17
# TYPE snmpInSetRequests counter
snmpInSetRequests 0
# HELP snmpInTooBigs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `tooBig'. - 1.3.6.1.2.1.11.8
# TYPE snmpInTooBigs counter
snmpInTooBigs 0
# HELP snmpInTotalReqVars The total number of MIB objects which have been retrieved successfully by the SNMP protocol entity as the result of receiving valid SNMP Get-Request and Get-Next PDUs. - 1.3.6.1.2.1.11.13
# TYPE snmpInTotalReqVars counter
snmpInTotalReqVars 72699
# HELP snmpInTotalSetVars The total number of MIB objects which have been altered successfully by the SNMP protocol entity as the result of receiving valid SNMP Set-Request PDUs. - 1.3.6.1.2.1.11.14
# TYPE snmpInTotalSetVars counter
snmpInTotalSetVars 0
# HELP snmpInTraps The total number of SNMP Trap PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.19
# TYPE snmpInTraps counter
snmpInTraps 0
# HELP snmpOutBadValues The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.22
# TYPE snmpOutBadValues counter
snmpOutBadValues 0
# HELP snmpOutGenErrs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.24
# TYPE snmpOutGenErrs counter
snmpOutGenErrs 0
# HELP snmpOutGetNexts The total number of SNMP Get-Next PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.26
# TYPE snmpOutGetNexts counter
snmpOutGetNexts 0
# HELP snmpOutGetRequests The total number of SNMP Get-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.25
# TYPE snmpOutGetRequests counter
snmpOutGetRequests 0
# HELP snmpOutGetResponses The total number of SNMP Get-Response PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.28
# TYPE snmpOutGetResponses counter
snmpOutGetResponses 5740
# HELP snmpOutNoSuchNames The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status was `noSuchName'. - 1.3.6.1.2.1.11.21
# TYPE snmpOutNoSuchNames counter
snmpOutNoSuchNames 0
# HELP snmpOutPkts The total number of SNMP Messages which were passed from the SNMP protocol entity to the transport service. - 1.3.6.1.2.1.11.2
# TYPE snmpOutPkts counter
snmpOutPkts 5739
# HELP snmpOutSetRequests The total number of SNMP Set-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.27
# TYPE snmpOutSetRequests counter
snmpOutSetRequests 0
# HELP snmpOutTooBigs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `tooBig.' - 1.3.6.1.2.1.11.20
# TYPE snmpOutTooBigs counter
snmpOutTooBigs 0
# HELP snmpOutTraps The total number of SNMP Trap PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.29
# TYPE snmpOutTraps counter
snmpOutTraps 0
# HELP snmpProxyDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the transmission of the (possibly translated) message to a proxy target failed in a manner (other than a time-out) such that no Response Class PDU (such as a Response-PDU) could be returned. - 1.3.6.1.2.1.11.32
# TYPE snmpProxyDrops counter
snmpProxyDrops 0
# HELP snmpSilentDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the size of a reply containing an alternate Response Class PDU (such as a Response-PDU) with an empty variable-bindings field was greater than either a local constraint or the maximum message size associated with the originator of the request. - 1.3.6.1.2.1.11.31
# TYPE snmpSilentDrops counter
snmpSilentDrops 0
# HELP snmp_scrape_duration_seconds Total SNMP time scrape took (walk and processing).
# TYPE snmp_scrape_duration_seconds gauge
snmp_scrape_duration_seconds{module="huawei"} 0.39253882
# HELP snmp_scrape_packets_retried Packets retried for get, bulkget, and walk.
# TYPE snmp_scrape_packets_retried gauge
snmp_scrape_packets_retried{module="huawei"} 0
# HELP snmp_scrape_packets_sent Packets sent for get, bulkget, and walk; including retries.
# TYPE snmp_scrape_packets_sent gauge
snmp_scrape_packets_sent{module="huawei"} 6
# HELP snmp_scrape_pdus_returned PDUs returned from get, bulkget, and walk.
# TYPE snmp_scrape_pdus_returned gauge
snmp_scrape_pdus_returned{module="huawei"} 104
# HELP snmp_scrape_walk_duration_seconds Time SNMP walk/bulkwalk took.
# TYPE snmp_scrape_walk_duration_seconds gauge
snmp_scrape_walk_duration_seconds{module="huawei"} 0.391760524
# HELP sysContact The textual identification of the contact person for this managed node, together with information on how to contact this person - 1.3.6.1.2.1.1.4
# TYPE sysContact gauge
sysContact{sysContact="Not Configure System Contact"} 1
# HELP sysDescr A textual description of the entity - 1.3.6.1.2.1.1.1
# TYPE sysDescr gauge
sysDescr{sysDescr="Linux GSE200M 2.6.27-SPEAr310 #80 Fri Jan 13 11:22:09 CST 2017 armv5tejl"} 1
# HELP sysLocation The physical location of this node (e.g., 'telephone closet, 3rd floor') - 1.3.6.1.2.1.1.6
# TYPE sysLocation gauge
sysLocation{sysLocation="Garage G03"} 1
# HELP sysName An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5
# TYPE sysName gauge
sysName{sysName="ups2000"} 1
# HELP sysORDescr A textual description of the capabilities identified by the corresponding instance of sysORID. - 1.3.6.1.2.1.1.9.1.3
# TYPE sysORDescr gauge
sysORDescr{sysORDescr="The MIB for Message Processing and Dispatching.",sysORIndex="3"} 1
sysORDescr{sysORDescr="The MIB module for SNMPv2 entities",sysORIndex="1"} 1
sysORDescr{sysORDescr="The SNMP Management Architecture MIB.",sysORIndex="5"} 1
sysORDescr{sysORDescr="The management information definitions for the SNMP User-based Security Model.",sysORIndex="4"} 1
sysORDescr{sysORDescr="View-based Access Control Model for SNMP.",sysORIndex="2"} 1
# HELP sysORID An authoritative identification of a capabilities statement with respect to various MIB modules supported by the local SNMP application acting as a command responder. - 1.3.6.1.2.1.1.9.1.2
# TYPE sysORID gauge
sysORID{sysORID="1.3.6.1.6.3.1",sysORIndex="1"} 1
sysORID{sysORID="1.3.6.1.6.3.10.3.1.1",sysORIndex="5"} 1
sysORID{sysORID="1.3.6.1.6.3.11.3.1.1",sysORIndex="3"} 1
sysORID{sysORID="1.3.6.1.6.3.15.2.1.1",sysORIndex="4"} 1
sysORID{sysORID="1.3.6.1.6.3.16.2.2.1",sysORIndex="2"} 1
# HELP sysORLastChange The value of sysUpTime at the time of the most recent change in state or value of any instance of sysORID. - 1.3.6.1.2.1.1.8
# TYPE sysORLastChange gauge
sysORLastChange 8
# HELP sysORUpTime The value of sysUpTime at the time this conceptual row was last instantiated. - 1.3.6.1.2.1.1.9.1.4
# TYPE sysORUpTime gauge
sysORUpTime{sysORIndex="1"} 7
sysORUpTime{sysORIndex="2"} 8
sysORUpTime{sysORIndex="3"} 8
sysORUpTime{sysORIndex="4"} 8
sysORUpTime{sysORIndex="5"} 8
# HELP sysObjectID The vendor's authoritative identification of the network management subsystem contained in the entity - 1.3.6.1.2.1.1.2
# TYPE sysObjectID gauge
sysObjectID{sysObjectID="1.3.6.1.4.1.8072.3.2.10"} 1
# HELP sysUpTime The time (in hundredths of a second) since the network management portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3
# TYPE sysUpTime gauge
sysUpTime 5.3264032e+07
# HELP upsAlarmsPresent The present number of active alarm conditions. - 1.3.6.1.2.1.33.1.6.1
# TYPE upsAlarmsPresent gauge
upsAlarmsPresent 0
# HELP upsAutoRestart Setting this object to 'on' will cause the UPS system to restart after a shutdown if the shutdown occurred during a power loss as a result of either a upsShutdownAfterDelay or an internal battery depleted condition - 1.3.6.1.2.1.33.1.8.5
# TYPE upsAutoRestart gauge
upsAutoRestart 0
# HELP upsBatteryCurrent The present battery current. - 1.3.6.1.2.1.33.1.2.6
# TYPE upsBatteryCurrent gauge
upsBatteryCurrent 2.147483647e+09
# HELP upsBatteryStatus The indication of the capacity remaining in the UPS system's batteries - 1.3.6.1.2.1.33.1.2.1
# TYPE upsBatteryStatus gauge
upsBatteryStatus 2
# HELP upsBatteryTemperature The ambient temperature at or near the UPS Battery casing. - 1.3.6.1.2.1.33.1.2.7
# TYPE upsBatteryTemperature gauge
upsBatteryTemperature 2.147483647e+09
# HELP upsBatteryVoltage The magnitude of the present battery voltage. - 1.3.6.1.2.1.33.1.2.5
# TYPE upsBatteryVoltage gauge
upsBatteryVoltage 821
# HELP upsBypassFrequency The present bypass frequency. - 1.3.6.1.2.1.33.1.5.1
# TYPE upsBypassFrequency gauge
upsBypassFrequency 500
# HELP upsBypassLineIndex The bypass line identifier. - 1.3.6.1.2.1.33.1.5.3.1.1
# TYPE upsBypassLineIndex gauge
upsBypassLineIndex{upsBypassLineIndex="1"} 1
# HELP upsBypassNumLines The number of bypass lines utilized in this device - 1.3.6.1.2.1.33.1.5.2
# TYPE upsBypassNumLines gauge
upsBypassNumLines 1
# HELP upsBypassVoltage The present bypass voltage. - 1.3.6.1.2.1.33.1.5.3.1.2
# TYPE upsBypassVoltage gauge
upsBypassVoltage{upsBypassLineIndex="1"} 220
# HELP upsConfigAudibleStatus The requested state of the audible alarm - 1.3.6.1.2.1.33.1.9.8
# TYPE upsConfigAudibleStatus gauge
upsConfigAudibleStatus 0
# HELP upsConfigHighVoltageTransferPoint The maximum line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.10
# TYPE upsConfigHighVoltageTransferPoint gauge
upsConfigHighVoltageTransferPoint 0
# HELP upsConfigInputFreq The nominal input frequency - 1.3.6.1.2.1.33.1.9.2
# TYPE upsConfigInputFreq gauge
upsConfigInputFreq 0
# HELP upsConfigInputVoltage The magnitude of the nominal input voltage - 1.3.6.1.2.1.33.1.9.1
# TYPE upsConfigInputVoltage gauge
upsConfigInputVoltage 0
# HELP upsConfigLowBattTime The value of upsEstimatedMinutesRemaining at which a lowBattery condition is declared - 1.3.6.1.2.1.33.1.9.7
# TYPE upsConfigLowBattTime gauge
upsConfigLowBattTime 0
# HELP upsConfigLowVoltageTransferPoint The minimum input line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.9
# TYPE upsConfigLowVoltageTransferPoint gauge
upsConfigLowVoltageTransferPoint 0
# HELP upsConfigOutputFreq The nominal output frequency - 1.3.6.1.2.1.33.1.9.4
# TYPE upsConfigOutputFreq gauge
upsConfigOutputFreq 0
# HELP upsConfigOutputPower The magnitude of the nominal true power rating. - 1.3.6.1.2.1.33.1.9.6
# TYPE upsConfigOutputPower gauge
upsConfigOutputPower 0
# HELP upsConfigOutputVA The magnitude of the nominal Volt-Amp rating. - 1.3.6.1.2.1.33.1.9.5
# TYPE upsConfigOutputVA gauge
upsConfigOutputVA 0
# HELP upsConfigOutputVoltage The magnitude of the nominal output voltage - 1.3.6.1.2.1.33.1.9.3
# TYPE upsConfigOutputVoltage gauge
upsConfigOutputVoltage 0
# HELP upsEstimatedChargeRemaining An estimate of the battery charge remaining expressed as a percent of full charge. - 1.3.6.1.2.1.33.1.2.4
# TYPE upsEstimatedChargeRemaining gauge
upsEstimatedChargeRemaining 91
# HELP upsEstimatedMinutesRemaining An estimate of the time to battery charge depletion under the present load conditions if the utility power is off and remains off, or if it were to be lost and remain off. - 1.3.6.1.2.1.33.1.2.3
# TYPE upsEstimatedMinutesRemaining gauge
upsEstimatedMinutesRemaining 34
# HELP upsIdentAgentSoftwareVersion The UPS agent software version - 1.3.6.1.2.1.33.1.1.4
# TYPE upsIdentAgentSoftwareVersion gauge
upsIdentAgentSoftwareVersion{upsIdentAgentSoftwareVersion="V200R001C31B016"} 1
# HELP upsIdentAttachedDevices A string identifying the devices attached to the output(s) of the UPS - 1.3.6.1.2.1.33.1.1.6
# TYPE upsIdentAttachedDevices gauge
upsIdentAttachedDevices{upsIdentAttachedDevices="None"} 1
# HELP upsIdentManufacturer The name of the UPS manufacturer. - 1.3.6.1.2.1.33.1.1.1
# TYPE upsIdentManufacturer gauge
upsIdentManufacturer{upsIdentManufacturer="HUAWEI"} 1
# HELP upsIdentModel The UPS Model designation. - 1.3.6.1.2.1.33.1.1.2
# TYPE upsIdentModel gauge
upsIdentModel{upsIdentModel="UPS2000 2kVA"} 1
# HELP upsIdentName A string identifying the UPS - 1.3.6.1.2.1.33.1.1.5
# TYPE upsIdentName gauge
upsIdentName{upsIdentName="ups2000"} 1
# HELP upsIdentUPSSoftwareVersion The UPS firmware/software version(s) - 1.3.6.1.2.1.33.1.1.3
# TYPE upsIdentUPSSoftwareVersion gauge
upsIdentUPSSoftwareVersion{upsIdentUPSSoftwareVersion="V2R1C1SPC40"} 1
# HELP upsInputFrequency The present input frequency. - 1.3.6.1.2.1.33.1.3.3.1.2
# TYPE upsInputFrequency gauge
upsInputFrequency{upsInputLineIndex="1"} 500
# HELP upsInputLineBads A count of the number of times the input entered an out-of-tolerance condition as defined by the manufacturer - 1.3.6.1.2.1.33.1.3.1
# TYPE upsInputLineBads counter
upsInputLineBads 0
# HELP upsInputLineIndex The input line identifier. - 1.3.6.1.2.1.33.1.3.3.1.1
# TYPE upsInputLineIndex gauge
upsInputLineIndex{upsInputLineIndex="1"} 1
# HELP upsInputNumLines The number of input lines utilized in this device - 1.3.6.1.2.1.33.1.3.2
# TYPE upsInputNumLines gauge
upsInputNumLines 1
# HELP upsInputVoltage The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
# TYPE upsInputVoltage gauge
upsInputVoltage{upsInputLineIndex="1"} 218
# HELP upsOutputCurrent The present output current. - 1.3.6.1.2.1.33.1.4.4.1.3
# TYPE upsOutputCurrent gauge
upsOutputCurrent{upsOutputLineIndex="1"} 56
# HELP upsOutputFrequency The present output frequency. - 1.3.6.1.2.1.33.1.4.2
# TYPE upsOutputFrequency gauge
upsOutputFrequency 500
# HELP upsOutputLineIndex The output line identifier. - 1.3.6.1.2.1.33.1.4.4.1.1
# TYPE upsOutputLineIndex gauge
upsOutputLineIndex{upsOutputLineIndex="1"} 1
# HELP upsOutputNumLines The number of output lines utilized in this device - 1.3.6.1.2.1.33.1.4.3
# TYPE upsOutputNumLines gauge
upsOutputNumLines 1
# HELP upsOutputPercentLoad The percentage of the UPS power capacity presently being used on this output line, i.e., the greater of the percent load of true power capacity and the percent load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
# TYPE upsOutputPercentLoad gauge
upsOutputPercentLoad{upsOutputLineIndex="1"} 66
# HELP upsOutputPower The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
# TYPE upsOutputPower gauge
upsOutputPower{upsOutputLineIndex="1"} 1
# HELP upsOutputSource The present source of output power - 1.3.6.1.2.1.33.1.4.1
# TYPE upsOutputSource gauge
upsOutputSource 3
# HELP upsOutputVoltage The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
# TYPE upsOutputVoltage gauge
upsOutputVoltage{upsOutputLineIndex="1"} 230
# HELP upsRebootWithDuration Setting this object will immediately shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) for a period equal to the indicated number of seconds, after which time the output will be started, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.4
# TYPE upsRebootWithDuration gauge
upsRebootWithDuration 0
# HELP upsSecondsOnBattery If the unit is on battery power, the elapsed time since the UPS last switched to battery power, or the time since the network management subsystem was last restarted, whichever is less - 1.3.6.1.2.1.33.1.2.2
# TYPE upsSecondsOnBattery gauge
upsSecondsOnBattery 0
# HELP upsShutdownAfterDelay Setting this object will shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) after the indicated number of seconds, or less if the UPS batteries become depleted - 1.3.6.1.2.1.33.1.8.2
# TYPE upsShutdownAfterDelay gauge
upsShutdownAfterDelay 0
# HELP upsShutdownType This object determines the nature of the action to be taken at the time when the countdown of the upsShutdownAfterDelay and upsRebootWithDuration objects reaches zero - 1.3.6.1.2.1.33.1.8.1
# TYPE upsShutdownType gauge
upsShutdownType 0
# HELP upsStartupAfterDelay Setting this object will start the output after the indicated number of seconds, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.3
# TYPE upsStartupAfterDelay gauge
upsStartupAfterDelay 0
# HELP upsTestElapsedTime The amount of time, in TimeTicks, since the test in progress was initiated, or, if no test is in progress, the previous test took to complete - 1.3.6.1.2.1.33.1.7.6
# TYPE upsTestElapsedTime gauge
upsTestElapsedTime 0
# HELP upsTestId The test is named by an OBJECT IDENTIFIER which allows a standard mechanism for the initiation of tests, including the well known tests identified in this document as well as those introduced by a particular implementation, i.e., as documented in the private enterprise MIB definition for the device - 1.3.6.1.2.1.33.1.7.1
# TYPE upsTestId gauge
upsTestId{upsTestId="0"} 1
# HELP upsTestResultsDetail Additional information about upsTestResultsSummary - 1.3.6.1.2.1.33.1.7.4
# TYPE upsTestResultsDetail gauge
upsTestResultsDetail{upsTestResultsDetail="0"} 1
# HELP upsTestResultsSummary The results of the current or last UPS diagnostics test performed - 1.3.6.1.2.1.33.1.7.3
# TYPE upsTestResultsSummary gauge
upsTestResultsSummary 0
# HELP upsTestSpinLock A spin lock on the test subsystem - 1.3.6.1.2.1.33.1.7.2
# TYPE upsTestSpinLock gauge
upsTestSpinLock 0
# HELP upsTestStartTime The value of sysUpTime at the time the test in progress was initiated, or, if no test is in progress, the time the previous test was initiated - 1.3.6.1.2.1.33.1.7.5
# TYPE upsTestStartTime gauge
upsTestStartTime 0

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,97 @@
# goflow2 flow collector (netsampler/goflow2): receives NetFlow datagrams on
# UDP 2055 and exposes a "metrics" port on TCP 8080 (scraped via the companion
# kubernetes_service.goflow2 below).
resource "kubernetes_deployment" "goflow2" {
  metadata {
    name      = "goflow2"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "goflow2"
      tier = var.tier # stack-wide tier label, value supplied by the caller
    }
  }
  spec {
    # Single replica: NetFlow ingest is stateful per-exporter; NOTE(review):
    # confirm senders tolerate brief gaps during pod restarts.
    replicas = 1
    selector {
      match_labels = {
        app = "goflow2"
      }
    }
    template {
      metadata {
        labels = {
          app = "goflow2"
        }
      }
      spec {
        container {
          name  = "goflow2"
          image = "netsampler/goflow2:v2.2.1"
          # Listen for NetFlow on all interfaces, port 2055.
          args = ["-listen", "netflow://:2055"]
          port {
            name           = "netflow"
            container_port = 2055
            protocol       = "UDP"
          }
          port {
            name           = "metrics"
            container_port = 8080
            protocol       = "TCP"
          }
          resources {
            requests = {
              cpu    = "50m"
              memory = "128Mi"
            }
            # Memory limit equals the request; no CPU limit is set on purpose.
            limits = {
              memory = "128Mi"
            }
          }
        }
      }
    }
  }
}
# In-cluster (ClusterIP) service for the goflow2 metrics endpoint (TCP 8080).
resource "kubernetes_service" "goflow2" {
  metadata {
    name      = "goflow2"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "goflow2"
    }
  }
  spec {
    selector = {
      app = "goflow2"
    }
    port {
      name        = "metrics"
      port        = 8080
      target_port = 8080
      protocol    = "TCP"
    }
  }
}
# NodePort service for NetFlow ingest: devices outside the cluster send flows
# to any node on UDP 32055, which is forwarded to the pod's port 2055.
resource "kubernetes_service" "goflow2-netflow" {
  metadata {
    name      = "goflow2-netflow"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "goflow2"
    }
  }
  spec {
    type = "NodePort"
    selector = {
      app = "goflow2"
    }
    port {
      name        = "netflow"
      port        = 2055
      target_port = 2055
      protocol    = "UDP"
      node_port   = 32055 # fixed node port so upstream exporters can be configured statically
    }
  }
}

View file

@ -0,0 +1,132 @@
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
# metadata {
# name = "grafana-pv"
# }
# spec {
# capacity = {
# "storage" = "2Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# nfs {
# path = "/mnt/main/grafana"
# server = var.nfs_server
# }
# # iscsi {
# # target_portal = "iscsi.viktorbarzin.lan:3260"
# # iqn = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
# # lun = 0
# # fs_type = "ext4"
# # }
# }
# }
# }
# Statically provisioned PV for Alertmanager, backed by an NFS export through
# the nfs.csi.k8s.io CSI driver (server comes from var.nfs_server).
resource "kubernetes_persistent_volume" "alertmanager_pv" {
  metadata {
    name = "alertmanager-pv"
  }
  spec {
    capacity = {
      "storage" = "2Gi"
    }
    access_modes = ["ReadWriteOnce"]
    persistent_volume_source {
      csi {
        driver        = "nfs.csi.k8s.io"
        volume_handle = "alertmanager-pv" # must be unique per volume for the CSI driver
        volume_attributes = {
          server = var.nfs_server
          share  = "/mnt/main/alertmanager"
        }
      }
    }
    # Soft mount with short timeouts/attr cache so an unreachable NFS server
    # fails I/O instead of hanging the pod indefinitely.
    mount_options = [
      "soft",
      "timeo=30",
      "retrans=3",
      "actimeo=5",
    ]
    storage_class_name = "nfs-truenas"
  }
}
# resource "kubernetes_persistent_volume_claim" "grafana_pvc" {
# metadata {
# name = "grafana-pvc"
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# }
# spec {
# access_modes = ["ReadWriteOnce"]
# resources {
# requests = {
# "storage" = "2Gi"
# }
# }
# }
# }
# DB credentials from Vault database engine (rotated automatically)
# Provides GF_DATABASE_PASSWORD that auto-updates when password rotates
# ExternalSecret that renders the Grafana DB password from the Vault-backed
# ClusterSecretStore into a k8s Secret consumed via envFromSecrets in the chart.
resource "kubernetes_manifest" "grafana_db_creds" {
  manifest = {
    # NOTE(review): external-secrets.io/v1beta1 is deprecated upstream in
    # favor of v1 — confirm the installed ESO version still serves v1beta1.
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"
    metadata = {
      name      = "grafana-db-creds"
      namespace = kubernetes_namespace.monitoring.metadata[0].name
    }
    spec = {
      # Re-read Vault every 15m so rotated passwords propagate automatically.
      refreshInterval = "15m"
      secretStoreRef = {
        name = "vault-database"
        kind = "ClusterSecretStore"
      }
      target = {
        name = "grafana-db-creds"
        template = {
          data = {
            # Key name matches Grafana's env-var override for grafana.ini database.password.
            GF_DATABASE_PASSWORD = "{{ .password }}"
          }
        }
      }
      data = [{
        secretKey = "password"
        remoteRef = {
          key      = "static-creds/mysql-grafana"
          property = "password"
        }
      }]
    }
  }
}
# One ConfigMap per dashboard JSON file under dashboards/. The
# grafana_dashboard=1 label lets the Grafana dashboards sidecar discover them
# (the label name matches sidecar.dashboards.label in the chart values).
resource "kubernetes_config_map" "grafana_dashboards" {
  for_each = fileset("${path.module}/dashboards", "*.json")
  metadata {
    # e.g. "my_dash.json" -> "grafana-dashboard-my-dash"
    # (underscores are not valid in Kubernetes object names).
    name      = "grafana-dashboard-${replace(trimsuffix(each.value, ".json"), "_", "-")}"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      grafana_dashboard = "1"
    }
  }
  data = {
    (each.value) = file("${path.module}/dashboards/${each.value}")
  }
}
# Grafana via the official Helm chart; values are templated with the admin
# password and the external MySQL host.
resource "helm_release" "grafana" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true
  name             = "grafana"
  atomic           = true # roll back automatically if install/upgrade fails
  timeout          = 600
  repository       = "https://grafana.github.io/helm-charts"
  chart            = "grafana"
  values           = [templatefile("${path.module}/grafana_chart_values.yaml", { grafana_admin_password = var.grafana_admin_password, mysql_host = var.mysql_host })]
  # The DB-password ExternalSecret must exist before pods reference it via envFromSecrets.
  depends_on = [kubernetes_manifest.grafana_db_creds]
}

View file

@ -0,0 +1,103 @@
# Grafana Helm chart values. Rendered through Terraform templatefile():
# ${grafana_admin_password} and ${mysql_host} are template variables.
deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 0
    maxUnavailable: 1
replicas: 2
adminPassword: "${grafana_admin_password}"
resources:
  requests:
    cpu: 50m
    memory: 512Mi
  limits:
    memory: 512Mi
# Prefer spreading the two replicas across nodes.
topologySpreadConstraints:
  - maxSkew: 1
    topologyKey: kubernetes.io/hostname
    whenUnsatisfiable: ScheduleAnyway
    labelSelector:
      matchLabels:
        app.kubernetes.io/name: grafana
podAnnotations:
  # Wait for the external MySQL before starting (handled by a Kyverno policy).
  dependency.kyverno.io/wait-for: "mysql.dbaas:3306"
podDisruptionBudget:
  maxUnavailable: 1
persistence:
  enabled: false # using external mysql
  existingClaim: "grafana-pvc"
ingress:
  enabled: true
  ingressClassName: "traefik"
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd"
    traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
    gethomepage.dev/enabled: "true"
    gethomepage.dev/name: "Grafana"
    gethomepage.dev/description: "Dashboards & observability"
    gethomepage.dev/icon: "grafana.png"
    gethomepage.dev/group: "Core Platform"
    gethomepage.dev/pod-selector: ""
    gethomepage.dev/widget.type: "grafana"
    gethomepage.dev/widget.url: "http://grafana.monitoring.svc.cluster.local"
    gethomepage.dev/widget.username: "admin"
    gethomepage.dev/widget.password: "${grafana_admin_password}"
  tls:
    - secretName: "tls-secret"
      hosts:
        - "grafana.viktorbarzin.me"
  hosts:
    - "grafana.viktorbarzin.me"
sidecar:
  datasources:
    enabled: true
  dashboards:
    enabled: true
    label: "grafana_dashboard"
dashboardProviders:
  dashboardproviders.yaml:
    apiVersion: 1
    # Fixed: Grafana's dashboard provisioning file requires a "providers" list;
    # the previous flat form (with "ordId", a misspelling of "orgId") was not a
    # valid provider definition and would be ignored.
    providers:
      - name: default
        orgId: 1
        type: file
        options:
          path: "/var/lib/grafana/dashboards/default"
envFromSecrets:
  # Supplies GF_DATABASE_PASSWORD from the Vault-rotated ExternalSecret.
  - name: grafana-db-creds
    optional: false
env:
  GF_SERVER_ROOT_URL: https://grafana.viktorbarzin.me
grafana.ini:
  database:
    type: mysql
    host: ${mysql_host}:3306
    name: grafana
    user: grafana
    # Resolved from the environment at runtime, not baked into the config file.
    password: $__env{GF_DATABASE_PASSWORD}
    ssl_mode: disable
  auth.anonymous:
    enabled: true
    org_role: Viewer
  # auth.google:
  #   enabled: true
  analytics:
    check_for_updates: "true"
    grafana_net:
      url: "https://grafana.net"
  log:
    mode: "console"
  paths:
    data: "/var/lib/grafana/data"
    logs: "/var/log/grafana"
    plugins: "/var/lib/grafana/plugins"
    provisioning: "/etc/grafana/provisioning"
  security:
    allow_embedding: true # Allow to be iframed
# url: https://grafana.com/api/dashboards/11074/revisions/2/download
# datasources:
#   - name: Prometheus
#     url: http://prometheus-server
View file

@ -0,0 +1,130 @@
# Config file for the iDRAC Redfish exporter. NOTE(review): the iDRAC
# credentials are interpolated into a ConfigMap in plain text — consider
# moving them to a Secret mounted the same way.
resource "kubernetes_config_map" "redfish-config" {
  metadata {
    name      = "redfish-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      # Opt in to Stakater Reloader so deployments referencing this ConfigMap
      # restart when its contents change.
      "reloader.stakater.com/match" = "true"
    }
  }
  data = {
    "config.yml" = <<-EOF
    address: 0.0.0.0
    port: 9610
    hosts:
      ${var.idrac_host}:
        username: ${var.idrac_username}
        password: ${var.idrac_password}
      default:
        username: root
        password: calvin
    metrics:
      all: true
      # system: true
      # sensors: true
      # power: true
      # sel: false # Disable SEL - often slow
      # storage: true # Disable storage - slowest endpoint
      # memory: true
      # network: false # Disable network adapters
      # firmware: false # Don't need this frequently
    EOF
  }
}
# iDRAC Redfish exporter: polls the server's iDRAC over the Redfish API and
# serves metrics on container port 9610 (config from the ConfigMap above).
resource "kubernetes_deployment" "idrac-redfish" {
  metadata {
    name      = "idrac-redfish-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "idrac-redfish-exporter"
      tier = var.tier
    }
    annotations = {
      # Stakater Reloader: restart this deployment when a matched ConfigMap changes.
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "idrac-redfish-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "idrac-redfish-exporter"
        }
      }
      spec {
        priority_class_name = "tier-1-cluster"
        container {
          # https://github.com/mrlhansen/idrac_exporter?tab=readme-ov-file
          # Pinned tag Kyverno policy sets imagePullPolicy: IfNotPresent
          image = "ghcr.io/mrlhansen/idrac_exporter:2.4.1"
          name  = "redfish-exporter"
          port {
            container_port = 9610
          }
          volume_mount {
            # Mount just config.yml at the exporter's expected config path.
            name       = "redfish-exporter-config"
            mount_path = "/etc/prometheus/idrac.yml"
            sub_path   = "config.yml"
          }
        }
        volume {
          name = "redfish-exporter-config"
          config_map {
            name = "redfish-exporter-config"
          }
        }
        dns_config {
          # Lower ndots so short LAN hostnames resolve without k8s search-domain expansion.
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# Service fronting the exporter: cluster port 9090 -> container port 9610.
resource "kubernetes_service" "idrac-redfish-exporter" {
  metadata {
    name      = "idrac-redfish-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      "app" = "idrac-redfish-exporter"
    }
    # Scrape annotations kept for reference; currently disabled.
    # annotations = {
    #   "prometheus.io/scrape" = "true"
    #   "prometheus.io/path"   = "/metrics"
    #   "prometheus.io/port"   = "9090"
    # }
  }
  spec {
    selector = {
      "app" = "idrac-redfish-exporter"
    }
    port {
      name        = "http"
      port        = "9090"
      target_port = "9610"
    }
  }
}
# LAN-only ingress for the exporter (presumably served at
# idrac-redfish-exporter.viktorbarzin.lan — verify against the ingress_factory module).
module "idrac-redfish-exporter-ingress" {
  source                  = "../../../../modules/kubernetes/ingress_factory"
  namespace               = kubernetes_namespace.monitoring.metadata[0].name
  name                    = "idrac-redfish-exporter"
  root_domain             = "viktorbarzin.lan"
  tls_secret_name         = var.tls_secret_name
  allow_local_access_only = true
  ssl_redirect            = false # plain HTTP allowed on the LAN
  port                    = 9090  # service port (maps to container port 9610)
}

View file

@ -0,0 +1,78 @@
---
# NOTE(review): these look like values for the Grafana k8s-monitoring chart,
# shipping pod logs to Loki via Alloy — confirm against the helm_release that
# consumes this file (currently commented out while Loki is disabled).
cluster:
  name: default
destinations:
  - name: loki
    type: loki
    url: http://loki-gateway.monitoring.svc.cluster.local/loki/api/v1/push
clusterEvents:
  enabled: false # event collection off; namespace list kept for easy re-enabling
  collector: alloy-logs
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - calibre
    - monitoring
    - ingress-nginx
    - vaultwarden
nodeLogs:
  enabled: false
podLogs:
  enabled: true
  # Fetch logs through the Kubernetes API rather than mounting node log dirs.
  gatherMethod: kubernetesApi
  collector: alloy-logs
  # Only these labels are kept on log streams; all other labels are dropped.
  labelsToKeep:
    [
      "app_kubernetes_io_name",
      "container",
      "instance",
      "job",
      "level",
      "namespace",
      "service_name",
      "service_namespace",
      "deployment_environment",
      "deployment_environment_name",
    ]
  structuredMetadata:
    pod: pod # Set structured metadata "pod" from label "pod"
  # Collect logs only from these namespaces.
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - calibre
    - monitoring
    - ingress-nginx
    - vaultwarden
# Collectors
alloy-singleton:
  enabled: false
alloy-metrics:
  enabled: false
alloy-logs:
  enabled: true
  # Required when using the Kubernetes API to collect pod logs
  alloy:
    mounts:
      varlog: false # no hostPath /var/log mount needed with kubernetesApi gathering
    clustering:
      enabled: true
alloy-profiles:
  enabled: false
alloy-receiver:
  enabled: false

View file

@ -0,0 +1,220 @@
# NFS server host/IP backing this stack's persistent volumes (e.g. the Alertmanager PV).
variable "nfs_server" { type = string }
# LOKI DISABLED - Uncomment to re-enable centralized logging
# Disabled due to operational overhead vs benefit analysis after node2 incident
# All configuration preserved in loki.yaml for future re-enabling
/*
resource "helm_release" "loki" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "loki"
repository = "https://grafana.github.io/helm-charts"
chart = "loki"
values = [templatefile("${path.module}/loki.yaml", {})]
timeout = 600
depends_on = [kubernetes_config_map.loki_alert_rules]
}
*/
# ALLOY DISABLED - Log collection agents (depends on Loki)
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# Configuration preserved in alloy.yaml for future re-enabling
/*
resource "helm_release" "alloy" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "alloy"
repository = "https://grafana.github.io/helm-charts"
chart = "alloy"
values = [file("${path.module}/alloy.yaml")]
atomic = true
depends_on = [helm_release.loki]
}
*/
# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
metadata {
name = "sysctl-inotify"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
app = "sysctl-inotify"
}
}
spec {
selector {
match_labels = {
app = "sysctl-inotify"
}
}
template {
metadata {
labels = {
app = "sysctl-inotify"
}
}
spec {
init_container {
name = "sysctl"
image = "busybox:1.37"
command = [
"sh", "-c",
"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
]
security_context {
privileged = true
}
}
container {
name = "pause"
image = "registry.k8s.io/pause:3.10"
resources {
requests = {
cpu = "1m"
memory = "4Mi"
}
limits = {
cpu = "1m"
memory = "4Mi"
}
}
}
host_pid = true
toleration {
operator = "Exists"
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
*/
# resource "helm_release" "k8s-monitoring" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "k8s-monitoring"
# repository = "https://grafana.github.io/helm-charts"
# chart = "k8s-monitoring"
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
# atomic = true
# }
# LOKI ALERT RULES DISABLED - Depend on Loki log queries
# These alert on kernel events from systemd journal logs via Loki
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "loki_alert_rules" {
metadata {
name = "loki-alert-rules"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"rules.yaml" = yamlencode({
groups = [
{
name = "Node Health"
rules = [
{
alert = "KernelOOMKiller"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOM killer active on {{ $labels.node }}"
}
},
{
alert = "KernelPanic"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Kernel panic on {{ $labels.node }}"
}
},
{
alert = "KernelHungTask"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
for = "0m"
labels = {
severity = "warning"
}
annotations = {
summary = "Hung task detected on {{ $labels.node }}"
}
},
{
alert = "KernelSoftLockup"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Soft lockup on {{ $labels.node }}"
}
},
{
alert = "ContainerdDown"
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "containerd service unhealthy on {{ $labels.node }}"
}
},
]
}
]
})
}
}
*/
# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "grafana_loki_datasource" {
metadata {
name = "grafana-loki-datasource"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_datasource = "1"
}
}
data = {
"loki-datasource.yaml" = yamlencode({
apiVersion = 1
datasources = [{
name = "Loki"
type = "loki"
access = "proxy"
url = "http://loki.monitoring.svc.cluster.local:3100"
isDefault = false
}]
})
}
}
*/

View file

@ -0,0 +1,109 @@
# Loki Helm chart values: single-binary deployment backed by filesystem storage
# on an iSCSI PVC, with 30-day retention and a local ruler that fires alerts to
# the in-cluster Alertmanager. All scale-out components are zeroed out.
loki:
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: "2025-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_idle_period: 12h
    max_chunk_age: 24h
    chunk_retain_period: 1m
    chunk_target_size: 1572864
    wal:
      dir: /loki-wal
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    # 720h = 30 days of log retention.
    retention_period: 720h
  compactor:
    retention_enabled: true
    working_directory: /var/loki/compactor
    compaction_interval: 1h
    delete_request_store: filesystem
  ruler:
    enable_api: true
    storage:
      type: local
      local:
        directory: /var/loki/rules
    alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
    ring:
      kvstore:
        store: inmemory
    rule_path: /var/loki/scratch
  storage:
    type: "filesystem"
  auth_enabled: false
minio:
  enabled: false
deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 50Gi
    storageClass: "iscsi-truenas"
  extraVolumes:
    # In-memory WAL volume; contents are lost on pod restart.
    - name: wal
      emptyDir:
        medium: Memory
        sizeLimit: 2Gi
    - name: rules
      configMap:
        name: loki-alert-rules
  extraVolumeMounts:
    - name: wal
      mountPath: /loki-wal
    # Rules are mounted under the "fake" tenant directory (auth is disabled).
    - name: rules
      mountPath: /var/loki/rules/fake
  resources:
    requests:
      cpu: 250m
      memory: 2Gi
    limits:
      memory: 4Gi
# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
# Disable optional components for single binary mode
gateway:
  enabled: false
chunksCache:
  enabled: false
resultsCache:
  enabled: false

View file

@ -0,0 +1,214 @@
# --- Module inputs -----------------------------------------------------------

# Name of the wildcard TLS secret replicated into the monitoring namespace.
variable "tls_secret_name" {}
# SMTP account password used by Alertmanager mail notifications.
variable "alertmanager_account_password" {}
# iDRAC (Dell BMC) connection details; defaults are the factory credentials.
variable "idrac_host" {
  default = "192.168.1.4"
}
variable "idrac_username" {
  default = "root"
}
variable "idrac_password" {
  default   = "calvin"
  sensitive = true
}
variable "alertmanager_slack_api_url" {}
variable "tiny_tuya_service_secret" {
  type      = string
  sensitive = true
}
# Passed to the Prometheus chart values template (see helm_release.prometheus).
variable "haos_api_token" {
  type      = string
  sensitive = true
}
# Proxmox root password consumed by the pve-exporter Secret.
variable "pve_password" {
  type      = string
  sensitive = true
}
variable "grafana_admin_password" {
  type      = string
  sensitive = true
}
# Scheduling-tier label applied to the namespace and workloads.
variable "tier" { type = string }
variable "mysql_host" { type = string }

# Dedicated namespace for the monitoring stack. Istio sidecar injection is
# disabled; the custom-quota label opts this namespace out of the default tier
# quota so the larger monitoring-specific ResourceQuota can apply instead.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"
    labels = {
      "istio-injection" : "disabled"
      tier                               = var.tier
      "resource-governance/custom-quota" = "true"
    }
  }
}

# Replicate the shared wildcard TLS certificate into this namespace.
module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  tls_secret_name = var.tls_secret_name
}
# Terraform gets angry with the 30k-line values file :/ use ansible until solved
# resource "helm_release" "ups_prometheus_snmp_exporter" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "ups_prometheus_exporter"
# repository = "https://prometheus-community.github.io/helm-charts"
# chart = "prometheus-snmp-exporter"
# values = [file("${path.module}/ups_snmp_values.yaml")]
# }
# Watchdog CronJob: every 30 minutes curl the in-cluster Prometheus service and
# fire a webhook notification if it is unreachable.
# NOTE(review): metadata sets no namespace, so this CronJob lands in "default"
# rather than "monitoring" — confirm that is intentional.
resource "kubernetes_cron_job_v1" "monitor_prom" {
  metadata {
    name = "monitor-prometheus"
  }
  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "*/30 * * * *"
    job_template {
      metadata {
      }
      spec {
        template {
          metadata {
          }
          spec {
            container {
              name  = "monitor-prometheus"
              image = "alpine"
              # Best-effort probe: on curl failure, notify via the webhook relay.
              command = ["/bin/sh", "-c", "apk add --update curl && curl --connect-timeout 2 prometheus-server.monitoring.svc.cluster.local || curl https://webhook.viktorbarzin.me/fb/message-viktor -d 'Prometheus is down!'"]
            }
          }
        }
      }
    }
  }
}
# Traefik middleware that permanently redirects any request to the public
# HetrixTools status page.
resource "kubernetes_manifest" "status_redirect_middleware" {
  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"
    metadata = {
      name      = "status-redirect"
      namespace = kubernetes_namespace.monitoring.metadata[0].name
    }
    spec = {
      redirectRegex = {
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/38981b548b5d38b052aca8d01285a3f3/"
        permanent   = true
      }
    }
  }
}

# Ingress for status.viktorbarzin.me; requests never reach a real backend
# because the redirect middleware answers first (hence the "not-used" service).
resource "kubernetes_ingress_v1" "status" {
  metadata {
    name      = "hetrix-redirect-ingress"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      "traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-status-redirect@kubernetescrd"
      "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
    }
  }
  spec {
    ingress_class_name = "traefik"
    tls {
      hosts       = ["status.viktorbarzin.me"]
      secret_name = var.tls_secret_name
    }
    rule {
      host = "status.viktorbarzin.me"
      http {
        path {
          path = "/"
          backend {
            service {
              name = "not-used"
              port {
                number = 80 # redirected by middleware
              }
            }
          }
        }
      }
    }
  }
}
# Same pattern as status-redirect above, for the yotovski HetrixTools report.
resource "kubernetes_manifest" "yotovski_redirect_middleware" {
  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"
    metadata = {
      name      = "yotovski-redirect"
      namespace = kubernetes_namespace.monitoring.metadata[0].name
    }
    spec = {
      redirectRegex = {
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/2ba9d7a5e017794db0fd91f0115a8b3b/"
        permanent   = true
      }
    }
  }
}

# Ingress for yotovski-status.viktorbarzin.me; the backend is a placeholder —
# the middleware issues the redirect before any service is contacted.
resource "kubernetes_ingress_v1" "status_yotovski" {
  metadata {
    name      = "hetrix-yotovski-redirect-ingress"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      "traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-yotovski-redirect@kubernetescrd"
      "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
    }
  }
  spec {
    ingress_class_name = "traefik"
    tls {
      hosts       = ["yotovski-status.viktorbarzin.me"]
      secret_name = var.tls_secret_name
    }
    rule {
      host = "yotovski-status.viktorbarzin.me"
      http {
        path {
          path = "/"
          backend {
            service {
              name = "not-used" # redirected by middleware
              port {
                number = 80
              }
            }
          }
        }
      }
    }
  }
}
# Custom ResourceQuota for monitoring larger than the default 1-cluster tier quota
# because monitoring runs 29+ pods (Prometheus, Grafana, Loki, Alloy, exporters, etc.)
# Applies because the namespace carries the resource-governance/custom-quota label.
resource "kubernetes_resource_quota" "monitoring" {
  metadata {
    name      = "monitoring-quota"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    hard = {
      "requests.cpu"    = "16"
      "requests.memory" = "16Gi"
      "limits.memory"   = "64Gi"
      pods              = "100"
    }
  }
}

View file

@ -0,0 +1,31 @@
# Persistent volume for Prometheus TSDB data, backed by the TrueNAS iSCSI class.
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    name      = "prometheus-data"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    access_modes       = ["ReadWriteOnce"]
    storage_class_name = "iscsi-truenas"
    resources {
      requests = {
        storage = "200Gi"
      }
    }
  }
}

# Prometheus Helm release. Chart values are rendered from a template that
# receives the Alertmanager mail/Slack credentials plus Tuya and HAOS tokens.
resource "helm_release" "prometheus" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true
  name             = "prometheus"
  repository       = "https://prometheus-community.github.io/helm-charts"
  chart            = "prometheus"
  # version = "15.0.2"
  version = "25.8.2"
  values  = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })]
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,122 @@
# Credentials for prometheus-pve-exporter, rendered as pve.yml into a Secret.
# NOTE(review): the password is interpolated into YAML unquoted — a value
# containing ':' or '#' would break parsing; consider quoting or yamlencode().
resource "kubernetes_secret" "pve_exporter_config" {
  metadata {
    name      = "pve-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  data = {
    "pve.yml" = <<-EOF
    default:
      user: "root@pam"
      password: ${var.pve_password}
      verify_ssl: false
      timeout: 30
    EOF
  }
}

# Proxmox VE exporter deployment; reads pve.yml from the Secret above.
resource "kubernetes_deployment" "pve_exporter" {
  metadata {
    name      = "proxmox-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      tier = var.tier
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "proxmox-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "proxmox-exporter"
        }
      }
      spec {
        container {
          name  = "proxmox-exporter"
          image = "prompve/prometheus-pve-exporter:latest"
          port {
            container_port = 9221
          }
          resources {
            requests = {
              cpu    = "15m"
              memory = "256Mi"
            }
            limits = {
              memory = "256Mi"
            }
          }
          # Mount the file into the container
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/prometheus"
            read_only  = true
          }
        }
        volume {
          name = "config-volume"
          secret {
            secret_name = kubernetes_secret.pve_exporter_config.metadata[0].name
            items {
              key  = "pve.yml"
              path = "pve.yml" # This results in /etc/prometheus/pve.yml
            }
          }
        }
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}

# Service with Prometheus scrape annotations: scrape /pve on :9221, passing the
# target PVE host and node/cluster flags as query parameters.
resource "kubernetes_service" "proxmox-exporter" {
  metadata {
    name      = "proxmox-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      "app" = "proxmox-exporter"
    }
    annotations = {
      "prometheus.io/scrape"        = "true"
      "prometheus.io/port"          = 9221
      "prometheus.io/path"          = "/pve"
      "prometheus.io/param_target"  = "192.168.1.127"
      "prometheus.io/param_node"    = "1"
      "prometheus.io/param_cluster" = "1"
    }
  }
  spec {
    selector = {
      "app" = "proxmox-exporter"
    }
    port {
      name        = "http"
      port        = 9221
      target_port = 9221
    }
  }
}
# To monitor the pve node, use the node exporter and the playbook in this repo. from the root run:
# ansible-playbook -i ./playbooks/inventory.ini ./playbooks/deploy_node_exporter.yaml
# This installs the exporter binary

View file

@ -0,0 +1,51 @@
import asyncio
import logging
import os
import signal
import sys
import time
import aiohttp
# Hostname of the iDRAC BMC. NOTE(review): not referenced by the current stub
# implementation — presumably intended for the unfinished monitor() loop.
iDRAC_HOST = 'idrac'
# Names of the environment variables holding the iDRAC credentials.
iDRAC_USER_ENV_VAR = 'idrac_user'
iDRAC_PASSWORD_ENV_VAR = 'idrac_password'
# Global run flag; cleared by the SIGINT handler to stop the monitor loop.
SHOULD_RUN = True
def signal_handler(sig, frame):
    """Handle SIGINT: clear the global run flag, then exit the process.

    Sleeps 60 seconds before exiting — presumably a grace period for the
    monitor loop to observe SHOULD_RUN and wind down; TODO confirm the
    fixed delay is intentional.
    """
    logging.warning(f'signal {sig} received. shutting down gracefully...')
    global SHOULD_RUN
    SHOULD_RUN = False
    time.sleep(60)
    sys.exit(0)
async def main() -> None:
    """Read iDRAC credentials from the environment and start the monitor loop.

    Logs a critical message and returns early if either credential
    environment variable is missing.
    """
    # define signal handlers
    signal.signal(signal.SIGINT, signal_handler)
    user = os.environ.get(iDRAC_USER_ENV_VAR)
    if user is None:
        logging.critical('missing environment variable for idrac user'
                         f' please set {iDRAC_USER_ENV_VAR}')
        return
    password = os.environ.get(iDRAC_PASSWORD_ENV_VAR)
    if password is None:
        logging.critical('missing environment variable for idrac password'
                         f' please set {iDRAC_PASSWORD_ENV_VAR}')
        return
    logging.info('service initiated with credentials')
    return await monitor(user, password)
async def monitor(user: str, password: str) -> None:
    """Stub monitoring loop; runs until the SIGINT handler clears SHOULD_RUN.

    The iDRAC polling logic was never implemented. The credentials are
    accepted for the intended implementation but are currently unused.
    """
    while SHOULD_RUN:
        # Yield to the event loop instead of busy-spinning: the original bare
        # `pass` body pegged a CPU core and never awaited, starving every
        # other task on the asyncio event loop.
        await asyncio.sleep(1)
if __name__ == '__main__':
    # Abandoned: the server cannot power itself back on once it is off, so
    # this watchdog was never finished.
    asyncio.run(main())

View file

@ -0,0 +1,66 @@
#!/bin/sh
# Power-cycle watchdog for the Dell server, driven via the iDRAC Redfish API.
#
#  * If a previous run shut the server down (state.off marker present), poll
#    until the iDRAC answers again, then send power-on and clear the marker.
#  * Otherwise read the input voltage of PSU slot 2; if mains power is gone,
#    re-check every minute for $to_wait minutes and, if it never returns,
#    gracefully shut the server down and leave the state.off marker.
#
# A lock file prevents overlapping runs (the script can sleep for a long time).
# Fixes over the previous revision: bash-only [[ ]] replaced with POSIX [ ]
# (this runs under /bin/sh), and a null LineInputVoltage from jq now defaults
# to 0 instead of breaking the numeric comparison.

tag=server-power-cycle-script
cred=root:calvin
psu_url='https://192.168.1.4/redfish/v1/Chassis/System.Embedded.1/Power/PowerSupplies/PSU.Slot.2'
reset_url='https://192.168.1.4/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset'
lock=/tmp/server-power-cycle-lock
state_off=/root/server-power-cycle/state.off

# Print the PSU input voltage; prints 0 when the field is missing or null.
read_voltage() {
    curl -s -k -u "$cred" -H"Content-type: application/json" -X GET "$psu_url" | jq '.LineInputVoltage // 0'
}

# Release the lock, log the end marker and exit successfully.
finish() {
    logger -t $tag end $(date '+%F-%R')
    rm "$lock"
    exit 0
}

logger -t $tag start $(date '+%F-%R')

if [ -f "$lock" ]; then
    logger -t $tag 'Script already running. exiting'
    exit 0
fi
touch "$lock"

if [ -f "$state_off" ]; then
    logger -t $tag 'Server state set to off'
    while true; do
        sleep 60 # sleep 1 minute
        logger -t $tag 'Trying to connect to idrac system...'
        # iDRAC reachability implies mains power is back.
        if curl --connect-timeout 5 -s -k -u "$cred" -H"Content-type: application/json" -X GET "$psu_url"; then
            logger -t $tag "Connected to idrac, assuming power is back on"
            logger -t $tag "Power supply restored, sending power on command"
            curl -s -k -u "$cred" -X POST -d '{"Action": "Reset", "ResetType": "On"}' -H"Content-type: application/json" "$reset_url"
            rm "$state_off"
            finish
        fi
    done
fi

# Check input voltage on the power supply connected to the outer system.
voltage=$(read_voltage)
if [ "$voltage" -gt 0 ]; then
    logger -t $tag "power supply is on. exiting"
    finish
fi

to_wait=30
echo "Continuously checking power supply for the next $to_wait minutes"
for i in $(seq "$to_wait"); do
    logger -t $tag "Sleeping a minute..Minute $i"
    sleep 60
    voltage=$(read_voltage)
    if [ "$voltage" -gt 0 ]; then
        logger -t $tag "power supply is on. exiting"
        finish
    fi
done

logger -t $tag "Power supply did not come back, sending graceful shutdown signal"
curl -s -k -u "$cred" -X POST -d '{"Action": "Reset", "ResetType": "GracefulShutdown"}' -H"Content-type: application/json" "$reset_url"
touch "$state_off"
rm "$lock"
logger -t $tag end $(date '+%F-%R')

View file

@ -0,0 +1,130 @@
/**
1. clone snmp exporter
2. update generator.yaml to include only interesting modules
3. make generate
4. cp snmp.yml to whereever is used
5. scrape service with curl 'http://snmp-exporter.monitoring.svc.cluster.local:9116/snmp?auth=public_v2&module=huawei&target=192.168.1.5%3A161'
generate reference - https://github.com/prometheus/snmp_exporter/tree/main/generator
https://sbcode.net/prometheus/snmp-generate-huawei/
*/
# Generated snmp.yml (see generator notes in the header comment above) shipped
# to the pod via ConfigMap; Reloader restarts consumers when it changes.
resource "kubernetes_config_map" "snmp-exporter-yaml" {
  metadata {
    name      = "snmp-exporter-yaml"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }
  data = {
    "snmp.yml" = file("${path.module}/ups_snmp_values.yaml")
  }
}

# Prometheus SNMP exporter: proxies SNMP queries to devices (e.g. the UPS).
resource "kubernetes_deployment" "snmp-exporter" {
  metadata {
    name      = "snmp-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "snmp-exporter"
      tier = var.tier
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "snmp-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "snmp-exporter"
        }
      }
      spec {
        container {
          image = "prom/snmp-exporter"
          name  = "snmp-exporter"
          # command = ["/usr/local/bin/redfish_exporter", "--config.file", "/app/config.yml"]
          resources {
            requests = {
              cpu    = "10m"
              memory = "256Mi"
            }
            limits = {
              memory = "256Mi"
            }
          }
          port {
            container_port = 9116
          }
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/snmp_exporter/"
          }
        }
        volume {
          name = "config-volume"
          config_map {
            name = "snmp-exporter-yaml"
          }
        }
        # Lower ndots so external lookups skip the k8s search-domain expansion.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}

resource "kubernetes_service" "snmp-exporter" {
  metadata {
    name      = "snmp-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      "app" = "snmp-exporter"
    }
    # annotations = {
    #   "prometheus.io/scrape" = "true"
    #   "prometheus.io/path"   = "/snmp?auth=Public0&target=tcp%3A%2F%2F192.%3A161"
    #   "prometheus.io/port"   = "9116"
    # }
  }
  spec {
    selector = {
      "app" = "snmp-exporter"
    }
    port {
      name        = "http"
      port        = "9116"
      target_port = "9116"
    }
  }
}

# LAN-only ingress at snmp-exporter.viktorbarzin.lan (plain HTTP permitted).
module "snmp-exporter-ingress" {
  source                  = "../../../../modules/kubernetes/ingress_factory"
  namespace               = kubernetes_namespace.monitoring.metadata[0].name
  name                    = "snmp-exporter"
  root_domain             = "viktorbarzin.lan"
  tls_secret_name         = var.tls_secret_name
  allow_local_access_only = true
  ssl_redirect            = false
  port                    = 9116
}

File diff suppressed because it is too large Load diff

1
stacks/monitoring/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Inherit remote-state and provider configuration from the repo-root
# terragrunt file.
include "root" {
  path = find_in_parent_folders()
}

# Ordering-only dependency: apply the shared infra stack first; none of its
# outputs are consumed here.
dependency "infra" {
  config_path  = "../infra"
  skip_outputs = true
}

View file

@ -0,0 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
locals {
  # Scheduling tier label values; the numeric prefix encodes ordering.
  tiers = {
    core    = "0-core"
    cluster = "1-cluster"
    gpu     = "2-gpu"
    edge    = "3-edge"
    aux     = "4-aux"
  }
}

11
stacks/nvidia/main.tf Normal file
View file

@ -0,0 +1,11 @@
# =============================================================================
# NVIDIA Stack GPU device plugin
# =============================================================================

# Name of the wildcard TLS secret passed through to the module.
variable "tls_secret_name" { type = string }

# Single entry point of this stack: the nvidia module, pinned to the GPU tier.
module "nvidia" {
  source          = "./modules/nvidia"
  tls_secret_name = var.tls_secret_name
  tier            = local.tiers.gpu
}

View file

@ -0,0 +1,27 @@
# GPU container image used to run audiblez (ebook-to-audiobook TTS).
FROM ubuntu

# Non-interactive apt so builds never block on prompts.
ENV DEBIAN_FRONTEND=noninteractive

# Install Python tooling and the runtime dependencies (ffmpeg for audio,
# espeak-ng for TTS) in a single layer, then drop the apt lists. The previous
# revision ran a second `apt-get install` in its own layer without a preceding
# `apt-get update`, which fails once the cached package index goes stale.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        python3-venv && \
    apt-get install -y ffmpeg espeak-ng && \
    rm -rf /var/lib/apt/lists/*

# Set a working directory
WORKDIR /app

# Install audiblez into an isolated virtualenv; skip the pip cache to keep the
# layer small.
RUN python3 -m venv audiblez && ./audiblez/bin/pip install --no-cache-dir audiblez

# Keep the container alive so workloads can exec into it.
CMD ["/usr/bin/sleep", "86400"]

View file

@ -0,0 +1,688 @@
# Module inputs: wildcard TLS secret name and scheduling-tier label.
variable "tls_secret_name" {}
variable "tier" { type = string }

# Replicate the wildcard TLS certificate into the nvidia namespace.
module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.nvidia.metadata[0].name
  tls_secret_name = var.tls_secret_name
}

# Namespace for all GPU workloads; the custom-quota label opts it out of the
# default tier quota so the dedicated quota below applies.
resource "kubernetes_namespace" "nvidia" {
  metadata {
    name = "nvidia"
    labels = {
      "istio-injection" : "disabled"
      tier                               = var.tier
      "resource-governance/custom-quota" = "true"
    }
  }
}

resource "kubernetes_resource_quota" "nvidia_quota" {
  metadata {
    name      = "tier-quota"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
  spec {
    hard = {
      "limits.memory"   = "48Gi"
      "requests.cpu"    = "8"
      "requests.memory" = "12Gi"
      pods              = "40"
    }
  }
}

# Apply GPU taint and label to ensure only GPU workloads run on GPU node
resource "null_resource" "gpu_node_config" {
  # PreferNoSchedule is a soft taint: non-GPU pods avoid the node but can
  # still be scheduled there if nothing else fits.
  provisioner "local-exec" {
    command = <<-EOT
      kubectl taint nodes k8s-node1 nvidia.com/gpu=true:PreferNoSchedule --overwrite
      kubectl label nodes k8s-node1 gpu=true --overwrite
    EOT
  }
  # Re-run if namespace changes (proxy for cluster changes)
  triggers = {
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
}
# [not needed anymore; part of the chart values] Apply to operator with:
# kubectl patch clusterpolicies.nvidia.com/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
# Time-slicing config consumed by the operator's device plugin: each physical
# GPU is advertised as 100 shareable nvidia.com/gpu replicas.
resource "kubernetes_config_map" "time_slicing_config" {
  metadata {
    name      = "time-slicing-config"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
  data = {
    any = <<-EOF
    flags:
      migStrategy: none
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: false
        resources:
        - name: nvidia.com/gpu
          replicas: 100
    EOF
  }
  depends_on = [kubernetes_namespace.nvidia]
}

# NVIDIA GPU operator Helm release.
resource "helm_release" "nvidia-gpu-operator" {
  namespace  = kubernetes_namespace.nvidia.metadata[0].name
  name       = "nvidia-gpu-operator"
  repository = "https://helm.ngc.nvidia.com/nvidia"
  chart      = "gpu-operator"
  atomic     = true
  # version = "0.9.3"
  # Generous timeout: the operator's first install can take a long time.
  timeout = 6000
  values  = [templatefile("${path.module}/values.yaml", {})]
  depends_on = [kubernetes_config_map.time_slicing_config]
}
# DCGM exporter: exposes GPU telemetry on :9400, pinned to the GPU node.
resource "kubernetes_deployment" "nvidia-exporter" {
  metadata {
    name      = "nvidia-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app  = "nvidia-exporter"
      tier = var.tier
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "nvidia-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "nvidia-exporter"
        }
      }
      spec {
        node_selector = {
          "gpu" : "true"
        }
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        container {
          image = "nvidia/dcgm-exporter:latest"
          name  = "nvidia-exporter"
          port {
            container_port = 9400
          }
          # Privileged + SYS_ADMIN so the exporter can query the GPU devices.
          security_context {
            privileged = true
            capabilities {
              add = ["SYS_ADMIN"]
            }
          }
          resources {
            requests = {
              memory = "192Mi"
            }
            limits = {
              memory           = "192Mi"
              "nvidia.com/gpu" = "1" # one time-sliced replica
            }
          }
        }
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
  depends_on = [helm_release.nvidia-gpu-operator]
}

# ClusterIP service fronting the exporter (port 80 -> 9400).
resource "kubernetes_service" "nvidia-exporter" {
  metadata {
    name      = "nvidia-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      "app" = "nvidia-exporter"
    }
  }
  spec {
    selector = {
      app = "nvidia-exporter"
    }
    port {
      name        = "http"
      port        = 80
      target_port = 9400
    }
  }
}

# LAN-only ingress at nvidia-exporter.viktorbarzin.lan (plain HTTP permitted).
module "ingress" {
  source                  = "../../../../modules/kubernetes/ingress_factory"
  namespace               = kubernetes_namespace.nvidia.metadata[0].name
  name                    = "nvidia-exporter"
  root_domain             = "viktorbarzin.lan"
  tls_secret_name         = var.tls_secret_name
  allow_local_access_only = true
  ssl_redirect            = false
}
# resource "kubernetes_ingress_v1" "nvidia-exporter" {
# metadata {
# name = "nvidia-exporter"
# namespace = kubernetes_namespace.nvidia.metadata[0].name
# annotations = {
# "kubernetes.io/ingress.class" = "nginx"
# "nginx.ingress.kubernetes.io/whitelist-source-range" : "192.168.1.0/24, 10.0.0.0/8"
# "nginx.ingress.kubernetes.io/ssl-redirect" : "false" # used only in LAN
# }
# }
# spec {
# tls {
# hosts = ["nvidia-exporter.viktorbarzin.lan"]
# secret_name = var.tls_secret_name
# }
# rule {
# host = "nvidia-exporter.viktorbarzin.lan"
# http {
# path {
# backend {
# service {
# name = "nvidia-exporter"
# port {
# number = 80
# }
# }
# }
# }
# }
# }
# }
# }
# resource "kubernetes_deployment" "gpu-container" {
# metadata {
# name = "gpu-container"
# namespace = kubernetes_namespace.nvidia.metadata[0].name
# labels = {
# app = "gpu-container"
# }
# }
# spec {
# replicas = 1
# selector {
# match_labels = {
# app = "gpu-container"
# }
# }
# template {
# metadata {
# labels = {
# app = "gpu-container"
# }
# }
# spec {
# node_selector = {
# "gpu" : "true"
# }
# container {
# image = "ubuntu"
# name = "gpu-container"
# command = ["/usr/bin/sleep", "3600"]
# # security_context {
# # privileged = true
# # capabilities {
# # add = ["SYS_ADMIN"]
# # }
# # }
# resources {
# limits = {
# "nvidia.com/gpu" = "1"
# }
# }
# }
# }
# }
# }
# depends_on = [helm_release.nvidia-gpu-operator]
# }
# GPU Pod Memory Exporter - exposes per-pod GPU memory usage as Prometheus metrics
# The Python script below is mounted into the gpu-pod-exporter DaemonSet; it
# correlates nvidia-smi process PIDs with pod names via cgroups + the k8s API.
resource "kubernetes_config_map" "gpu_pod_exporter_script" {
  metadata {
    name      = "gpu-pod-exporter-script"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
  data = {
    "exporter.py" = <<-EOF
    #!/usr/bin/env python3
    """GPU Pod Memory Exporter - Collects per-pod GPU memory usage."""
    import subprocess
    import time
    import re
    import os
    import json
    import urllib.request
    import ssl
    from http.server import HTTPServer, BaseHTTPRequestHandler

    METRICS_PORT = 9401
    SCRAPE_INTERVAL = 15

    # Kubernetes API configuration
    K8S_API = "https://kubernetes.default.svc"
    TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    CA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"

    # Cache for container ID to pod info mapping
    container_cache = {}
    cache_refresh_time = 0
    CACHE_TTL = 60  # Refresh cache every 60 seconds

    def get_k8s_token():
        """Read Kubernetes service account token."""
        try:
            with open(TOKEN_PATH, 'r') as f:
                return f.read().strip()
        except:
            return None

    def refresh_container_cache():
        """Refresh the container ID to pod mapping from Kubernetes API."""
        global container_cache, cache_refresh_time
        token = get_k8s_token()
        if not token:
            return
        try:
            # Create SSL context with K8s CA
            ctx = ssl.create_default_context()
            if os.path.exists(CA_PATH):
                ctx.load_verify_locations(CA_PATH)
            # Get all pods on this node
            node_name = os.environ.get('NODE_NAME', '')
            url = f"{K8S_API}/api/v1/pods?fieldSelector=spec.nodeName={node_name}"
            req = urllib.request.Request(url, headers={
                'Authorization': f'Bearer {token}',
                'Accept': 'application/json'
            })
            with urllib.request.urlopen(req, context=ctx, timeout=10) as resp:
                data = json.loads(resp.read().decode())
            new_cache = {}
            for pod in data.get('items', []):
                pod_name = pod['metadata']['name']
                namespace = pod['metadata']['namespace']
                # Get container statuses
                for status in pod.get('status', {}).get('containerStatuses', []):
                    container_id = status.get('containerID', '')
                    # Extract the ID part (e.g., "containerd://abc123..." -> "abc123")
                    if '://' in container_id:
                        container_id = container_id.split('://')[-1]
                    if container_id:
                        short_id = container_id[:12]
                        new_cache[short_id] = {
                            'pod': pod_name,
                            'namespace': namespace,
                            'container': status.get('name', 'unknown')
                        }
            container_cache = new_cache
            cache_refresh_time = time.time()
            print(f"Refreshed container cache: {len(new_cache)} containers")
        except Exception as e:
            print(f"Error refreshing container cache: {e}")

    def get_pod_info(container_id):
        """Look up pod info for a container ID."""
        global cache_refresh_time
        # Refresh cache if stale
        if time.time() - cache_refresh_time > CACHE_TTL:
            refresh_container_cache()
        return container_cache.get(container_id, {
            'pod': 'unknown',
            'namespace': 'unknown',
            'container': 'unknown'
        })

    def get_gpu_processes():
        """Run nvidia-smi to get GPU process info."""
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-compute-apps=pid,used_memory,process_name", "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=10
            )
            if result.returncode != 0:
                print(f"nvidia-smi error: {result.stderr}")
                return []
            processes = []
            for line in result.stdout.strip().split('\n'):
                if not line.strip():
                    continue
                parts = [p.strip() for p in line.split(',')]
                if len(parts) >= 3:
                    pid, memory_mib, process_name = parts[0], parts[1], parts[2]
                    processes.append({
                        'pid': pid,
                        'memory_bytes': int(memory_mib) * 1024 * 1024,
                        'process_name': process_name
                    })
            return processes
        except Exception as e:
            print(f"Error running nvidia-smi: {e}")
            return []

    def get_container_id(pid):
        """Map PID to container ID via cgroup."""
        cgroup_path = f"/host_proc/{pid}/cgroup"
        try:
            with open(cgroup_path, 'r') as f:
                for line in f:
                    # Match container ID patterns (docker, containerd, cri-o)
                    match = re.search(r'[:/]([a-f0-9]{64})', line)
                    if match:
                        return match.group(1)[:12]
                    match = re.search(r'cri-containerd-([a-f0-9]{64})', line)
                    if match:
                        return match.group(1)[:12]
        except (FileNotFoundError, PermissionError):
            pass
        return "host"

    # Global metrics storage
    current_metrics = []

    def collect_metrics():
        """Collect GPU memory metrics."""
        global current_metrics
        metrics = []
        processes = get_gpu_processes()
        for proc in processes:
            container_id = get_container_id(proc['pid'])
            pod_info = get_pod_info(container_id)
            metrics.append({
                'container_id': container_id,
                'pid': proc['pid'],
                'process_name': proc['process_name'],
                'memory_bytes': proc['memory_bytes'],
                'pod': pod_info['pod'],
                'namespace': pod_info['namespace'],
                'container': pod_info['container']
            })
        current_metrics = metrics

    def format_metrics():
        """Format metrics in Prometheus exposition format."""
        lines = [
            "# HELP gpu_pod_memory_used_bytes GPU memory used by pod",
            "# TYPE gpu_pod_memory_used_bytes gauge"
        ]
        for m in current_metrics:
            labels = ','.join([
                f'namespace="{m["namespace"]}"',
                f'pod="{m["pod"]}"',
                f'container="{m["container"]}"',
                f'process_name="{m["process_name"]}"',
                f'pid="{m["pid"]}"'
            ])
            lines.append(f'gpu_pod_memory_used_bytes{{{labels}}} {m["memory_bytes"]}')
        return '\n'.join(lines) + '\n'

    class MetricsHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            if self.path == '/metrics':
                content = format_metrics()
                self.send_response(200)
                self.send_header('Content-Type', 'text/plain; charset=utf-8')
                self.end_headers()
                self.wfile.write(content.encode())
            elif self.path == '/health':
                self.send_response(200)
                self.end_headers()
                self.wfile.write(b'ok')
            else:
                self.send_response(404)
                self.end_headers()

        def log_message(self, format, *args):
            pass  # Suppress request logging

    def background_collector():
        """Background thread to collect metrics periodically."""
        import threading
        def run():
            while True:
                collect_metrics()
                time.sleep(SCRAPE_INTERVAL)
        thread = threading.Thread(target=run, daemon=True)
        thread.start()

    if __name__ == '__main__':
        print(f"Starting GPU Pod Memory Exporter on port {METRICS_PORT}")
        refresh_container_cache()  # Initial cache load
        collect_metrics()  # Initial collection
        background_collector()
        server = HTTPServer(('', METRICS_PORT), MetricsHandler)
        server.serve_forever()
    EOF
  }
}
# ServiceAccount + minimal RBAC for the exporter: it only needs to list pods
# (to map container IDs found in cgroups back to pod names).
resource "kubernetes_service_account" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
}

resource "kubernetes_cluster_role" "gpu_pod_exporter" {
  metadata {
    name = "gpu-pod-exporter"
  }
  rule {
    api_groups = [""]
    resources  = ["pods"]
    verbs      = ["list"]
  }
}

resource "kubernetes_cluster_role_binding" "gpu_pod_exporter" {
  metadata {
    name = "gpu-pod-exporter"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.gpu_pod_exporter.metadata[0].name
  }
  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.gpu_pod_exporter.metadata[0].name
    namespace = kubernetes_namespace.nvidia.metadata[0].name
  }
}
# DaemonSet running the Python GPU-pod-memory exporter on every GPU node.
# The exporter script is mounted from a ConfigMap and serves Prometheus
# metrics on port 9401.
resource "kubernetes_daemonset" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app  = "gpu-pod-exporter"
      tier = var.tier
    }
  }
  spec {
    selector {
      match_labels = {
        app = "gpu-pod-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "gpu-pod-exporter"
        }
      }
      spec {
        # host_pid lets the container see host process IDs; the host /proc is
        # also mounted read-only below at /host_proc.
        host_pid             = true
        service_account_name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name
        # Only schedule on nodes labelled gpu=true ...
        node_selector = {
          "gpu" : "true"
        }
        # ... and tolerate the matching NoSchedule taint on those nodes.
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        container {
          name    = "exporter"
          image   = "python:3.11-slim"
          command = ["/bin/bash", "-c"]
          args = [
            "python3 /scripts/exporter.py"
          ]
          # Expose the node name to the exporter via the downward API.
          env {
            name = "NODE_NAME"
            value_from {
              field_ref {
                field_path = "spec.nodeName"
              }
            }
          }
          port {
            container_port = 9401
            name           = "metrics"
          }
          # Exporter script delivered via ConfigMap (see volume below).
          volume_mount {
            name       = "scripts"
            mount_path = "/scripts"
            read_only  = true
          }
          volume_mount {
            name       = "host-proc"
            mount_path = "/host_proc"
            read_only  = true
          }
          resources {
            requests = {
              cpu    = "10m"
              memory = "128Mi"
            }
            limits = {
              memory = "128Mi"
              # NOTE(review): requests one nvidia.com/gpu — presumably so the
              # device plugin exposes GPU devices to the pod; confirm this is
              # required, since it consumes a (time-sliced) GPU slot.
              "nvidia.com/gpu" = "1"
            }
          }
          liveness_probe {
            http_get {
              path = "/health"
              port = 9401
            }
            initial_delay_seconds = 30
            period_seconds        = 30
            timeout_seconds       = 5
          }
        }
        volume {
          name = "scripts"
          config_map {
            name         = kubernetes_config_map.gpu_pod_exporter_script.metadata[0].name
            default_mode = "0755"
          }
        }
        volume {
          name = "host-proc"
          host_path {
            path = "/proc"
            type = "Directory"
          }
        }
        # Lower ndots so single-label external lookups avoid the cluster
        # search-domain expansion. NOTE(review): assumed intent — confirm.
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
  # GPU operator must be installed first (device plugin provides nvidia.com/gpu).
  depends_on = [helm_release.nvidia-gpu-operator]
}
# ClusterIP Service fronting the exporter pods: port 80 -> container 9401,
# so Prometheus can scrape via the service on the standard HTTP port.
resource "kubernetes_service" "gpu_pod_exporter" {
  metadata {
    name      = "gpu-pod-exporter"
    namespace = kubernetes_namespace.nvidia.metadata[0].name
    labels = {
      app = "gpu-pod-exporter"
    }
  }
  spec {
    selector = {
      app = "gpu-pod-exporter"
    }
    port {
      name        = "metrics"
      port        = 80
      target_port = 9401
    }
  }
}

View file

@ -0,0 +1,43 @@
# NVIDIA GPU Operator Helm values.
driver:
  enabled: true
  # repository: nvcr.io/nvidia/driver
  # choose a driver version compatible with your GPU + CUDA 12.x (example)
  # NVIDIA GPU driver - https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#known-issue
  # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/
  # 13.x >= 580
  # 12.x >= 525, <580
  # 11.x >= 450, <525
  #
  # Delete the cluster policy before each change
  # version: "575.57.08" # CUDA 12.9
  version: "570.195.03" # CUDA 12.8
  upgradePolicy:
    autoUpgrade: false
# Device plugin reads its sharing config (GPU time-slicing) from this ConfigMap.
devicePlugin:
  config:
    name: time-slicing-config
# DCGM Exporter - reduced from 2560Mi to 1536Mi based on VPA upper bound of 1459Mi (1.05x margin)
dcgmExporter:
  resources:
    requests:
      memory: "1536Mi"
    limits:
      memory: "1536Mi"
# CUDA Validator - reduced from 1024Mi to 256Mi (one-shot job)
validator:
  resources:
    requests:
      memory: "256Mi"
    limits:
      memory: "256Mi"
# Tolerate GPU node taint for all GPU operator components
daemonsets:
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Equal"
      value: "true"
      effect: "NoSchedule"

1
stacks/nvidia/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Inherit remote-state/provider config from the repo-root terragrunt.hcl.
include "root" {
  path = find_in_parent_folders()
}

# Ordering-only dependency on the infra stack; skip_outputs because this
# stack does not consume any of infra's outputs.
dependency "infra" {
  config_path  = "../infra"
  skip_outputs = true
}

10
stacks/nvidia/tiers.tf Normal file
View file

@ -0,0 +1,10 @@
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
# NOTE(review): generated file — do not hand-edit; maps tier names to their
# ordered scheduling/priority labels (0 = most foundational).
locals {
  tiers = {
    core    = "0-core"
    cluster = "1-cluster"
    gpu     = "2-gpu"
    edge    = "3-edge"
    aux     = "4-aux"
  }
}

View file

@ -7,13 +7,12 @@
# foundational infrastructure that application stacks depend on.
#
# Services included:
# metallb, cloudflared, infra-maintenance,
# redis, traefik, technitium, headscale, rbac, k8s-portal,
# monitoring, vaultwarden, reverse-proxy, metrics-server, vpa,
# nvidia, kyverno, uptime-kuma, wireguard, xray, mailserver
# metallb, infra-maintenance, redis, traefik, technitium, headscale,
# rbac, k8s-portal, vaultwarden, reverse-proxy, metrics-server, vpa,
# nfs-csi, iscsi-csi, cnpg, sealed-secrets, uptime-kuma, wireguard, xray
#
# Extracted to independent stacks:
# dbaas, authentik, crowdsec
# dbaas, authentik, crowdsec, monitoring, nvidia, mailserver, cloudflared, kyverno
# =============================================================================
# -----------------------------------------------------------------------------
@ -43,14 +42,6 @@ variable "ssh_private_key" {
default = ""
sensitive = true
}
variable "cloudflare_email" { type = string }
variable "cloudflare_account_id" { type = string }
variable "cloudflare_zone_id" { type = string }
variable "cloudflare_tunnel_id" { type = string }
variable "public_ip" { type = string }
variable "cloudflare_proxied_names" {}
variable "cloudflare_non_proxied_names" {}
variable "monitoring_idrac_username" { type = string }
# --- Vault KV secrets ---
data "vault_kv_secret_v2" "secrets" {
@ -63,16 +54,6 @@ locals {
k8s_users = jsondecode(data.vault_kv_secret_v2.secrets.data["k8s_users"])
xray_reality_clients = jsondecode(data.vault_kv_secret_v2.secrets.data["xray_reality_clients"])
xray_reality_short_ids = jsondecode(data.vault_kv_secret_v2.secrets.data["xray_reality_short_ids"])
mailserver_accounts = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_accounts"])
mailserver_aliases = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_aliases"])
mailserver_opendkim_key = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_opendkim_key"])
mailserver_sasl_passwd = jsondecode(data.vault_kv_secret_v2.secrets.data["mailserver_sasl_passwd"])
# User domains from namespace-owners for DNS/Cloudflare
user_domains = flatten([
for name, user in local.k8s_users : lookup(user, "domains", [])
if user.role == "namespace-owner"
])
}
# =============================================================================
@ -158,25 +139,6 @@ module "k8s-portal" {
k8s_ca_cert = var.k8s_ca_cert
}
# -----------------------------------------------------------------------------
# Monitoring Prometheus / Grafana / Loki stack
# -----------------------------------------------------------------------------
module "monitoring" {
source = "./modules/monitoring"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
mysql_host = var.mysql_host
alertmanager_account_password = data.vault_kv_secret_v2.secrets.data["alertmanager_account_password"]
idrac_username = var.monitoring_idrac_username
idrac_password = data.vault_kv_secret_v2.secrets.data["monitoring_idrac_password"]
alertmanager_slack_api_url = data.vault_kv_secret_v2.secrets.data["alertmanager_slack_api_url"]
tiny_tuya_service_secret = data.vault_kv_secret_v2.secrets.data["tiny_tuya_service_secret"]
haos_api_token = data.vault_kv_secret_v2.secrets.data["haos_api_token"]
pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"]
grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]
tier = local.tiers.cluster
}
# -----------------------------------------------------------------------------
# Vaultwarden Password manager
# -----------------------------------------------------------------------------
@ -254,22 +216,6 @@ module "sealed-secrets" {
tier = local.tiers.cluster
}
# -----------------------------------------------------------------------------
# NVIDIA GPU device plugin
# -----------------------------------------------------------------------------
module "nvidia" {
source = "./modules/nvidia"
tls_secret_name = var.tls_secret_name
tier = local.tiers.gpu
}
# -----------------------------------------------------------------------------
# Kyverno Policy engine
# -----------------------------------------------------------------------------
module "kyverno" {
source = "./modules/kyverno"
}
# -----------------------------------------------------------------------------
# Uptime Kuma Status monitoring
# -----------------------------------------------------------------------------
@ -305,41 +251,6 @@ module "xray" {
xray_reality_short_ids = local.xray_reality_short_ids
}
# -----------------------------------------------------------------------------
# Mailserver docker-mailserver
# -----------------------------------------------------------------------------
module "mailserver" {
source = "./modules/mailserver"
tls_secret_name = var.tls_secret_name
nfs_server = var.nfs_server
mysql_host = var.mysql_host
mailserver_accounts = local.mailserver_accounts
postfix_account_aliases = local.mailserver_aliases
opendkim_key = local.mailserver_opendkim_key
sasl_passwd = local.mailserver_sasl_passwd
roundcube_db_password = data.vault_kv_secret_v2.secrets.data["mailserver_roundcubemail_db_password"]
tier = local.tiers.edge
}
# -----------------------------------------------------------------------------
# Cloudflared Cloudflare tunnel + DNS records
# -----------------------------------------------------------------------------
module "cloudflared" {
source = "./modules/cloudflared"
tier = local.tiers.core
tls_secret_name = var.tls_secret_name
cloudflare_api_key = data.vault_kv_secret_v2.secrets.data["cloudflare_api_key"]
cloudflare_email = var.cloudflare_email
cloudflare_account_id = var.cloudflare_account_id
cloudflare_zone_id = var.cloudflare_zone_id
cloudflare_tunnel_id = var.cloudflare_tunnel_id
public_ip = var.public_ip
cloudflare_proxied_names = concat(var.cloudflare_proxied_names, nonsensitive(local.user_domains))
cloudflare_non_proxied_names = var.cloudflare_non_proxied_names
cloudflare_tunnel_token = data.vault_kv_secret_v2.secrets.data["cloudflare_tunnel_token"]
}
# -----------------------------------------------------------------------------
# Infra Maintenance Automated maintenance jobs
# -----------------------------------------------------------------------------