[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
# =============================================================================
# Pod Security Policies (Audit Mode)
# =============================================================================
# Kyverno validate policies for pod security standards.
# All policies start in Audit mode - violations are logged but not blocked.
resource "kubernetes_manifest" "policy_deny_privileged" {
  # Kyverno ClusterPolicy that reports (Audit mode, nothing is blocked) any Pod
  # whose containers set securityContext.privileged to something other than false.
  # Applied only after the Kyverno Helm release exists (see depends_on).
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "deny-privileged-containers"
      annotations = {
        "policies.kyverno.io/title"       = "Deny Privileged Containers"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "Privileged containers have full host access. Deny unless explicitly exempted."
      }
    }
    spec = {
      # Audit: violations are recorded in policy reports, admission is not denied.
      validationFailureAction = "Audit"
      # background = true also evaluates Pods that already exist in the cluster.
      background = true
      rules = [{
        name = "deny-privileged"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        # Namespaces exempted from this rule.
        # NOTE(review): presumably these run workloads that need privileged
        # mode (GPU / node-level agents) - confirm the list is still minimal.
        exclude = {
          any = [{
            resources = {
              namespaces = ["frigate", "nvidia", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Privileged containers are not allowed. Use specific capabilities instead."
          pattern = {
            spec = {
              # "=(...)" is Kyverno's equality anchor: the check applies only
              # when the field is present, so Pods that omit securityContext
              # or privileged pass.
              containers = [{
                "=(securityContext)" = {
                  "=(privileged)" = false
                }
              }]
              "=(initContainers)" = [{
                "=(securityContext)" = {
                  "=(privileged)" = false
                }
              }]
              # Ephemeral (debug) containers can also request privileged mode;
              # cover them so they are not a blind spot in the audit.
              "=(ephemeralContainers)" = [{
                "=(securityContext)" = {
                  "=(privileged)" = false
                }
              }]
            }
          }
        }
      }]
    }
  }

  depends_on = [helm_release.kyverno]
}
resource "kubernetes_manifest" "policy_deny_host_namespaces" {
  # Kyverno ClusterPolicy that reports (Audit mode, nothing is blocked) Pods
  # that enable hostNetwork, hostPID or hostIPC.
  # Applied only after the Kyverno Helm release exists (see depends_on).
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "deny-host-namespaces"
      annotations = {
        "policies.kyverno.io/title"       = "Deny Host Namespaces"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "Sharing host namespaces enables container escapes. Deny hostNetwork, hostPID, hostIPC."
      }
    }
    spec = {
      # Audit: violations are recorded in policy reports, admission is not denied.
      validationFailureAction = "Audit"
      # background = true also evaluates Pods that already exist in the cluster.
      background = true
      rules = [{
        name = "deny-host-namespaces"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        # Namespaces exempted from this rule.
        # NOTE(review): presumably these workloads need host networking/PID
        # access - confirm against what actually runs there.
        exclude = {
          any = [{
            resources = {
              namespaces = ["frigate", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Host namespaces (hostNetwork, hostPID, hostIPC) are not allowed."
          pattern = {
            spec = {
              # "=(...)" equality anchors: each field is only checked when it
              # is set, so Pods that omit these fields pass.
              "=(hostNetwork)" = false
              "=(hostPID)"     = false
              "=(hostIPC)"     = false
            }
          }
        }
      }]
    }
  }

  depends_on = [helm_release.kyverno]
}
resource "kubernetes_manifest" "policy_restrict_capabilities" {
  # Kyverno ClusterPolicy that reports (Audit mode, nothing is blocked) Pods
  # that add the SYS_ADMIN capability to any of their containers.
  # Applied only after the Kyverno Helm release exists (see depends_on).
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "restrict-sys-admin"
      annotations = {
        "policies.kyverno.io/title"       = "Restrict SYS_ADMIN Capability"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "high"
        "policies.kyverno.io/description" = "SYS_ADMIN is nearly equivalent to root. Restrict to explicitly exempted namespaces."
      }
    }
    spec = {
      # Audit: violations are recorded in policy reports, admission is not denied.
      validationFailureAction = "Audit"
      # background = true also evaluates Pods that already exist in the cluster.
      background = true
      rules = [{
        name = "restrict-sys-admin"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        # Namespaces exempted from this rule.
        exclude = {
          any = [{
            resources = {
              namespaces = ["nvidia", "monitoring"]
            }
          }]
        }
        validate = {
          message = "Adding SYS_ADMIN capability is not allowed."
          deny = {
            conditions = {
              any = [{
                # Collect capabilities.add from every container type. The
                # multi-select + flatten skips container lists that are absent,
                # and the trailing `[]` fallback keeps the key a list when no
                # container adds any capability.
                key      = "{{ request.object.spec.[ephemeralContainers, initContainers, containers][].securityContext.capabilities.add[] || `[]` }}"
                operator = "AnyIn"
                value    = ["SYS_ADMIN"]
              }]
            }
          }
        }
      }]
    }
  }

  depends_on = [helm_release.kyverno]
}
2026-03-19 20:23:59 +00:00
# =============================================================================
# Image Pull Policy Governance
# =============================================================================
# Mutate imagePullPolicy on every entry of a Pod's spec.containers:
#   - pinned tags (anything other than :latest) -> IfNotPresent, so pods do not
#     get stuck in ImagePullBackOff when the pull-through cache at 10.0.20.10
#     has transient failures;
#   - :latest or untagged images -> Always, so stale images don't persist.
# Only spec.containers is iterated; initContainers are left unchanged.
resource "kubernetes_manifest" "policy_set_image_pull_policy" {
  # Kyverno mutating ClusterPolicy that rewrites imagePullPolicy on each entry
  # of spec.containers, based on the image reference:
  #   - tag or digest present and not ":latest" -> IfNotPresent
  #   - ":latest" or no tag at all              -> Always
  # The two rules use the same precondition key with opposite expected values,
  # so exactly one of them patches any given container.
  # Only spec.containers is iterated; initContainers are not mutated.
  # Applied only after the Kyverno Helm release exists (see depends_on).
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "set-image-pull-policy"
      annotations = {
        "policies.kyverno.io/title"       = "Set Image Pull Policy"
        "policies.kyverno.io/category"    = "Best Practices"
        "policies.kyverno.io/severity"    = "medium"
        "policies.kyverno.io/description" = "Set imagePullPolicy to IfNotPresent for pinned tags and Always for :latest to prevent ImagePullBackOff from transient cache failures."
      }
    }
    spec = {
      # Background processing is disabled for this policy.
      background = false
      rules = [
        {
          name = "set-ifnotpresent-for-pinned-tags"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  # True when the image floats: it ends in ":latest" or has no
                  # tag/digest. The ':' test looks only at the last path segment
                  # so a registry port (e.g. "10.0.20.10:5000/app") is not
                  # mistaken for a tag.
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = false
                }]
              }
              patchStrategicMerge = {
                spec = {
                  # Strategic merge matches the container by name and overrides
                  # only its imagePullPolicy.
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "IfNotPresent"
                  }]
                }
              }
            }]
          }
        },
        {
          name = "set-always-for-latest"
          match = {
            any = [{
              resources = {
                kinds = ["Pod"]
              }
            }]
          }
          mutate = {
            foreach = [{
              list = "request.object.spec.containers"
              preconditions = {
                all = [{
                  # Same floating-image test as the rule above, inverted.
                  key      = "{{ ends_with(element.image, ':latest') || !contains(split(element.image, '/')[-1], ':') }}"
                  operator = "Equals"
                  value    = true
                }]
              }
              patchStrategicMerge = {
                spec = {
                  containers = [{
                    name            = "{{ element.name }}"
                    imagePullPolicy = "Always"
                  }]
                }
              }
            }]
          }
        }
      ]
    }
  }

  depends_on = [helm_release.kyverno]
}
[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
resource "kubernetes_manifest" "policy_require_trusted_registries" {
  # Kyverno ClusterPolicy that reports (Audit mode, nothing is blocked) Pods
  # whose container images do not match the allowed image patterns below.
  # No namespaces are excluded from this policy.
  # Applied only after the Kyverno Helm release exists (see depends_on).
  manifest = {
    apiVersion = "kyverno.io/v1"
    kind       = "ClusterPolicy"
    metadata = {
      name = "require-trusted-registries"
      annotations = {
        "policies.kyverno.io/title"       = "Require Trusted Image Registries"
        "policies.kyverno.io/category"    = "Pod Security"
        "policies.kyverno.io/severity"    = "medium"
        "policies.kyverno.io/description" = "Images must come from trusted registries to prevent supply chain attacks."
      }
    }
    spec = {
      # Audit: violations are recorded in policy reports, admission is not denied.
      validationFailureAction = "Audit"
      # background = true also evaluates Pods that already exist in the cluster.
      background = true
      rules = [{
        name = "validate-registries"
        match = {
          any = [{
            resources = {
              kinds = ["Pod"]
            }
          }]
        }
        validate = {
          message = "Images must be from trusted registries (docker.io, ghcr.io, quay.io, registry.k8s.io, or local cache)."
          pattern = {
            spec = {
              # "|" is Kyverno's logical OR between wildcard patterns.
              # NOTE(review): the final "*/*" alternative matches any image
              # reference containing a slash, including other registries such
              # as "evil.example.com/app". Presumably it is there for Docker Hub
              # short names ("org/image") - confirm, and tighten once the audit
              # reports are clean.
              containers = [{
                image = "docker.io/* | ghcr.io/* | quay.io/* | registry.k8s.io/* | 10.0.20.10* | */*"
              }]
              # Init and ephemeral containers pull images too; check them with
              # the same pattern when present ("=(...)" makes the field optional).
              "=(initContainers)" = [{
                image = "docker.io/* | ghcr.io/* | quay.io/* | registry.k8s.io/* | 10.0.20.10* | */*"
              }]
              "=(ephemeralContainers)" = [{
                image = "docker.io/* | ghcr.io/* | quay.io/* | registry.k8s.io/* | 10.0.20.10* | */*"
              }]
            }
          }
        }
      }]
    }
  }

  depends_on = [helm_release.kyverno]
}