# -----------------------------------------------------------------------------
# Input variables
# -----------------------------------------------------------------------------

# Name handed to the setup_tls_secret module below.
variable "tls_secret_name" {}

# NOTE(review): by its name this holds a credential but is not marked
# `sensitive`, unlike the other secrets in this file — confirm whether that is
# intentional before changing it.
variable "alertmanager_account_password" {}

variable "idrac_host" {
  default = "192.168.1.4"
}

variable "idrac_username" {
  default = "root"
}

# NOTE(review): a literal password is committed here as the default value;
# consider supplying it from the caller instead.
variable "idrac_password" {
  default   = "calvin"
  sensitive = true
}

# NOTE(review): not marked `sensitive`; if this is a Slack incoming-webhook URL
# it is effectively a secret — confirm.
variable "alertmanager_slack_api_url" {}

variable "tiny_tuya_service_secret" {
  type      = string
  sensitive = true
}

variable "haos_api_token" {
  type      = string
  sensitive = true
}

variable "pve_password" {
  type      = string
  sensitive = true
}

variable "grafana_admin_password" {
  type      = string
  sensitive = true
}

variable "kube_config_path" {
  type      = string
  sensitive = true
}

# Applied as the `tier` label on the monitoring namespace.
variable "tier" {
  type = string
}

variable "mysql_host" {
  type = string
}

variable "registry_user" {
  type      = string
  sensitive = true
}

variable "registry_password" {
  type      = string
  sensitive = true
}

variable "forgejo_pull_token" {
  type        = string
  sensitive   = true
  description = "PAT for the cluster-puller user, used by the Forgejo registry integrity probe."
}

# -----------------------------------------------------------------------------
# Namespace and TLS secret
# -----------------------------------------------------------------------------

resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"
    labels = {
      "istio-injection"                  = "disabled"
      tier                               = var.tier
      "resource-governance/custom-quota" = "true"
      "keel.sh/enrolled"                 = "true"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

module "tls_secret" {
  source          = "../../../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  tls_secret_name = var.tls_secret_name
}

# Terraform gets angry with the 30k values file :/ use ansible until solved
# resource "helm_release" "ups_prometheus_snmp_exporter" {
#   namespace        = kubernetes_namespace.monitoring.metadata[0].name
#   create_namespace = true
#   name             = "ups_prometheus_exporter"
#   repository       = "https://prometheus-community.github.io/helm-charts"
#   chart            = "prometheus-snmp-exporter"
#   values           = [file("${path.module}/ups_snmp_values.yaml")]
# }

# -----------------------------------------------------------------------------
# Prometheus liveness probe — every 30 min, request the Prometheus service and
# post to the alert webhook when the request fails.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "monitor_prom" {
  metadata {
    # NOTE(review): no `namespace` is set here, unlike dns_anomaly_monitor, so
    # this CronJob is not created in the monitoring namespace. Left as-is
    # because moving it would recreate the resource — confirm whether that is
    # intended.
    name = "monitor-prometheus"
  }

  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "*/30 * * * *"

    job_template {
      metadata {}
      spec {
        template {
          metadata {}
          spec {
            container {
              name  = "monitor-prometheus"
              image = "alpine"

              # The install step is separate and aborts on failure, so a failed
              # `apk add` can never fall through to the alert branch (which
              # would need the curl binary that was just not installed).
              # The probe uses -f so HTTP error responses count as "down"
              # (without it curl exits 0 on 4xx/5xx) and --max-time so a
              # stalled connection cannot hang the job. The alert call uses -f
              # as well, so a rejected webhook request fails the job visibly.
              command = [
                "/bin/sh",
                "-c",
                <<-EOT
                  apk add --no-cache curl || exit 1
                  curl -fsS -o /dev/null --connect-timeout 2 --max-time 10 prometheus-server.monitoring.svc.cluster.local \
                    || curl -fsS --max-time 10 https://webhook.viktorbarzin.me/fb/message-viktor -d 'Prometheus is down!'
                EOT
              ]
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}

# -----------------------------------------------------------------------------
# DNS Anomaly Monitor — query Technitium stats API, detect anomalies, push to Pushgateway
# Runs every 15 min. Checks for query spikes, high error rates, and suspicious patterns.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" {
  metadata {
    name      = "dns-anomaly-monitor"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/15 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 2
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            container {
              name  = "dns-anomaly-monitor"
              image = "docker.io/library/alpine"
              command = ["/bin/sh", "-c", <<-EOT
                set -euo pipefail
                apk add --no-cache curl jq

                TECHNITIUM_URL="http://technitium-web.technitium.svc.cluster.local:5380"

                # Get main stats
                STATS=$(curl -sf "$TECHNITIUM_URL/api/stats/get?token=&type=LastHour" 2>&1) || {
                  echo "ERROR: Failed to query Technitium stats API"
                  exit 1
                }

                # Parse key metrics
                TOTAL_QUERIES=$(echo "$STATS" | jq -r '.response.stats.totalQueries // 0')
                SERVER_FAILURE=$(echo "$STATS" | jq -r '.response.stats.serverFailure
// 0') NX_DOMAIN=$(echo "$STATS" | jq -r '.response.stats.nxDomain // 0') BLOCKED=$(echo "$STATS" | jq -r '.response.stats.blocked // 0') NO_ERROR=$(echo "$STATS" | jq -r '.response.stats.noError // 0') echo "DNS Stats (last hour): total=$TOTAL_QUERIES noError=$NO_ERROR nxDomain=$NX_DOMAIN serverFailure=$SERVER_FAILURE blocked=$BLOCKED" # Get top clients for anomaly context TOP_CLIENTS=$(curl -sf "$TECHNITIUM_URL/api/stats/getTopClients?token=&type=LastHour&limit=10" 2>&1) || true # Get top domains for DGA/tunneling detection TOP_DOMAINS=$(curl -sf "$TECHNITIUM_URL/api/stats/getTopDomains?token=&type=LastHour&limit=20" 2>&1) || true # Check for high-entropy domains (potential DGA) DGA_SUSPECT=0 if [ -n "$TOP_DOMAINS" ]; then # Simple heuristic: domains with many consonant clusters or very long labels DGA_SUSPECT=$(echo "$TOP_DOMAINS" | jq -r '[.response.topDomains[]?.name // empty | select(length > 30 or test("[bcdfghjklmnpqrstvwxyz]{5,}"))] | length') fi # Push metrics to Pushgateway cat </dev/null REG="$REGISTRY_HOST" SCHEME="$${REGISTRY_SCHEME:-https}" INSTANCE="$REGISTRY_INSTANCE" AUTH="$REG_USER:$REG_PASS" ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json' push() { curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true } CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "$SCHEME://$REG/v2/_catalog?n=1000" || echo "") REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "") if [ -z "$REPOS" ]; then echo "ERROR: empty catalog or auth failure — cannot probe" NOW=$(date +%s) push < /tmp/repos.txt while IFS= read -r repo; do [ -z "$repo" ] && continue REPOS_N=$((REPOS_N + 1)) TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "$SCHEME://$REG/v2/$repo/tags/list" || echo "") echo "$TAGS_JSON" | jq -r '.tags[]?' 
2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true while IFS= read -r tag; do [ -z "$tag" ] && continue TAGS_N=$((TAGS_N + 1)) HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \ -H "Accept: $ACCEPT" --max-time 15 \ "$SCHEME://$REG/v2/$repo/manifests/$tag") if [ "$HTTP" != "200" ]; then echo "FAIL: $repo:$tag manifest HTTP $HTTP" FAIL=$((FAIL + 1)) continue fi MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "") if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then INDEXES_N=$((INDEXES_N + 1)) jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true while IFS= read -r d; do [ -z "$d" ] && continue CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \ -H "Accept: $ACCEPT" --max-time 10 -I \ "$SCHEME://$REG/v2/$repo/manifests/$d") if [ "$CH" != "200" ]; then echo "FAIL: $repo:$tag index child $d HTTP $CH" FAIL=$((FAIL + 1)) fi done < /tmp/children.txt fi done < /tmp/tags.txt done < /tmp/repos.txt NOW=$(date +%s) push <