goldmane-trail: polish follow-ups #57/#59/#61/#62/#63 + digest→#alerts
All checks were successful
ci/woodpecker/push/default Pipeline was successful

Completes the Goldmane who-talks-to-whom trail (ADR-0014), implemented by a
subagent workflow (distinct stacks in parallel, docs last):

- #57 Whisker gated ingress: ingress_factory (whisker.viktorbarzin.me,
  auth=required, Authentik-gated) + a NetworkPolicy allowing traefik->whisker:8081
  (the operator's whisker NP default-denies ingress). calico stack.
- #61 pipeline health: AggregatorDown + DigestFailing Prometheus alerts
  (prometheus_chart_values.tpl) + cluster-health check #48.
- #59 service-identity labels on the multi-Service namespaces (monitoring's 5
  TF-managed deployments + dbaas), with the KYVERNO_LIFECYCLE_V1 marker so they
  update in-place.
- #62/#63 docs: docs/runbooks/goldmane-flow-trail.md (new), service-catalog,
  security.md + monitoring.md east-west sections, ADR-0014 as-built, CONTEXT.md.
  #62 = the SQL to derive the Wave-1 per-namespace egress allowlist from the
  edge table (feeds code-8ywc; enforce-flips out of scope).

Also fixes the digest's Slack target: #security override 404s channel_not_found
because the shared alertmanager_slack_api_url webhook's app isn't a member of
#security (this likely also breaks alertmanager's slack-security receiver — flagged
in the runbook). Routed to #alerts (the webhook's working channel) until the app
is invited; verified a real digest run posts cleanly (360 edges).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-25 17:49:25 +00:00
parent 306cdd4cb3
commit 6c5288998f
17 changed files with 626 additions and 11 deletions

View file

@ -212,3 +212,65 @@ resource "kubectl_manifest" "whisker" {
spec = { notifications = "Disabled" }
})
}
# ---------------------------------------------------------------------------
# Whisker UI: gated public ingress (infra #57 / ADR-0014).
#
# Routes whisker.viktorbarzin.me to the `whisker` Service on port 8081 behind
# Authentik forward-auth (auth = "required"). Whisker ships no login of its
# own (it is an admin observability UI), so Authentik is the only gate
# between strangers and the flow view. The operator has already replicated
# `tls-secret` into calico-system.
#
# This ingress is one of TWO coupled pieces. The operator's own `whisker`
# NetworkPolicy (owned by the Whisker CR above) sets policyTypes:[Ingress]
# with NO ingress rules, i.e. default-deny on ingress to the whisker pod.
# The additive NetworkPolicy below ORs in a Traefik allow (k8s
# NetworkPolicies are additive across policies selecting the same pod), so
# the operator-owned policy is never edited.
module "ingress_whisker" {
  source = "../../modules/kubernetes/ingress_factory"

  # What is exposed: the whisker Service in calico-system on 8081.
  name         = "whisker"
  namespace    = "calico-system"
  service_name = "whisker"
  port         = 8081

  # How it is exposed: proxied DNS, mandatory auth, pre-replicated TLS secret.
  dns_type        = "proxied"
  auth            = "required"
  tls_secret_name = "tls-secret"

  # gethomepage.dev annotations describing this entry.
  extra_annotations = {
    "gethomepage.dev/enabled"     = "true"
    "gethomepage.dev/group"       = "Infrastructure"
    "gethomepage.dev/name"        = "Whisker"
    "gethomepage.dev/icon"        = "calico.png"
    "gethomepage.dev/description" = "Calico flow observability (who-talks-to-whom)"
  }
}
# Additive allow rule: Traefik -> whisker on TCP 8081.
#
# The operator's `whisker` NetworkPolicy default-denies ingress to the same
# pod. NetworkPolicies selecting one pod are OR-ed together, so this extra
# policy lets Traefik reach the UI while the operator-owned policy stays
# untouched.
resource "kubernetes_network_policy_v1" "whisker_allow_traefik" {
  metadata {
    namespace = "calico-system"
    name      = "whisker-allow-traefik"
  }

  spec {
    policy_types = ["Ingress"]

    # Applies to pods carrying the whisker app label.
    pod_selector {
      match_labels = { "app.kubernetes.io/name" = "whisker" }
    }

    # Single rule: anything in the `traefik` namespace may connect on TCP 8081.
    ingress {
      ports {
        protocol = "TCP"
        port     = "8081"
      }

      from {
        namespace_selector {
          match_labels = { "kubernetes.io/metadata.name" = "traefik" }
        }
      }
    }
  }
}

View file

@ -745,7 +745,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
labels = {
"app" = "phpmyadmin"
tier = var.tier
# ADR-0014 service identity: dbaas is a multi-Service namespace, so the
# namespace alone can't attribute Goldmane flows. Value = the fronting
# Service name (kubernetes_service.phpmyadmin is named "pma").
"service-identity" = "pma"
}
annotations = {
"reloader.stakater.com/search" = "true"
@ -762,6 +765,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
metadata {
labels = {
"app" = "phpmyadmin"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "pma"
}
}
spec {
@ -812,8 +819,19 @@ resource "kubernetes_deployment" "phpmyadmin" {
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
ignore_changes = [spec[0].template[0].spec[0].dns_config]
ignore_changes = [
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
# This Deployment is Keel-enrolled (keel.sh/policy=patch). Ignore the
# attributes Keel/Kyverno mutate at runtime so `terragrunt apply` (incl.
# the daily drift plan) doesn't fight them or revert the live image.
# Canonical KEEL/KYVERNO lifecycle guard; matches linkwarden/chrome-service.
metadata[0].annotations["keel.sh/policy"],
metadata[0].annotations["keel.sh/trigger"],
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
metadata[0].annotations["keel.sh/match-tag"],
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE Keel manages tag updates
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
]
}
}
@ -1499,6 +1517,10 @@ resource "kubernetes_deployment" "pgadmin" {
}
labels = {
tier = var.tier
# ADR-0014 service identity: dbaas is a multi-Service namespace, so the
# namespace alone can't attribute Goldmane flows. Value = the fronting
# Service name (kubernetes_service.pgadmin is named "pgadmin").
"service-identity" = "pgadmin"
}
}
spec {
@ -1514,6 +1536,10 @@ resource "kubernetes_deployment" "pgadmin" {
metadata {
labels = {
app = "pgadmin"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "pgadmin"
}
}
spec {
@ -1568,8 +1594,20 @@ resource "kubernetes_deployment" "pgadmin" {
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
ignore_changes = [spec[0].template[0].spec[0].dns_config]
ignore_changes = [
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
# This Deployment is Keel-enrolled (keel.sh/policy=patch) and Keel has
# bumped the live image (dpage/pgadmin4:9.16). Ignore the Keel/Kyverno
# runtime-mutated attributes so `terragrunt apply` (incl. the daily drift
# plan) doesn't revert the image to bare `dpage/pgadmin4` or strip Keel's
# annotations. Canonical guard; matches linkwarden/chrome-service.
metadata[0].annotations["keel.sh/policy"],
metadata[0].annotations["keel.sh/trigger"],
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
metadata[0].annotations["keel.sh/match-tag"],
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE Keel manages tag updates
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
]
}
}
resource "kubernetes_service" "pgadmin" {

View file

@ -449,8 +449,16 @@ resource "kubernetes_cron_job_v1" "digest" {
}
}
env {
name = "SLACK_CHANNEL"
value = "#security"
name = "SLACK_CHANNEL"
# The shared alertmanager_slack_api_url incoming webhook's Slack
# app is NOT a member of #security, so overriding the channel to
# it returns HTTP 404 channel_not_found (verified 2026-06-25).
# alertmanager's own slack-security receiver shares this webhook
# and almost certainly hits the same wall. Post to #alerts (the
# webhook's working channel, same as alert-digest) until the app
# is invited to #security, then flip this back. See
# docs/runbooks/goldmane-flow-trail.md.
value = "#alerts"
}
resources {

View file

@ -130,6 +130,11 @@ resource "kubernetes_deployment" "blackbox_exporter" {
labels = {
app = "blackbox-exporter"
tier = var.tier
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
# the namespace alone can't attribute Goldmane flows. Value = the
# fronting Service name (kubernetes_service.blackbox_exporter is named
# "blackbox-exporter").
"service-identity" = "blackbox-exporter"
}
annotations = {
"reloader.stakater.com/search" = "true"
@ -146,6 +151,10 @@ resource "kubernetes_deployment" "blackbox_exporter" {
metadata {
labels = {
app = "blackbox-exporter"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "blackbox-exporter"
}
}
spec {

View file

@ -5,6 +5,11 @@ resource "kubernetes_deployment" "goflow2" {
labels = {
app = "goflow2"
tier = var.tier
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
# the namespace alone can't attribute Goldmane flows. Value = the
# fronting Service name (kubernetes_service.goflow2, the metrics svc; the
# goflow2-netflow NodePort is the same pod by another name).
"service-identity" = "goflow2"
}
}
spec {
@ -18,6 +23,10 @@ resource "kubernetes_deployment" "goflow2" {
metadata {
labels = {
app = "goflow2"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "goflow2"
}
}
spec {

View file

@ -47,6 +47,10 @@ resource "kubernetes_deployment" "idrac-redfish" {
labels = {
app = "idrac-redfish-exporter"
tier = var.tier
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
# the namespace alone can't attribute Goldmane flows. Value = the
# fronting Service name (kubernetes_service.idrac-redfish-exporter).
"service-identity" = "idrac-redfish-exporter"
}
annotations = {
"reloader.stakater.com/search" = "true"
@ -63,6 +67,10 @@ resource "kubernetes_deployment" "idrac-redfish" {
metadata {
labels = {
app = "idrac-redfish-exporter"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "idrac-redfish-exporter"
}
}
spec {

View file

@ -1450,6 +1450,49 @@ serverFiles:
Remediation: right-size top reservers via Goldilocks (immich-server,
frigate, prometheus, pg-cluster, paperless) or bump VM RAM on
k8s-node2/k8s-node3 from 32GB → 48GB to match node1.
# Goldmane edge-aggregator (ADR-0014 / infra #58, #61): the durable
# who-talks-to-whom trail. The aggregator pod has NO /metrics endpoint,
# so its health is inferred from kube-state-metrics signals — the trail
# must not silently die. Two failure modes are covered:
# - the aggregate Deployment stops consuming Goldmane's flow stream
# (AggregatorDown) → no new edges ever land in the goldmane_edges DB
# - the daily digest CronJob can't post new edges to Slack
# (DigestFailing) → edges still land but nobody is told.
# A freshness probe (max(last_seen) staleness) is intentionally NOT here:
# AggregatorDown is the agreed floor and needs no extra moving parts.
- name: Network Observability (Goldmane)
  rules:
    # AggregatorDown: the aggregator Deployment has <1 available replica
    # for 15m. kube-state-metrics keeps
    # `kube_deployment_status_replicas_available` (metric-keep list in
    # serverFiles below). The 15m window rides out a normal rollout / node
    # drain without paging; a genuinely-dead aggregator means the edge
    # trail has stopped recording and stays down.
    # The `and on()` operand only matches once the job="prometheus"
    # process has been running for more than 900s (presumably to avoid
    # firing on stale data right after a Prometheus restart).
    - alert: AggregatorDown
      expr: |
        kube_deployment_status_replicas_available{namespace="goldmane-edge-aggregator",deployment="goldmane-edge-aggregator"} < 1
        and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: "goldmane-edge-aggregator has no available replica — the who-talks-to-whom edge trail has stopped recording"
        description: "The aggregate Deployment streams Calico Goldmane flows into the goldmane_edges CNPG DB. With 0 replicas, no new namespace-pair edges are captured. `kubectl -n goldmane-edge-aggregator describe deploy goldmane-edge-aggregator` + check the goldmane svc (calico-system) is reachable."
    # DigestFailing: the goldmane-edges-digest CronJob has a failed Job
    # that started in the last 24h. Mirrors the generic JobFailed shape but
    # scoped to the digest so it routes here. `for: 30m` rides out the
    # apply/scrape transient; the digest runs daily so a real failure won't
    # self-heal until the next run — surface it same-day rather than
    # waiting 24h.
    # The summary deliberately names no Slack channel: the digest's
    # SLACK_CHANNEL is currently #alerts (not #security), and hard-coding
    # either channel here goes stale whenever that target is flipped.
    - alert: DigestFailing
      expr: |
        kube_job_status_failed{namespace="goldmane-edge-aggregator", job_name=~"goldmane-edges-digest.*"} > 0
        and on(namespace, job_name)
        (time() - kube_job_status_start_time{namespace="goldmane-edge-aggregator", job_name=~"goldmane-edges-digest.*"}) < 86400
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: "goldmane-edges-digest CronJob failing — new edges captured but not posted to Slack"
        description: "The daily edge digest Job {{ $labels.job_name }} failed. Edges may still be landing in the goldmane_edges DB but no one is being notified of new namespace-pairs. `kubectl -n goldmane-edge-aggregator logs job/{{ $labels.job_name }}`."
- name: Infrastructure Health
rules:
- alert: HomeAssistantDown

View file

@ -22,6 +22,10 @@ resource "kubernetes_deployment" "pve_exporter" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
tier = var.tier
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
# the namespace alone can't attribute Goldmane flows. Value = the
# fronting Service name (kubernetes_service.proxmox-exporter).
"service-identity" = "proxmox-exporter"
}
}
@ -37,6 +41,10 @@ resource "kubernetes_deployment" "pve_exporter" {
metadata {
labels = {
app = "proxmox-exporter"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "proxmox-exporter"
}
}

View file

@ -31,6 +31,10 @@ resource "kubernetes_deployment" "snmp-exporter" {
labels = {
app = "snmp-exporter"
tier = var.tier
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
# the namespace alone can't attribute Goldmane flows. Value = the
# fronting Service name (kubernetes_service.snmp-exporter).
"service-identity" = "snmp-exporter"
}
annotations = {
"reloader.stakater.com/search" = "true"
@ -47,6 +51,10 @@ resource "kubernetes_deployment" "snmp-exporter" {
metadata {
labels = {
app = "snmp-exporter"
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
# disambiguating identity must live on the pod template (not just
# the Deployment metadata above). Not in the selector, so no replace.
"service-identity" = "snmp-exporter"
}
}
spec {