goldmane-trail: polish follow-ups #57/#59/#61/#62/#63 + digest→#alerts
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
Completes the Goldmane who-talks-to-whom trail (ADR-0014), implemented by a subagent workflow (distinct stacks in parallel, docs last):

- #57 Whisker gated ingress: ingress_factory (whisker.viktorbarzin.me, auth=required, Authentik-gated) + a NetworkPolicy allowing traefik->whisker:8081 (the operator's whisker NP default-denies ingress). calico stack.
- #61 pipeline health: AggregatorDown + DigestFailing Prometheus alerts (prometheus_chart_values.tpl) + cluster-health check #48.
- #59 service-identity labels on the multi-Service namespaces (monitoring's 5 TF-managed deployments + dbaas), with the KYVERNO_LIFECYCLE_V1 marker so they update in-place.
- #62/#63 docs: docs/runbooks/goldmane-flow-trail.md (new), service-catalog, security.md + monitoring.md east-west sections, ADR-0014 as-built, CONTEXT.md. #62 = the SQL to derive the Wave-1 per-namespace egress allowlist from the edge table (feeds code-8ywc; enforce-flips out of scope).

Also fixes the digest's Slack target: overriding the channel to #security fails with HTTP 404 channel_not_found because the Slack app behind the shared alertmanager_slack_api_url webhook isn't a member of #security (this likely also breaks alertmanager's slack-security receiver — flagged in the runbook). The digest is routed to #alerts (the webhook's working channel) until the app is invited; verified that a real digest run posts cleanly (360 edges).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
306cdd4cb3
commit
6c5288998f
17 changed files with 626 additions and 11 deletions
|
|
@ -212,3 +212,65 @@ resource "kubectl_manifest" "whisker" {
|
|||
spec = { notifications = "Disabled" }
|
||||
})
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gated public ingress for the Whisker UI (infra #57 / ADR-0014).
|
||||
#
|
||||
# whisker.viktorbarzin.me -> whisker:8081, Authentik-gated (auth="required":
|
||||
# Whisker ships NO login of its own — it's an admin observability UI, so Authentik
|
||||
# forward-auth is the only gate between strangers and the flow view). The
|
||||
# operator replicated `tls-secret` into calico-system already.
|
||||
#
|
||||
# TWO coupled pieces are required because the operator's own `whisker`
|
||||
# NetworkPolicy (owned by the Whisker CR above) sets policyTypes:[Ingress]
|
||||
# with NO ingress rules => default-deny on ingress to the whisker pod. The
|
||||
# additive NP below ORs in a Traefik allow (k8s NetworkPolicies are additive
|
||||
# across policies selecting the same pod), so we never edit the operator NP.
|
||||
module "ingress_whisker" {
|
||||
source = "../../modules/kubernetes/ingress_factory"
|
||||
dns_type = "proxied"
|
||||
namespace = "calico-system"
|
||||
name = "whisker"
|
||||
service_name = "whisker"
|
||||
port = 8081
|
||||
auth = "required"
|
||||
tls_secret_name = "tls-secret"
|
||||
extra_annotations = {
|
||||
"gethomepage.dev/enabled" = "true"
|
||||
"gethomepage.dev/name" = "Whisker"
|
||||
"gethomepage.dev/description" = "Calico flow observability (who-talks-to-whom)"
|
||||
"gethomepage.dev/icon" = "calico.png"
|
||||
"gethomepage.dev/group" = "Infrastructure"
|
||||
}
|
||||
}
|
||||
|
||||
# Additive NetworkPolicy: permit Traefik -> whisker:8081. ORs with the
|
||||
# operator's default-deny `whisker` NP (selecting the same pod) so Traefik
|
||||
# can reach the UI without touching the operator-owned policy.
|
||||
resource "kubernetes_network_policy_v1" "whisker_allow_traefik" {
|
||||
metadata {
|
||||
name = "whisker-allow-traefik"
|
||||
namespace = "calico-system"
|
||||
}
|
||||
spec {
|
||||
pod_selector {
|
||||
match_labels = {
|
||||
"app.kubernetes.io/name" = "whisker"
|
||||
}
|
||||
}
|
||||
policy_types = ["Ingress"]
|
||||
ingress {
|
||||
from {
|
||||
namespace_selector {
|
||||
match_labels = {
|
||||
"kubernetes.io/metadata.name" = "traefik"
|
||||
}
|
||||
}
|
||||
}
|
||||
ports {
|
||||
port = "8081"
|
||||
protocol = "TCP"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -745,7 +745,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
|
|||
labels = {
|
||||
"app" = "phpmyadmin"
|
||||
tier = var.tier
|
||||
|
||||
# ADR-0014 service identity: dbaas is a multi-Service namespace, so the
|
||||
# namespace alone can't attribute Goldmane flows. Value = the fronting
|
||||
# Service name (kubernetes_service.phpmyadmin is named "pma").
|
||||
"service-identity" = "pma"
|
||||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
|
|
@ -762,6 +765,10 @@ resource "kubernetes_deployment" "phpmyadmin" {
|
|||
metadata {
|
||||
labels = {
|
||||
"app" = "phpmyadmin"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "pma"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -812,8 +819,19 @@ resource "kubernetes_deployment" "phpmyadmin" {
|
|||
}
|
||||
}
|
||||
lifecycle {
|
||||
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
ignore_changes = [spec[0].template[0].spec[0].dns_config]
|
||||
ignore_changes = [
|
||||
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
# This Deployment is Keel-enrolled (keel.sh/policy=patch). Ignore the
|
||||
# attributes Keel/Kyverno mutate at runtime so `terragrunt apply` (incl.
|
||||
# the daily drift plan) doesn't fight them or revert the live image —
|
||||
# canonical KEEL/KYVERNO lifecycle guard, matches linkwarden/chrome-service.
|
||||
metadata[0].annotations["keel.sh/policy"],
|
||||
metadata[0].annotations["keel.sh/trigger"],
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
metadata[0].annotations["keel.sh/match-tag"],
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1499,6 +1517,10 @@ resource "kubernetes_deployment" "pgadmin" {
|
|||
}
|
||||
labels = {
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: dbaas is a multi-Service namespace, so the
|
||||
# namespace alone can't attribute Goldmane flows. Value = the fronting
|
||||
# Service name (kubernetes_service.pgadmin is named "pgadmin").
|
||||
"service-identity" = "pgadmin"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -1514,6 +1536,10 @@ resource "kubernetes_deployment" "pgadmin" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "pgadmin"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "pgadmin"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -1568,8 +1594,20 @@ resource "kubernetes_deployment" "pgadmin" {
|
|||
}
|
||||
}
|
||||
lifecycle {
|
||||
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
ignore_changes = [spec[0].template[0].spec[0].dns_config]
|
||||
ignore_changes = [
|
||||
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
# This Deployment is Keel-enrolled (keel.sh/policy=patch) and Keel has
|
||||
# bumped the live image (dpage/pgadmin4:9.16). Ignore the Keel/Kyverno
|
||||
# runtime-mutated attributes so `terragrunt apply` (incl. the daily drift
|
||||
# plan) doesn't revert the image to bare `dpage/pgadmin4` or strip Keel's
|
||||
# annotations — canonical guard, matches linkwarden/chrome-service.
|
||||
metadata[0].annotations["keel.sh/policy"],
|
||||
metadata[0].annotations["keel.sh/trigger"],
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
metadata[0].annotations["keel.sh/match-tag"],
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
|
||||
]
|
||||
}
|
||||
}
|
||||
resource "kubernetes_service" "pgadmin" {
|
||||
|
|
|
|||
|
|
@ -449,8 +449,16 @@ resource "kubernetes_cron_job_v1" "digest" {
|
|||
}
|
||||
}
|
||||
env {
|
||||
name = "SLACK_CHANNEL"
|
||||
value = "#security"
|
||||
name = "SLACK_CHANNEL"
|
||||
# The shared alertmanager_slack_api_url incoming webhook's Slack
|
||||
# app is NOT a member of #security, so overriding the channel to
|
||||
# it returns HTTP 404 channel_not_found (verified 2026-06-25).
|
||||
# alertmanager's own slack-security receiver shares this webhook
|
||||
# and almost certainly hits the same wall. Post to #alerts (the
|
||||
# webhook's working channel, same as alert-digest) until the app
|
||||
# is invited to #security, then flip this back. See
|
||||
# docs/runbooks/goldmane-flow-trail.md.
|
||||
value = "#alerts"
|
||||
}
|
||||
|
||||
resources {
|
||||
|
|
|
|||
|
|
@ -130,6 +130,11 @@ resource "kubernetes_deployment" "blackbox_exporter" {
|
|||
labels = {
|
||||
app = "blackbox-exporter"
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
|
||||
# the namespace alone can't attribute Goldmane flows. Value = the
|
||||
# fronting Service name (kubernetes_service.blackbox_exporter is named
|
||||
# "blackbox-exporter").
|
||||
"service-identity" = "blackbox-exporter"
|
||||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
|
|
@ -146,6 +151,10 @@ resource "kubernetes_deployment" "blackbox_exporter" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "blackbox-exporter"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "blackbox-exporter"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
|
|||
|
|
@ -5,6 +5,11 @@ resource "kubernetes_deployment" "goflow2" {
|
|||
labels = {
|
||||
app = "goflow2"
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
|
||||
# the namespace alone can't attribute Goldmane flows. Value = the
|
||||
# fronting Service name (kubernetes_service.goflow2 — the metrics svc; the
|
||||
# goflow2-netflow NodePort is the same pod by another name).
|
||||
"service-identity" = "goflow2"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -18,6 +23,10 @@ resource "kubernetes_deployment" "goflow2" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "goflow2"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "goflow2"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,10 @@ resource "kubernetes_deployment" "idrac-redfish" {
|
|||
labels = {
|
||||
app = "idrac-redfish-exporter"
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
|
||||
# the namespace alone can't attribute Goldmane flows. Value = the
|
||||
# fronting Service name (kubernetes_service.idrac-redfish-exporter).
|
||||
"service-identity" = "idrac-redfish-exporter"
|
||||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
|
|
@ -63,6 +67,10 @@ resource "kubernetes_deployment" "idrac-redfish" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "idrac-redfish-exporter"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "idrac-redfish-exporter"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
|
|||
|
|
@ -1450,6 +1450,49 @@ serverFiles:
|
|||
Remediation: right-size top reservers via Goldilocks (immich-server,
|
||||
frigate, prometheus, pg-cluster, paperless) or bump VM RAM on
|
||||
k8s-node2/k8s-node3 from 32GB → 48GB to match node1.
|
||||
# Goldmane edge-aggregator (ADR-0014 / infra #58, #61): the durable
|
||||
# who-talks-to-whom trail. The aggregator pod has NO /metrics endpoint,
|
||||
# so its health is inferred from kube-state-metrics signals — the trail
|
||||
# must not silently die. Two failure modes are covered:
|
||||
# - the aggregate Deployment stops consuming Goldmane's flow stream
|
||||
# (AggregatorDown) → no new edges ever land in the goldmane_edges DB
|
||||
# - the daily digest CronJob can't post new edges to Slack
|
||||
# (DigestFailing) → edges still land but nobody is told.
|
||||
# A freshness probe (max(last_seen) staleness) is intentionally NOT here:
|
||||
# AggregatorDown is the agreed floor and needs no extra moving parts.
|
||||
- name: Network Observability (Goldmane)
|
||||
rules:
|
||||
# Deployment has <1 available replica for 15m. kube-state-metrics
|
||||
# keeps `kube_deployment_status_replicas_available` (metric-keep list
|
||||
# in serverFiles below). The 15m window rides out a normal rollout /
|
||||
# node drain without paging; a genuinely-dead aggregator means the
|
||||
# edge trail has stopped recording and stays down.
|
||||
- alert: AggregatorDown
|
||||
expr: |
|
||||
kube_deployment_status_replicas_available{namespace="goldmane-edge-aggregator",deployment="goldmane-edge-aggregator"} < 1
|
||||
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "goldmane-edge-aggregator has no available replica — the who-talks-to-whom edge trail has stopped recording"
|
||||
description: "The aggregate Deployment streams Calico Goldmane flows into the goldmane_edges CNPG DB. With 0 replicas, no new namespace-pair edges are captured. `kubectl -n goldmane-edge-aggregator describe deploy goldmane-edge-aggregator` + check the goldmane svc (calico-system) is reachable."
|
||||
# The goldmane-edges-digest CronJob has a failed Job that started in
|
||||
# the last 24h. Mirrors the generic JobFailed shape but scoped to the
|
||||
# digest so it routes here. `for: 30m` rides out the apply/scrape
|
||||
# transient; the digest runs daily so a real failure won't self-heal
|
||||
# until the next run — surface it same-day rather than waiting 24h.
|
||||
- alert: DigestFailing
|
||||
expr: |
|
||||
kube_job_status_failed{namespace="goldmane-edge-aggregator", job_name=~"goldmane-edges-digest.*"} > 0
|
||||
and on(namespace, job_name)
|
||||
(time() - kube_job_status_start_time{namespace="goldmane-edge-aggregator", job_name=~"goldmane-edges-digest.*"}) < 86400
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "goldmane-edges-digest CronJob failing — new edges captured but not posted to Slack (currently #alerts)"
|
||||
description: "The daily edge digest Job {{ $labels.job_name }} failed. Edges may still be landing in the goldmane_edges DB but no one is being notified of new namespace-pairs. `kubectl -n goldmane-edge-aggregator logs job/{{ $labels.job_name }}`."
|
||||
- name: Infrastructure Health
|
||||
rules:
|
||||
- alert: HomeAssistantDown
|
||||
|
|
|
|||
|
|
@ -22,6 +22,10 @@ resource "kubernetes_deployment" "pve_exporter" {
|
|||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
labels = {
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
|
||||
# the namespace alone can't attribute Goldmane flows. Value = the
|
||||
# fronting Service name (kubernetes_service.proxmox-exporter).
|
||||
"service-identity" = "proxmox-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -37,6 +41,10 @@ resource "kubernetes_deployment" "pve_exporter" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "proxmox-exporter"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "proxmox-exporter"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,10 @@ resource "kubernetes_deployment" "snmp-exporter" {
|
|||
labels = {
|
||||
app = "snmp-exporter"
|
||||
tier = var.tier
|
||||
# ADR-0014 service identity: monitoring is a multi-Service namespace, so
|
||||
# the namespace alone can't attribute Goldmane flows. Value = the
|
||||
# fronting Service name (kubernetes_service.snmp-exporter).
|
||||
"service-identity" = "snmp-exporter"
|
||||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
|
|
@ -47,6 +51,10 @@ resource "kubernetes_deployment" "snmp-exporter" {
|
|||
metadata {
|
||||
labels = {
|
||||
app = "snmp-exporter"
|
||||
# ADR-0014: Goldmane/Felix stamps POD labels onto flows, so the
|
||||
# disambiguating identity must live on the pod template (not just
|
||||
# the Deployment metadata above). Not in selector → no replace.
|
||||
"service-identity" = "snmp-exporter"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue