fix: cluster healthcheck fixes + Authentik upgrade to 2026.2.2

- Authentik: upgrade 2025.10.3 → 2025.12.4 → 2026.2.2 with DB restore
  and stepped migration. Switch to existingSecret, PgBouncer session mode.
- Mailserver: migrate email roundtrip probe from Mailgun to Brevo API
- Redis: fix HAProxy tcp-check regex (rstring), faster health intervals
- Nextcloud: fix Redis fallback to HAProxy service, update dependency
- MeshCentral: fix TLSOffload + certUrl init container for first-run
- Monitoring: remove authentik from latency alert exclusion
- Diun: simplify to webhook notifier, remove git auto-update

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-15 06:41:56 +00:00
parent d31bbc9a18
commit bd41bb9230
11 changed files with 115 additions and 282 deletions

View file

@ -40,8 +40,8 @@ graph TB
| Component | Version | Location | Purpose |
|-----------|---------|----------|---------|
| Authentik Server | Latest | `stacks/authentik/` | Core IdP application servers (3 replicas) |
| Authentik Worker | Latest | `stacks/authentik/` | Background task processors (3 replicas) |
| Authentik Server | 2026.2.2 | `stacks/authentik/` | Core IdP application servers (2 replicas) |
| Authentik Worker | 2026.2.2 | `stacks/authentik/` | Background task processors (2 replicas) |
| PgBouncer | Latest | `stacks/authentik/` | PostgreSQL connection pooler (3 replicas) |
| Embedded Outpost | - | Built into Authentik | Forward auth endpoint for Traefik |
| Traefik ForwardAuth | - | `ingress_factory` module | Middleware for protected ingresses |

View file

@ -55,21 +55,23 @@ resource "helm_release" "authentik" {
repository = "https://charts.goauthentik.io/"
chart = "authentik"
# version = "2025.8.1"
version = "2025.10.3"
# version = "2025.10.3"
# version = "2025.12.4"
version = "2026.2.2"
atomic = true
timeout = 6000
values = [templatefile("${path.module}/values.yaml", { postgres_password = var.postgres_password, secret_key = var.secret_key, redis_host = var.redis_host })]
values = [templatefile("${path.module}/values.yaml", { postgres_password = var.postgres_password, secret_key = var.secret_key })]
}
module "ingress" {
source = "../../../../modules/kubernetes/ingress_factory"
namespace = kubernetes_namespace.authentik.metadata[0].name
name = "authentik"
service_name = "goauthentik-server"
tls_secret_name = var.tls_secret_name
source = "../../../../modules/kubernetes/ingress_factory"
namespace = kubernetes_namespace.authentik.metadata[0].name
name = "authentik"
service_name = "goauthentik-server"
tls_secret_name = var.tls_secret_name
anti_ai_scraping = false
extra_annotations = {
"gethomepage.dev/enabled" = "true"
"gethomepage.dev/name" = "Authentik"
@ -84,12 +86,14 @@ module "ingress" {
}
module "ingress-outpost" {
source = "../../../../modules/kubernetes/ingress_factory"
namespace = kubernetes_namespace.authentik.metadata[0].name
name = "authentik-outpost"
host = "authentik"
service_name = "ak-outpost-authentik-embedded-outpost"
port = 9000
ingress_path = ["/outpost.goauthentik.io"]
tls_secret_name = var.tls_secret_name
source = "../../../../modules/kubernetes/ingress_factory"
namespace = kubernetes_namespace.authentik.metadata[0].name
name = "authentik-outpost"
host = "authentik"
service_name = "ak-outpost-authentik-embedded-outpost"
port = 9000
ingress_path = ["/outpost.goauthentik.io"]
tls_secret_name = var.tls_secret_name
anti_ai_scraping = false
exclude_crowdsec = true
}

View file

@ -6,7 +6,7 @@ listen_addr = 0.0.0.0
listen_port = 6432
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt
pool_mode = transaction
pool_mode = session
max_client_conn = 200
default_pool_size = 20
reserve_pool_size = 5

View file

@ -1,19 +1,19 @@
authentik:
log_level: warning
# log_level: trace
secret_key: "${secret_key}"
secret_key: ""
existingSecret:
secretName: "goauthentik"
# This sends anonymous usage-data, stack traces on errors and
# performance data to authentik.error-reporting.a7k.io, and is fully opt-in
error_reporting:
enabled: true
enabled: false
postgresql:
# host: postgresql.dbaas
host: pgbouncer.authentik
port: 6432
user: authentik
password: ${postgres_password}
redis:
host: ${redis_host}
password: ""
server:
replicas: 2
@ -58,9 +58,9 @@ worker:
resources:
requests:
cpu: 100m
memory: 1Gi
memory: 1.5Gi
limits:
memory: 1Gi
memory: 1.5Gi
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
@ -71,3 +71,6 @@ worker:
pdb:
enabled: true
maxUnavailable: 1
postgresql:
enabled: false

View file

@ -41,44 +41,6 @@ resource "kubernetes_manifest" "external_secret" {
depends_on = [kubernetes_namespace.diun]
}
resource "kubernetes_manifest" "external_secret_git" {
manifest = {
apiVersion = "external-secrets.io/v1beta1"
kind = "ExternalSecret"
metadata = {
name = "diun-git-secrets"
namespace = "diun"
}
spec = {
refreshInterval = "15m"
secretStoreRef = {
name = "vault-kv"
kind = "ClusterSecretStore"
}
target = {
name = "diun-git-secrets"
}
data = [
{
secretKey = "git_token"
remoteRef = {
key = "viktor"
property = "webhook_handler_git_token"
}
},
{
secretKey = "git_user"
remoteRef = {
key = "viktor"
property = "webhook_handler_git_user"
}
}
]
}
}
depends_on = [kubernetes_namespace.diun]
}
module "tls_secret" {
source = "../../modules/kubernetes/setup_tls_secret"
namespace = kubernetes_namespace.diun.metadata[0].name
@ -119,28 +81,6 @@ resource "kubernetes_cluster_role_binding" "diun" {
}
}
resource "kubernetes_persistent_volume_claim" "repo" {
wait_until_bound = false
metadata {
name = "diun-repo"
namespace = kubernetes_namespace.diun.metadata[0].name
annotations = {
"resize.topolvm.io/threshold" = "80%"
"resize.topolvm.io/increase" = "100%"
"resize.topolvm.io/storage_limit" = "5Gi"
}
}
spec {
access_modes = ["ReadWriteOnce"]
storage_class_name = "proxmox-lvm"
resources {
requests = {
storage = "1Gi"
}
}
}
}
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
wait_until_bound = false
metadata {
@ -163,81 +103,6 @@ resource "kubernetes_persistent_volume_claim" "data_proxmox" {
}
}
resource "kubernetes_config_map_v1" "auto_update_script" {
metadata {
name = "diun-auto-update-script"
namespace = kubernetes_namespace.diun.metadata[0].name
}
data = {
"auto-update.sh" = <<-SCRIPT
#!/bin/sh
set -e
# Only act on updates (not new or unchanged)
[ "$$DIUN_ENTRY_STATUS" = "update" ] || exit 0
IMAGE="$$DIUN_ENTRY_IMAGE"
NEW_TAG="$$DIUN_ENTRY_IMAGETAG"
echo "[auto-update] Detected update: $$IMAGE -> $$NEW_TAG"
# Skip databases
case "$$IMAGE" in
*postgres*|*mysql*|*redis*|*clickhouse*|*etcd*) echo "[auto-update] Skipping database image"; exit 0 ;;
esac
# Skip custom images (handled by CI/CD)
case "$$IMAGE" in
viktorbarzin/*|registry.viktorbarzin.me/*|ancamilea/*|mghee/*) echo "[auto-update] Skipping CI/CD-managed image"; exit 0 ;;
esac
# Skip kube-system / infrastructure images
case "$$IMAGE" in
registry.k8s.io/*|quay.io/tigera/*|quay.io/metallb/*|nvcr.io/*|reg.kyverno.io/*) echo "[auto-update] Skipping infrastructure image"; exit 0 ;;
esac
# Acquire lock (serialize concurrent DIUN notifications)
exec 200>/tmp/auto-update.lock
flock -n 200 || { echo "[auto-update] Another update in progress, skipping"; exit 0; }
cd /repo
# Configure git
git config user.email "diun@viktorbarzin.me"
git config user.name "DIUN Auto-Update"
# Pull latest using HTTPS with token
git remote set-url origin "https://$${GIT_USER}:$${GIT_TOKEN}@github.com/ViktorBarzin/infra.git"
git pull --rebase origin master || { echo "[auto-update] git pull failed"; exit 1; }
# Find .tf files containing this image
MATCHES=$$(grep -rl "\"$${IMAGE}:" stacks/ --include="*.tf" 2>/dev/null || true)
[ -z "$$MATCHES" ] && { echo "[auto-update] No .tf file found for $$IMAGE"; exit 0; }
# Update the image tag in each matching file
UPDATED=0
for FILE in $$MATCHES; do
if sed -i "s|\"$${IMAGE}:[^\"]*\"|\"$${IMAGE}:$${NEW_TAG}\"|g" "$$FILE"; then
echo "[auto-update] Updated $$FILE"
UPDATED=1
fi
done
# Check if anything actually changed
if git diff --quiet; then
echo "[auto-update] No changes after update for $$IMAGE:$$NEW_TAG (already up to date)"
exit 0
fi
# Commit and push
git add -A stacks/
git commit -m "auto-update: $${IMAGE} -> $${NEW_TAG}"
git push origin master
echo "[auto-update] Pushed update: $${IMAGE}:$${NEW_TAG}"
SCRIPT
}
}
resource "kubernetes_deployment" "diun" {
metadata {
name = "diun"
@ -269,50 +134,6 @@ resource "kubernetes_deployment" "diun" {
}
spec {
service_account_name = "diun"
init_container {
name = "clone-repo"
image = "alpine/git:latest"
command = ["/bin/sh", "-c"]
args = [<<-EOF
if [ -d /repo/.git ]; then
cd /repo && git pull --rebase origin master || true
else
git clone https://$${GIT_USER}:$${GIT_TOKEN}@github.com/ViktorBarzin/infra.git /repo
fi
EOF
]
env {
name = "GIT_USER"
value_from {
secret_key_ref {
name = "diun-git-secrets"
key = "git_user"
}
}
}
env {
name = "GIT_TOKEN"
value_from {
secret_key_ref {
name = "diun-git-secrets"
key = "git_token"
}
}
}
volume_mount {
name = "repo"
mount_path = "/repo"
}
resources {
requests = {
cpu = "10m"
memory = "64Mi"
}
limits = {
memory = "128Mi"
}
}
}
container {
image = "viktorbarzin/diun:latest"
name = "diun"
@ -349,12 +170,25 @@ resource "kubernetes_deployment" "diun" {
name = "DIUN_DEFAULTS_SORTTAGS"
value = "reverse"
}
# Script notifier for auto-updates
# Webhook notifier for upgrade agent (via n8n)
env {
name = "DIUN_NOTIF_SCRIPT_CMD"
value = "/scripts/auto-update.sh"
name = "DIUN_NOTIF_WEBHOOK_ENDPOINT"
value_from {
secret_key_ref {
name = "diun-secrets"
key = "n8n_webhook_url"
}
}
}
# Slack notifier (kept alongside script notifier)
env {
name = "DIUN_NOTIF_WEBHOOK_METHOD"
value = "POST"
}
env {
name = "DIUN_NOTIF_WEBHOOK_HEADERS_CONTENT-TYPE"
value = "application/json"
}
# Slack notifier (independent notification channel)
env {
name = "DIUN_NOTIF_SLACK_WEBHOOKURL"
value_from {
@ -364,25 +198,6 @@ resource "kubernetes_deployment" "diun" {
}
}
}
# Git credentials for auto-update script
env {
name = "GIT_USER"
value_from {
secret_key_ref {
name = "diun-git-secrets"
key = "git_user"
}
}
}
env {
name = "GIT_TOKEN"
value_from {
secret_key_ref {
name = "diun-git-secrets"
key = "git_token"
}
}
}
env {
name = "LOG_LEVEL"
value = "debug"
@ -391,14 +206,6 @@ resource "kubernetes_deployment" "diun" {
name = "data"
mount_path = "/data"
}
volume_mount {
name = "scripts"
mount_path = "/scripts"
}
volume_mount {
name = "repo"
mount_path = "/repo"
}
resources {
requests = {
cpu = "10m"
@ -415,19 +222,6 @@ resource "kubernetes_deployment" "diun" {
claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
}
}
volume {
name = "scripts"
config_map {
name = kubernetes_config_map_v1.auto_update_script.metadata[0].name
default_mode = "0755"
}
}
volume {
name = "repo"
persistent_volume_claim {
claim_name = kubernetes_persistent_volume_claim.repo.metadata[0].name
}
}
}
}
}

View file

@ -34,6 +34,6 @@ module "mailserver" {
sasl_passwd = local.mailserver_sasl_passwd
roundcube_db_password = data.vault_kv_secret_v2.secrets.data["mailserver_roundcubemail_db_password"]
tier = local.tiers.edge
mailgun_api_key = data.vault_kv_secret_v2.viktor.data["mailgun_api_key"]
brevo_api_key = jsondecode(base64decode(data.vault_kv_secret_v2.viktor.data["brevo_api_key"]))["api_key"]
email_monitor_imap_password = local.mailserver_accounts["spam@viktorbarzin.me"]
}

View file

@ -5,7 +5,7 @@ variable "postfix_account_aliases" {}
variable "opendkim_key" {}
variable "sasl_passwd" {} # For sendgrid i.e relayhost
variable "nfs_server" { type = string }
variable "mailgun_api_key" {
variable "brevo_api_key" {
type = string
sensitive = true
}
@ -537,7 +537,7 @@ resource "kubernetes_service" "mailserver" {
# =============================================================================
# E2E Email Roundtrip Monitor
# Sends test email via Mailgun API, verifies delivery via IMAP, pushes metrics
# Sends test email via Brevo API, verifies delivery via IMAP, pushes metrics
# =============================================================================
resource "kubernetes_cron_job_v1" "email_roundtrip_monitor" {
metadata {
@ -562,9 +562,9 @@ resource "kubernetes_cron_job_v1" "email_roundtrip_monitor" {
image = "docker.io/library/python:3.12-alpine"
command = ["/bin/sh", "-c", <<-EOT
pip install --quiet --disable-pip-version-check requests && python3 -c '
import requests, imaplib, email, time, os, uuid, sys, ssl
import requests, imaplib, email, time, os, uuid, sys, ssl, json
MAILGUN_API_KEY = os.environ["MAILGUN_API_KEY"]
BREVO_API_KEY = os.environ["BREVO_API_KEY"]
IMAP_USER = "spam@viktorbarzin.me"
IMAP_PASS = os.environ["EMAIL_MONITOR_IMAP_PASSWORD"]
IMAP_HOST = "mailserver.mailserver.svc.cluster.local"
@ -578,20 +578,24 @@ success = 0
duration = 0
try:
# Step 1: Send via Mailgun HTTP API to smoke-test@ (hits catch-all -> spam@)
# Step 1: Send via Brevo Transactional Email API to smoke-test@ (hits catch-all -> spam@)
resp = requests.post(
f"https://api.eu.mailgun.net/v3/{DOMAIN}/messages",
auth=("api", MAILGUN_API_KEY),
data={
"from": f"monitoring@{DOMAIN}",
"to": f"smoke-test@{DOMAIN}",
"https://api.brevo.com/v3/smtp/email",
headers={
"api-key": BREVO_API_KEY,
"Content-Type": "application/json",
"Accept": "application/json",
},
json={
"sender": {"name": "Monitoring", "email": f"monitoring@{DOMAIN}"},
"to": [{"email": f"smoke-test@{DOMAIN}"}],
"subject": subject,
"text": f"E2E email monitoring probe {marker}. Auto-generated, will be deleted.",
"textContent": f"E2E email monitoring probe {marker}. Auto-generated, will be deleted.",
},
timeout=30,
)
resp.raise_for_status()
print(f"Sent test email via Mailgun: {resp.status_code} marker={marker}")
print(f"Sent test email via Brevo: {resp.status_code} marker={marker}")
# Step 2: Wait for delivery, retry IMAP up to 3 min
ctx = ssl.create_default_context()
@ -667,8 +671,8 @@ sys.exit(0 if success else 1)
EOT
]
env {
name = "MAILGUN_API_KEY"
value = var.mailgun_api_key
name = "BREVO_API_KEY"
value = var.brevo_api_key
}
env {
name = "EMAIL_MONITOR_IMAP_PASSWORD"

View file

@ -114,19 +114,43 @@ resource "kubernetes_deployment" "meshcentral" {
image_pull_policy = "IfNotPresent"
command = ["/bin/sh"]
args = ["-c", <<-EOT
if [ -f /opt/meshcentral/meshcentral-data/config.json ]; then
CONFIG=/opt/meshcentral/meshcentral-data/config.json
if [ -f "$CONFIG" ]; then
# Disable certUrl when using Traefik reverse proxy with TLS offload
sed -i 's/"certUrl":/"_certUrl":/g' /opt/meshcentral/meshcentral-data/config.json
sed -i 's/"certUrl":/"_certUrl":/g' "$CONFIG"
# Fix WebRTC value from string to boolean
sed -i 's/"WebRTC": "[^"]*"/"WebRTC": false/g' /opt/meshcentral/meshcentral-data/config.json
sed -i 's/"WebRTC": "[^"]*"/"WebRTC": false/g' "$CONFIG"
# Ensure TLSOffload is enabled (Traefik terminates TLS, MeshCentral serves HTTP on 443)
# Re-enable if previously disabled by restoring _TLSOffload back to TLSOffload
sed -i 's/"_TLSOffload":/"TLSOffload":/g' /opt/meshcentral/meshcentral-data/config.json
# Set TLSOffload to true (accepts any reverse proxy)
sed -i 's/"TLSOffload": "[^"]*"/"TLSOffload": true/g' /opt/meshcentral/meshcentral-data/config.json
sed -i 's/"TLSOffload": false/"TLSOffload": true/g' /opt/meshcentral/meshcentral-data/config.json
sed -i 's/"_TLSOffload":/"TLSOffload":/g' "$CONFIG"
sed -i 's/"TLSOffload": "[^"]*"/"TLSOffload": true/g' "$CONFIG"
sed -i 's/"TLSOffload": false/"TLSOffload": true/g' "$CONFIG"
else
# First run: create config from template before startup.sh runs, so REVERSE_PROXY
# env var doesn't generate a bad certUrl. Pre-seed with correct values.
cat > "$CONFIG" <<'CONF'
{
"$schema": "http://info.meshcentral.com/downloads/meshcentral-config-schema.json",
"settings": {
"cert": "meshcentral.viktorbarzin.me",
"_WANonly": true,
"_LANonly": true,
"port": 443,
"redirPort": 80,
"AgentPong": 300,
"TLSOffload": true,
"SelfUpdate": false,
"AllowFraming": false,
"WebRTC": false
},
"domains": {
"": {
"NewAccounts": false
}
}
}
CONF
fi
EOT
]
@ -153,7 +177,7 @@ EOT
}
env {
name = "REVERSE_PROXY"
value = "true"
value = "false"
}
env {
name = "ALLOW_NEW_ACCOUNTS"

View file

@ -1594,10 +1594,10 @@ serverFiles:
- alert: HighServiceLatency
expr: |
(
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service)
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service)
sum(rate(traefik_service_request_duration_seconds_sum{service!~".*idrac.*|.*headscale.*"}[5m])) by (service)
/ sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*"}[5m])) by (service)
) > 10
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*|.*authentik.*"}[5m])) by (service) > 0.01
and sum(rate(traefik_service_request_duration_seconds_count{service!~".*idrac.*|.*headscale.*"}[5m])) by (service) > 0.01
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 5m
labels:

View file

@ -30,12 +30,14 @@ nextcloud:
zzz-redis.config.php: |
<?php
// Redis with Sentinel-based master discovery
// Queries Sentinel at startup to find the current master, falls back to direct host
// Queries Sentinel to find the current master; falls back to the HAProxy service,
// which health-checks the Redis nodes and routes only to the master.
$sentinels = [
['redis-node-0.redis-headless.redis.svc.cluster.local', 26379],
['redis-node-1.redis-headless.redis.svc.cluster.local', 26379],
];
$redisHost = 'redis-node-0.redis-headless.redis.svc.cluster.local';
// Fallback: HAProxy master-only service (safe even if Sentinel is unavailable)
$redisHost = 'redis-master.redis.svc.cluster.local';
$redisPort = 6379;
foreach ($sentinels as [$sHost, $sPort]) {
try {
@ -145,7 +147,7 @@ readinessProbe:
podAnnotations:
diun.enable: "true"
diun.include_tags: "^[0-9]+(?:.[0-9]+)?(?:.[0-9]+)?.*"
dependency.kyverno.io/wait-for: "mysql.dbaas:3306,redis.redis:6379"
dependency.kyverno.io/wait-for: "mysql.dbaas:3306,redis-master.redis:6379"
secret.reloader.stakater.com/reload: "nextcloud-db-creds"
collabora:

View file

@ -154,11 +154,13 @@ resource "kubernetes_config_map" "haproxy" {
tcp-check send "PING\r\n"
tcp-check expect string +PONG
tcp-check send "INFO replication\r\n"
tcp-check expect string role:master
# Regex-match "role:master" only; that text cannot appear in slave responses
# (a slave reports "role:slave" followed by "master_host:...", which does not match)
tcp-check expect rstring role:master
tcp-check send "QUIT\r\n"
tcp-check expect string +OK
server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2
server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 3s fall 3 rise 2
server redis-node-0 redis-node-0.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
server redis-node-1 redis-node-1.redis-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2
backend redis_sentinel
balance roundrobin