postiz+portal: remove broken alert sources (stale backup CronJob, bogus scrape annotations)

Viktor has been getting daily Slack alert noise; these two sources were the recurring generators.

postiz: the postiz-postgres-backup CronJob still dumped from the old in-namespace postiz-postgresql service that was removed in the CNPG migration (2026-06-28), so it failed every night at 03:00 and re-fired BackupCronJobFailed each day. The postiz DB now lives on the shared CNPG cluster and is already covered by the dbaas per-db dumps, so the CronJob (and its NFS backup volume) is redundant and is removed rather than repaired.

portal-stt/portal-tts: both Services advertised prometheus.io scrape annotations that never worked. The deployed Speaches build returns 404 for /metrics, and openai-edge-tts has no metrics at all (its annotation pointed at a JSON endpoint, /v1/models, which fails exposition parsing regardless). Both produced a permanently firing ScrapeTargetDown. The annotations are removed until the apps actually serve metrics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-01 22:35:21 +00:00 · 2026-07-01 22:35:21 +00:00 · 3c476dab32
commit 3c476dab32
parent 5a312563c6
3 changed files with 6 additions and 121 deletions
--- a/stacks/portal-stt/main.tf
+++ b/stacks/portal-stt/main.tf
@ -334,13 +334,9 @@ resource "kubernetes_service" "portal_stt" {
    name      = "portal-stt"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = local.labels
-    annotations = {
-      # Speaches exposes Prometheus metrics at /metrics — wire annotation-based
-      # scrape (Ready-endpoint relabeling already filters non-Ready pods).
-      "prometheus.io/scrape" = "true"
-      "prometheus.io/path"   = "/metrics"
-      "prometheus.io/port"   = "8000"
-    }
+    # No scrape annotations: the deployed Speaches build 404s /metrics, so the
+    # annotation-based scrape only produced a permanently firing
+    # ScrapeTargetDown. Re-add when the app actually serves Prometheus metrics.
  }
  spec {
    type     = "ClusterIP"
--- a/stacks/portal-tts/main.tf
+++ b/stacks/portal-tts/main.tf
@ -184,14 +184,9 @@ resource "kubernetes_service" "portal_tts" {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = local.labels
-    annotations = {
-      # openai-edge-tts has no /metrics; annotation-based scrape kept on a live
-      # path so the Service stays in the scrape set (Ready-endpoint relabeling
-      # filters non-Ready pods). /v1/models is the OpenAI model list.
-      "prometheus.io/scrape" = "true"
-      "prometheus.io/path"   = "/v1/models"
-      "prometheus.io/port"   = "8000"
-    }
+    # No scrape annotations: openai-edge-tts exposes no Prometheus metrics, and
+    # scraping a JSON endpoint (/v1/models) fails exposition parsing anyway ->
+    # up=0 -> a permanently firing ScrapeTargetDown.
  }
  spec {
    type     = "ClusterIP"
--- a/stacks/postiz/modules/postiz/main.tf
+++ b/stacks/postiz/modules/postiz/main.tf
@ -493,109 +493,3 @@ resource "kubernetes_service" "temporal" {
    }
  }
 }
-
-# ──────────────────────────────────────────────────────────────────────────────
-# Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
-#
-# The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
-# OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
-# snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
-# CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
-# Proxmox host NFS → covered by inotify-driven offsite sync to Synology.
-# Three databases are dumped: postiz (app data), temporal (workflow engine),
-# temporal_visibility (workflow search). Bitnami chart-default credentials
-# are used — same creds the Postiz pod itself uses, scoped to the postiz
-# namespace via ClusterIP-only Services.
-# ──────────────────────────────────────────────────────────────────────────────
-
-module "nfs_backup_host" {
-  source     = "../../../../modules/kubernetes/nfs_volume"
-  name       = "postiz-backup-host"
-  namespace  = kubernetes_namespace.postiz.metadata[0].name
-  nfs_server = "192.168.1.127"
-  nfs_path   = "/srv/nfs/postiz-backup"
-}
-
-resource "kubernetes_cron_job_v1" "postgres_backup" {
-  metadata {
-    name      = "postiz-postgres-backup"
-    namespace = kubernetes_namespace.postiz.metadata[0].name
-    labels    = { app = "postiz", component = "backup" }
-  }
-  spec {
-    schedule                      = "0 3 * * *"
-    concurrency_policy            = "Forbid"
-    successful_jobs_history_limit = 3
-    failed_jobs_history_limit     = 5
-    job_template {
-      metadata {}
-      spec {
-        backoff_limit              = 1
-        ttl_seconds_after_finished = 86400
-        template {
-          metadata {
-            labels = { app = "postiz", component = "backup" }
-          }
-          spec {
-            restart_policy = "OnFailure"
-            container {
-              name = "backup"
-              # Same image/pattern as dbaas/postgresql-backup: official postgres
-              # client tools + apt-installed curl for the Pushgateway push. The
-              # bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
-              # so the metric push silently failed there.
-              image   = "docker.io/library/postgres:16.4-bullseye"
-              command = ["/bin/bash", "-c"]
-              args = [
-                <<-EOT
-                set -uo pipefail
-                apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
-                TIMESTAMP=$(date +%Y%m%d_%H%M)
-                BACKUP_DIR=/backup
-                STATUS=0
-                for db in postiz; do
-                  echo "Dumping $db..."
-                  if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
-                       --format=custom --compress=6 \
-                       --file="$BACKUP_DIR/$db-$TIMESTAMP.dump" \
-                       "$db"; then
-                    echo "  OK: $db ($(du -h "$BACKUP_DIR/$db-$TIMESTAMP.dump" | cut -f1))"
-                  else
-                    echo "  FAIL: $db" >&2
-                    STATUS=1
-                  fi
-                done
-                find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
-                {
-                  echo "backup_last_run_timestamp $(date +%s)"
-                  echo "backup_last_status $STATUS"
-                  [ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
-                } | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
-                  "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
-                exit $STATUS
-                EOT
-              ]
-              volume_mount {
-                name       = "backup"
-                mount_path = "/backup"
-              }
-              resources {
-                requests = { cpu = "10m", memory = "64Mi" }
-                limits   = { memory = "256Mi" }
-              }
-            }
-            volume {
-              name = "backup"
-              persistent_volume_claim {
-                claim_name = module.nfs_backup_host.claim_name
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  lifecycle {
-    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
-  }
-}