postiz+portal: remove broken alert sources (stale backup CronJob, bogus scrape annotations)

Viktor is getting daily Slack alert noise; these two were the recurring generators.

The postiz-postgres-backup CronJob still dumped from the old in-namespace postiz-postgresql service that was removed in the CNPG migration (2026-06-28) — it failed every night at 03:00 and re-fired BackupCronJobFailed each day. The postiz DB now lives on the shared CNPG cluster and is already covered by the dbaas per-db dumps, so the CronJob (and its NFS backup volume) is redundant and removed rather than repaired.

portal-stt/portal-tts advertised prometheus.io scrape annotations that never worked: the deployed Speaches build 404s /metrics, and openai-edge-tts has no metrics at all (its annotation pointed at a JSON endpoint, which fails exposition parsing regardless). Both produced a permanently firing ScrapeTargetDown. The annotations are removed until the apps actually serve metrics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-01 22:35:21 +00:00 · 2026-07-01 22:35:21 +00:00 · 3c476dab32
commit 3c476dab32
parent 5a312563c6
3 changed files with 6 additions and 121 deletions
--- a/stacks/portal-stt/main.tf
+++ b/stacks/portal-stt/main.tf
@ -334,13 +334,9 @@ resource "kubernetes_service" "portal_stt" {
    name      = "portal-stt"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = local.labels
-    annotations = {
+    # No scrape annotations: the deployed Speaches build 404s /metrics, so the
-      # Speaches exposes Prometheus metrics at /metrics — wire annotation-based
+    # annotation-based scrape only produced a permanently firing
-      # scrape (Ready-endpoint relabeling already filters non-Ready pods).
+    # ScrapeTargetDown. Re-add when the app actually serves Prometheus metrics.
      "prometheus.io/scrape" = "true"
      "prometheus.io/path"   = "/metrics"
      "prometheus.io/port"   = "8000"
    }
  }
  spec {
    type     = "ClusterIP"
--- a/stacks/portal-tts/main.tf
+++ b/stacks/portal-tts/main.tf
@ -184,14 +184,9 @@ resource "kubernetes_service" "portal_tts" {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = local.labels
-    annotations = {
+    # No scrape annotations: openai-edge-tts exposes no Prometheus metrics, and
-      # openai-edge-tts has no /metrics; annotation-based scrape kept on a live
+    # scraping a JSON endpoint (/v1/models) fails exposition parsing anyway ->
-      # path so the Service stays in the scrape set (Ready-endpoint relabeling
+    # up=0 -> a permanently firing ScrapeTargetDown.
      # filters non-Ready pods). /v1/models is the OpenAI model list.
      "prometheus.io/scrape" = "true"
      "prometheus.io/path"   = "/v1/models"
      "prometheus.io/port"   = "8000"
    }
  }
  spec {
    type     = "ClusterIP"
--- a/stacks/postiz/modules/postiz/main.tf
+++ b/stacks/postiz/modules/postiz/main.tf
@ -493,109 +493,3 @@ resource "kubernetes_service" "temporal" {
    }
  }
 }
 # ──────────────────────────────────────────────────────────────────────────────
 # Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
 #
 # The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
 # OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
 # snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
 # CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
 # Proxmox host NFS → covered by inotify-driven offsite sync to Synology.
 # Only the postiz database (app data) is dumped — the job script's loop lists
 # just `postiz`; temporal (workflow engine) and temporal_visibility (workflow
 # search) are NOT dumped by this job. Bitnami chart-default credentials
 # are used — same creds the Postiz pod itself uses, scoped to the postiz
 # namespace via ClusterIP-only Services.
 # ──────────────────────────────────────────────────────────────────────────────
 # NFS-backed volume for the backup job: the module's `claim_name` output is
 # mounted by the postgres_backup CronJob as its /backup directory.
 module "nfs_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  # Kubernetes-side identity of the volume.
  name      = "postiz-backup-host"
  namespace = kubernetes_namespace.postiz.metadata[0].name

  # NFS export that receives the dump files.
  nfs_path   = "/srv/nfs/postiz-backup"
  nfs_server = "192.168.1.127"
 }
 # Nightly CronJob that runs pg_dump against the `postiz` database, writes the
 # dump to the NFS-backed /backup volume, prunes old dumps, and pushes the run
 # status to the Prometheus Pushgateway.
 #
 # NOTE(review): the script connects to host `postiz-postgresql` with a literal
 # password embedded in the args string — confirm that Service still exists and
 # consider sourcing the credential from a Secret.
 resource "kubernetes_cron_job_v1" "postgres_backup" {
  metadata {
    name      = "postiz-postgres-backup"
    namespace = kubernetes_namespace.postiz.metadata[0].name
    labels    = { app = "postiz", component = "backup" }
  }
  spec {
    # Every day at 03:00; "Forbid" skips a run while the previous one is still
    # active instead of running two dumps concurrently.
    schedule                      = "0 3 * * *"
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 5
    job_template {
      metadata {}
      spec {
        # One retry at most; finished Jobs are garbage-collected after 24h
        # (86400 seconds).
        backoff_limit              = 1
        ttl_seconds_after_finished = 86400
        template {
          metadata {
            labels = { app = "postiz", component = "backup" }
          }
          spec {
            restart_policy = "OnFailure"
            container {
              name = "backup"
              # Same image/pattern as dbaas/postgresql-backup: official postgres
              # client tools + apt-installed curl for the Pushgateway push. The
              # bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
              # so the metric push silently failed there.
              image   = "docker.io/library/postgres:16.4-bullseye"
              command = ["/bin/bash", "-c"]
              # Script outline (the heredoc is the literal script text, so it is
              # documented here rather than inside the string):
              #   1. `set -uo pipefail` without `-e`: a failing command does not
              #      abort the script; failures are tracked in STATUS instead.
              #   2. Best-effort curl install (`|| true` swallows apt errors).
              #   3. For each db in the loop (currently only `postiz`): pg_dump
              #      in custom format with compression level 6 to
              #      /backup/<db>-<timestamp>.dump; a failed dump sets STATUS=1.
              #   4. Delete *.dump files older than 30 days (errors ignored).
              #   5. Push backup_last_run_timestamp and backup_last_status to the
              #      Pushgateway job "postiz-postgres-backup";
              #      backup_last_success_timestamp is only sent when STATUS is 0.
              #      A failed push is ignored (`|| true`).
              #   6. Exit with STATUS so a failed dump fails the Job.
              args = [
                <<-EOT
                set -uo pipefail
                apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
                TIMESTAMP=$(date +%Y%m%d_%H%M)
                BACKUP_DIR=/backup
                STATUS=0
                for db in postiz; do
                  echo "Dumping $db..."
                  if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
                       --format=custom --compress=6 \
                       --file="$BACKUP_DIR/$db-$TIMESTAMP.dump" \
                       "$db"; then
                    echo "  OK: $db ($(du -h "$BACKUP_DIR/$db-$TIMESTAMP.dump" | cut -f1))"
                  else
                    echo "  FAIL: $db" >&2
                    STATUS=1
                  fi
                done
                find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
                {
                  echo "backup_last_run_timestamp $(date +%s)"
                  echo "backup_last_status $STATUS"
                  [ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
                } | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
                  "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
                exit $STATUS
                EOT
              ]
              # Dump destination; backed by the "backup" volume declared below.
              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }
              # Memory is capped at 256Mi; no CPU limit is set.
              resources {
                requests = { cpu = "10m", memory = "64Mi" }
                limits   = { memory = "256Mi" }
              }
            }
            # PVC provided by the nfs_backup_host module.
            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_backup_host.claim_name
              }
            }
          }
        }
      }
    }
  }
  # Ignore drift on the pod's dns_config so Terraform does not try to revert it.
  # NOTE(review): presumably the field is injected by a Kyverno policy (per the
  # KYVERNO_LIFECYCLE_V1 tag) — confirm.
  lifecycle {
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
 }