postiz+portal: remove broken alert sources (stale backup CronJob, bogus scrape annotations)

Viktor is getting daily Slack alert noise; these two were the recurring
generators. The postiz-postgres-backup CronJob still dumped from the old
in-namespace postiz-postgresql service that was removed in the CNPG
migration (2026-06-28) — it failed every night at 03:00 and re-fired
BackupCronJobFailed each day. The postiz DB now lives on the shared CNPG
cluster and is already covered by the dbaas per-db dumps, so the CronJob
(and its NFS backup volume) is redundant and removed rather than repaired.

portal-stt/portal-tts advertised prometheus.io scrape annotations that
never worked: the deployed Speaches build 404s /metrics, and openai-edge-tts
has no metrics at all (its annotation pointed at a JSON endpoint, which
fails exposition parsing regardless). Both produced a permanently firing
ScrapeTargetDown. Annotations removed until the apps actually serve metrics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-07-01 22:35:21 +00:00
parent 5a312563c6
commit 3c476dab32
3 changed files with 6 additions and 121 deletions

View file

@ -334,13 +334,9 @@ resource "kubernetes_service" "portal_stt" {
name = "portal-stt" name = "portal-stt"
namespace = kubernetes_namespace.portal_stt.metadata[0].name namespace = kubernetes_namespace.portal_stt.metadata[0].name
labels = local.labels labels = local.labels
annotations = { # No scrape annotations: the deployed Speaches build 404s /metrics, so the
# Speaches exposes Prometheus metrics at /metrics — wire annotation-based # annotation-based scrape only produced a permanently firing
# scrape (Ready-endpoint relabeling already filters non-Ready pods). # ScrapeTargetDown. Re-add when the app actually serves Prometheus metrics.
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/metrics"
"prometheus.io/port" = "8000"
}
} }
spec { spec {
type = "ClusterIP" type = "ClusterIP"

View file

@ -184,14 +184,9 @@ resource "kubernetes_service" "portal_tts" {
name = "portal-tts" name = "portal-tts"
namespace = kubernetes_namespace.portal_tts.metadata[0].name namespace = kubernetes_namespace.portal_tts.metadata[0].name
labels = local.labels labels = local.labels
annotations = { # No scrape annotations: openai-edge-tts exposes no Prometheus metrics, and
# openai-edge-tts has no /metrics; annotation-based scrape kept on a live # scraping a JSON endpoint (/v1/models) fails exposition parsing anyway ->
# path so the Service stays in the scrape set (Ready-endpoint relabeling # up=0 -> a permanently firing ScrapeTargetDown.
# filters non-Ready pods). /v1/models is the OpenAI model list.
"prometheus.io/scrape" = "true"
"prometheus.io/path" = "/v1/models"
"prometheus.io/port" = "8000"
}
} }
spec { spec {
type = "ClusterIP" type = "ClusterIP"

View file

@ -493,109 +493,3 @@ resource "kubernetes_service" "temporal" {
} }
} }
} }
#
# Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
#
# The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
# OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
# snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
# CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
# Proxmox host NFS covered by inotify-driven offsite sync to Synology.
# Only the postiz database (app data) is dumped: the loop in the CronJob below
# lists just `postiz`; temporal (workflow engine) and temporal_visibility
# (workflow search) are not included. Bitnami chart-default credentials
# are used — same creds the Postiz pod itself uses, scoped to the postiz
# namespace via ClusterIP-only Services.
#
# NFS-backed volume that receives the pg_dump files. The module's `claim_name`
# output is mounted at /backup by the postiz-postgres-backup CronJob.
module "nfs_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  # Kubernetes-side identity of the volume.
  namespace = kubernetes_namespace.postiz.metadata[0].name
  name      = "postiz-backup-host"

  # NFS export backing the volume.
  nfs_path   = "/srv/nfs/postiz-backup"
  nfs_server = "192.168.1.127"
}
# Nightly (cron "0 3 * * *") pg_dump of the postiz database into the NFS-backed
# /backup volume, followed by a best-effort status push to the Prometheus
# Pushgateway under job "postiz-postgres-backup".
#
# Behaviour of the embedded script:
#   * each database in the loop is dumped in custom format (compress level 6);
#   * a failed dump removes its partial output file and marks the run failed;
#   * dumps older than 30 days are pruned ONLY after a fully successful run, so
#     a persistently failing job cannot age out the last good backups;
#   * the script exits with the dump status, so a failed dump fails the Job.
resource "kubernetes_cron_job_v1" "postgres_backup" {
  metadata {
    name      = "postiz-postgres-backup"
    namespace = kubernetes_namespace.postiz.metadata[0].name
    labels    = { app = "postiz", component = "backup" }
  }
  spec {
    schedule = "0 3 * * *"
    # Never start a new run while the previous one is still active.
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 5
    job_template {
      metadata {}
      spec {
        backoff_limit = 1
        # Finished Jobs are garbage-collected after 24h.
        ttl_seconds_after_finished = 86400
        template {
          metadata {
            labels = { app = "postiz", component = "backup" }
          }
          spec {
            restart_policy = "OnFailure"
            container {
              name = "backup"
              # Same image/pattern as dbaas/postgresql-backup: official postgres
              # client tools + apt-installed curl for the Pushgateway push. The
              # bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
              # so the metric push silently failed there.
              image   = "docker.io/library/postgres:16.4-bullseye"
              command = ["/bin/bash", "-c"]
              # NOTE(review): the database password is hard-coded in the script
              # below (chart-default credentials); consider sourcing it from a
              # Secret via an env var instead.
              args = [
                <<-EOT
                set -uo pipefail
                # Best-effort: curl is only needed for the metrics push below.
                apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
                TIMESTAMP=$(date +%Y%m%d_%H%M)
                BACKUP_DIR=/backup
                STATUS=0
                for db in postiz; do
                  echo "Dumping $db..."
                  OUT="$BACKUP_DIR/$db-$TIMESTAMP.dump"
                  if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
                    --format=custom --compress=6 \
                    --file="$OUT" \
                    "$db"; then
                    echo " OK: $db ($(du -h "$OUT" | cut -f1))"
                  else
                    echo " FAIL: $db" >&2
                    # Do not leave a truncated dump that looks like a valid backup.
                    rm -f "$OUT"
                    STATUS=1
                  fi
                done
                # Prune old dumps only after a fully successful run; otherwise a
                # job that fails every night would eventually delete every good dump.
                if [ "$STATUS" -eq 0 ]; then
                  find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
                fi
                {
                  echo "backup_last_run_timestamp $(date +%s)"
                  echo "backup_last_status $STATUS"
                  [ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
                } | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
                  "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
                exit $STATUS
                EOT
              ]
              volume_mount {
                name       = "backup"
                mount_path = "/backup"
              }
              resources {
                requests = { cpu = "10m", memory = "64Mi" }
                limits   = { memory = "256Mi" }
              }
            }
            volume {
              name = "backup"
              persistent_volume_claim {
                claim_name = module.nfs_backup_host.claim_name
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    # NOTE(review): presumably dns_config is mutated in-cluster (the marker
    # suggests a Kyverno policy) and is ignored to avoid perpetual drift — confirm.
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}