postiz+portal: remove broken alert sources (stale backup CronJob, bogus scrape annotations)
Viktor is getting daily Slack alert noise; these two sources were the recurring generators.

The postiz-postgres-backup CronJob still dumped from the old in-namespace postiz-postgresql service that was removed in the CNPG migration (2026-06-28) — it failed every night at 03:00 and re-fired BackupCronJobFailed each day. The postiz DB now lives on the shared CNPG cluster and is already covered by the dbaas per-db dumps, so the CronJob (and its NFS backup volume) is redundant and removed rather than repaired.

portal-stt/portal-tts advertised prometheus.io scrape annotations that never worked: the deployed Speaches build 404s /metrics, and openai-edge-tts has no metrics at all (its annotation pointed at a JSON endpoint, which fails exposition parsing regardless). Both produced a permanently firing ScrapeTargetDown. Annotations removed until the apps actually serve metrics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
5a312563c6
commit
3c476dab32
3 changed files with 6 additions and 121 deletions
|
|
@@ -334,13 +334,9 @@ resource "kubernetes_service" "portal_stt" {
|
|||
name = "portal-stt"
|
||||
namespace = kubernetes_namespace.portal_stt.metadata[0].name
|
||||
labels = local.labels
|
||||
annotations = {
|
||||
# Speaches exposes Prometheus metrics at /metrics — wire annotation-based
|
||||
# scrape (Ready-endpoint relabeling already filters non-Ready pods).
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/path" = "/metrics"
|
||||
"prometheus.io/port" = "8000"
|
||||
}
|
||||
# No scrape annotations: the deployed Speaches build 404s /metrics, so the
|
||||
# annotation-based scrape only produced a permanently firing
|
||||
# ScrapeTargetDown. Re-add when the app actually serves Prometheus metrics.
|
||||
}
|
||||
spec {
|
||||
type = "ClusterIP"
|
||||
|
|
|
|||
|
|
@@ -184,14 +184,9 @@ resource "kubernetes_service" "portal_tts" {
|
|||
name = "portal-tts"
|
||||
namespace = kubernetes_namespace.portal_tts.metadata[0].name
|
||||
labels = local.labels
|
||||
annotations = {
|
||||
# openai-edge-tts has no /metrics; annotation-based scrape kept on a live
|
||||
# path so the Service stays in the scrape set (Ready-endpoint relabeling
|
||||
# filters non-Ready pods). /v1/models is the OpenAI model list.
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/path" = "/v1/models"
|
||||
"prometheus.io/port" = "8000"
|
||||
}
|
||||
# No scrape annotations: openai-edge-tts exposes no Prometheus metrics, and
|
||||
# scraping a JSON endpoint (/v1/models) fails exposition parsing anyway ->
|
||||
# up=0 -> a permanently firing ScrapeTargetDown.
|
||||
}
|
||||
spec {
|
||||
type = "ClusterIP"
|
||||
|
|
|
|||
|
|
@@ -493,109 +493,3 @@ resource "kubernetes_service" "temporal" {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
|
||||
#
|
||||
# The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
|
||||
# OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
|
||||
# snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
|
||||
# CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
|
||||
# Proxmox host NFS → covered by inotify-driven offsite sync to Synology.
|
||||
# Three databases are dumped: postiz (app data), temporal (workflow engine),
|
||||
# temporal_visibility (workflow search). Bitnami chart-default credentials
|
||||
# are used — same creds the Postiz pod itself uses, scoped to the postiz
|
||||
# namespace via ClusterIP-only Services.
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
module "nfs_backup_host" {
|
||||
source = "../../../../modules/kubernetes/nfs_volume"
|
||||
name = "postiz-backup-host"
|
||||
namespace = kubernetes_namespace.postiz.metadata[0].name
|
||||
nfs_server = "192.168.1.127"
|
||||
nfs_path = "/srv/nfs/postiz-backup"
|
||||
}
|
||||
|
||||
resource "kubernetes_cron_job_v1" "postgres_backup" {
|
||||
metadata {
|
||||
name = "postiz-postgres-backup"
|
||||
namespace = kubernetes_namespace.postiz.metadata[0].name
|
||||
labels = { app = "postiz", component = "backup" }
|
||||
}
|
||||
spec {
|
||||
schedule = "0 3 * * *"
|
||||
concurrency_policy = "Forbid"
|
||||
successful_jobs_history_limit = 3
|
||||
failed_jobs_history_limit = 5
|
||||
job_template {
|
||||
metadata {}
|
||||
spec {
|
||||
backoff_limit = 1
|
||||
ttl_seconds_after_finished = 86400
|
||||
template {
|
||||
metadata {
|
||||
labels = { app = "postiz", component = "backup" }
|
||||
}
|
||||
spec {
|
||||
restart_policy = "OnFailure"
|
||||
container {
|
||||
name = "backup"
|
||||
# Same image/pattern as dbaas/postgresql-backup: official postgres
|
||||
# client tools + apt-installed curl for the Pushgateway push. The
|
||||
# bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
|
||||
# so the metric push silently failed there.
|
||||
image = "docker.io/library/postgres:16.4-bullseye"
|
||||
command = ["/bin/bash", "-c"]
|
||||
args = [
|
||||
<<-EOT
|
||||
set -uo pipefail
|
||||
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M)
|
||||
BACKUP_DIR=/backup
|
||||
STATUS=0
|
||||
for db in postiz; do
|
||||
echo "Dumping $db..."
|
||||
if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
|
||||
--format=custom --compress=6 \
|
||||
--file="$BACKUP_DIR/$db-$TIMESTAMP.dump" \
|
||||
"$db"; then
|
||||
echo " OK: $db ($(du -h "$BACKUP_DIR/$db-$TIMESTAMP.dump" | cut -f1))"
|
||||
else
|
||||
echo " FAIL: $db" >&2
|
||||
STATUS=1
|
||||
fi
|
||||
done
|
||||
find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
|
||||
{
|
||||
echo "backup_last_run_timestamp $(date +%s)"
|
||||
echo "backup_last_status $STATUS"
|
||||
[ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
|
||||
} | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
|
||||
"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
|
||||
exit $STATUS
|
||||
EOT
|
||||
]
|
||||
volume_mount {
|
||||
name = "backup"
|
||||
mount_path = "/backup"
|
||||
}
|
||||
resources {
|
||||
requests = { cpu = "10m", memory = "64Mi" }
|
||||
limits = { memory = "256Mi" }
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "backup"
|
||||
persistent_volume_claim {
|
||||
claim_name = module.nfs_backup_host.claim_name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue