postiz+portal: remove broken alert sources (stale backup CronJob, bogus scrape annotations)
Viktor is getting daily Slack alert noise; these two were the recurring generators.

The postiz-postgres-backup CronJob still dumped from the old in-namespace postiz-postgresql service that was removed in the CNPG migration (2026-06-28) — it failed every night at 03:00 and re-fired BackupCronJobFailed each day. The postiz DB now lives on the shared CNPG cluster and is already covered by the dbaas per-db dumps, so the CronJob (and its NFS backup volume) is redundant and is removed rather than repaired.

portal-stt/portal-tts advertised prometheus.io scrape annotations that never worked: the deployed Speaches build 404s /metrics, and openai-edge-tts has no metrics at all (its annotation pointed at the JSON endpoint /v1/models, which fails exposition parsing regardless). Both produced a permanently firing ScrapeTargetDown. The annotations are removed until the apps actually serve metrics.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
5a312563c6
commit
3c476dab32
3 changed files with 6 additions and 121 deletions
|
|
@ -334,13 +334,9 @@ resource "kubernetes_service" "portal_stt" {
|
||||||
name = "portal-stt"
|
name = "portal-stt"
|
||||||
namespace = kubernetes_namespace.portal_stt.metadata[0].name
|
namespace = kubernetes_namespace.portal_stt.metadata[0].name
|
||||||
labels = local.labels
|
labels = local.labels
|
||||||
annotations = {
|
# No scrape annotations: the deployed Speaches build 404s /metrics, so the
|
||||||
# Speaches exposes Prometheus metrics at /metrics — wire annotation-based
|
# annotation-based scrape only produced a permanently firing
|
||||||
# scrape (Ready-endpoint relabeling already filters non-Ready pods).
|
# ScrapeTargetDown. Re-add when the app actually serves Prometheus metrics.
|
||||||
"prometheus.io/scrape" = "true"
|
|
||||||
"prometheus.io/path" = "/metrics"
|
|
||||||
"prometheus.io/port" = "8000"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
type = "ClusterIP"
|
type = "ClusterIP"
|
||||||
|
|
|
||||||
|
|
@ -184,14 +184,9 @@ resource "kubernetes_service" "portal_tts" {
|
||||||
name = "portal-tts"
|
name = "portal-tts"
|
||||||
namespace = kubernetes_namespace.portal_tts.metadata[0].name
|
namespace = kubernetes_namespace.portal_tts.metadata[0].name
|
||||||
labels = local.labels
|
labels = local.labels
|
||||||
annotations = {
|
# No scrape annotations: openai-edge-tts exposes no Prometheus metrics, and
|
||||||
# openai-edge-tts has no /metrics; annotation-based scrape kept on a live
|
# scraping a JSON endpoint (/v1/models) fails exposition parsing anyway ->
|
||||||
# path so the Service stays in the scrape set (Ready-endpoint relabeling
|
# up=0 -> a permanently firing ScrapeTargetDown.
|
||||||
# filters non-Ready pods). /v1/models is the OpenAI model list.
|
|
||||||
"prometheus.io/scrape" = "true"
|
|
||||||
"prometheus.io/path" = "/v1/models"
|
|
||||||
"prometheus.io/port" = "8000"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
spec {
|
spec {
|
||||||
type = "ClusterIP"
|
type = "ClusterIP"
|
||||||
|
|
|
||||||
|
|
@ -493,109 +493,3 @@ resource "kubernetes_service" "temporal" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
# Backup CronJob — nightly pg_dump of the bundled postiz-postgresql to NFS.
|
|
||||||
#
|
|
||||||
# The bundled PostgreSQL StatefulSet uses local-path storage on the K8s node
|
|
||||||
# OS disk (chart default), which is NOT covered by Layer 1 (LVM thin
|
|
||||||
# snapshots) or Layer 2 (sda file backup) of the 3-2-1 pipeline. A pg_dump
|
|
||||||
# CronJob writing to /srv/nfs/postiz-backup/ closes the gap: dumps land on
|
|
||||||
# Proxmox host NFS → covered by inotify-driven offsite sync to Synology.
|
|
||||||
# Three databases are dumped: postiz (app data), temporal (workflow engine),
|
|
||||||
# temporal_visibility (workflow search). Bitnami chart-default credentials
|
|
||||||
# are used — same creds the Postiz pod itself uses, scoped to the postiz
|
|
||||||
# namespace via ClusterIP-only Services.
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
module "nfs_backup_host" {
|
|
||||||
source = "../../../../modules/kubernetes/nfs_volume"
|
|
||||||
name = "postiz-backup-host"
|
|
||||||
namespace = kubernetes_namespace.postiz.metadata[0].name
|
|
||||||
nfs_server = "192.168.1.127"
|
|
||||||
nfs_path = "/srv/nfs/postiz-backup"
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "kubernetes_cron_job_v1" "postgres_backup" {
|
|
||||||
metadata {
|
|
||||||
name = "postiz-postgres-backup"
|
|
||||||
namespace = kubernetes_namespace.postiz.metadata[0].name
|
|
||||||
labels = { app = "postiz", component = "backup" }
|
|
||||||
}
|
|
||||||
spec {
|
|
||||||
schedule = "0 3 * * *"
|
|
||||||
concurrency_policy = "Forbid"
|
|
||||||
successful_jobs_history_limit = 3
|
|
||||||
failed_jobs_history_limit = 5
|
|
||||||
job_template {
|
|
||||||
metadata {}
|
|
||||||
spec {
|
|
||||||
backoff_limit = 1
|
|
||||||
ttl_seconds_after_finished = 86400
|
|
||||||
template {
|
|
||||||
metadata {
|
|
||||||
labels = { app = "postiz", component = "backup" }
|
|
||||||
}
|
|
||||||
spec {
|
|
||||||
restart_policy = "OnFailure"
|
|
||||||
container {
|
|
||||||
name = "backup"
|
|
||||||
# Same image/pattern as dbaas/postgresql-backup: official postgres
|
|
||||||
# client tools + apt-installed curl for the Pushgateway push. The
|
|
||||||
# bitnamilegacy/postgresql variant is stripped (no curl/wget/python),
|
|
||||||
# so the metric push silently failed there.
|
|
||||||
image = "docker.io/library/postgres:16.4-bullseye"
|
|
||||||
command = ["/bin/bash", "-c"]
|
|
||||||
args = [
|
|
||||||
<<-EOT
|
|
||||||
set -uo pipefail
|
|
||||||
apt-get update -qq && apt-get install -yqq curl >/dev/null 2>&1 || true
|
|
||||||
TIMESTAMP=$(date +%Y%m%d_%H%M)
|
|
||||||
BACKUP_DIR=/backup
|
|
||||||
STATUS=0
|
|
||||||
for db in postiz; do
|
|
||||||
echo "Dumping $db..."
|
|
||||||
if PGPASSWORD=postiz-password pg_dump -h postiz-postgresql -U postiz \
|
|
||||||
--format=custom --compress=6 \
|
|
||||||
--file="$BACKUP_DIR/$db-$TIMESTAMP.dump" \
|
|
||||||
"$db"; then
|
|
||||||
echo " OK: $db ($(du -h "$BACKUP_DIR/$db-$TIMESTAMP.dump" | cut -f1))"
|
|
||||||
else
|
|
||||||
echo " FAIL: $db" >&2
|
|
||||||
STATUS=1
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
find "$BACKUP_DIR" -name '*.dump' -mtime +30 -delete 2>/dev/null || true
|
|
||||||
{
|
|
||||||
echo "backup_last_run_timestamp $(date +%s)"
|
|
||||||
echo "backup_last_status $STATUS"
|
|
||||||
[ "$STATUS" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
|
|
||||||
} | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
|
|
||||||
"http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/postiz-postgres-backup" || true
|
|
||||||
exit $STATUS
|
|
||||||
EOT
|
|
||||||
]
|
|
||||||
volume_mount {
|
|
||||||
name = "backup"
|
|
||||||
mount_path = "/backup"
|
|
||||||
}
|
|
||||||
resources {
|
|
||||||
requests = { cpu = "10m", memory = "64Mi" }
|
|
||||||
limits = { memory = "256Mi" }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
volume {
|
|
||||||
name = "backup"
|
|
||||||
persistent_volume_claim {
|
|
||||||
claim_name = module.nfs_backup_host.claim_name
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lifecycle {
|
|
||||||
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue