Fix celery metrics not reaching Prometheus and update Grafana dashboard

Init OTel metrics at module level in celery_app.py so prefork child
processes inherit the MeterProvider and PrometheusMetricReader from
the parent.  Previously, worker_process_init created a separate
MeterProvider in each child, disconnected from the HTTP server in
the main process — so all scrape/celery/OCR metrics were silently
lost.

Update Grafana dashboard with API Performance and Frontend Performance
sections, synced from the live cluster dashboard.
This commit is contained in:
Viktor Barzin 2026-02-22 17:58:20 +00:00
parent bfee06525b
commit 67d4ab3821
No known key found for this signature in database
GPG key ID: 0EB088298288D958
2 changed files with 2510 additions and 250 deletions

View file

@ -1,7 +1,7 @@
import sys
import time
from celery import Celery
from celery.signals import worker_ready, worker_process_init, task_prerun, task_postrun
from celery.signals import worker_ready, task_prerun, task_postrun
from dotenv import load_dotenv
import os
@ -34,20 +34,17 @@ CELERY_METRICS_PORT = int(os.getenv("CELERY_METRICS_PORT", "9090"))
# Track task start times for duration measurement.
# Maps a string key to a float start timestamp.  Presumably keyed by Celery
# task id and written/read by the task_prerun/task_postrun handlers, which
# are not visible in this excerpt — TODO confirm.
_task_start_times: dict[str, float] = {}
@worker_process_init.connect
def _init_worker_metrics(**kwargs: object) -> None:
    """Set up OTel metrics inside a prefork worker process.

    Connected to Celery's ``worker_process_init`` signal.  The service name
    is read from the ``SERVICE_NAME`` environment variable and falls back to
    ``"celery-worker"`` when it is unset.  Signal keyword arguments are
    accepted but not used.
    """
    # Imported lazily, inside the handler, exactly as the original does.
    from api.metrics import init_metrics

    service_name = os.getenv("SERVICE_NAME", "celery-worker")
    init_metrics(service_name)
# Initialise OTel metrics at module level (i.e. at import time, in the parent
# process) so prefork children inherit the MeterProvider and
# PrometheusMetricReader.  The prometheus_client collectors are registered in
# the default registry before fork, so child-process recordings are visible
# to the HTTP server started in the main process.
# The import sits below other module-level code, hence the E402 suppression;
# the underscore alias presumably keeps the helper out of the module's public
# names.  SERVICE_NAME falls back to "celery-worker" when unset.
from api.metrics import init_metrics as _init_metrics # noqa: E402
_init_metrics(os.getenv("SERVICE_NAME", "celery-worker"))
@worker_ready.connect
def _start_metrics_server(**kwargs: object) -> None:
"""Start a lightweight Prometheus HTTP server in the main worker process."""
from api.metrics import init_metrics
init_metrics(os.getenv("SERVICE_NAME", "celery-worker"))
from prometheus_client import start_http_server
start_http_server(CELERY_METRICS_PORT)

File diff suppressed because it is too large Load diff