Fix Celery metrics not reaching Prometheus and update Grafana dashboard

Initialise OTel metrics at module level in celery_app.py so prefork child
processes inherit the MeterProvider and PrometheusMetricReader from
the parent. Previously, worker_process_init created a separate
MeterProvider in each child, disconnected from the HTTP server in
the main process — so all scrape/celery/OCR metrics were silently
lost.

Update Grafana dashboard with API Performance and Frontend Performance
sections, synced from the live cluster dashboard.
This commit is contained in:
Viktor Barzin 2026-02-22 17:58:20 +00:00
parent bfee06525b
commit 67d4ab3821
No known key found for this signature in database
GPG key ID: 0EB088298288D958
2 changed files with 2510 additions and 250 deletions

View file

@ -1,7 +1,7 @@
import sys import sys
import time import time
from celery import Celery from celery import Celery
from celery.signals import worker_ready, worker_process_init, task_prerun, task_postrun from celery.signals import worker_ready, task_prerun, task_postrun
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
@ -34,20 +34,17 @@ CELERY_METRICS_PORT = int(os.getenv("CELERY_METRICS_PORT", "9090"))
# Track task start times for duration measurement # Track task start times for duration measurement
_task_start_times: dict[str, float] = {} _task_start_times: dict[str, float] = {}
# Initialise OTel metrics at module level so prefork children inherit the
@worker_process_init.connect # MeterProvider and PrometheusMetricReader. The prometheus_client collectors
def _init_worker_metrics(**kwargs: object) -> None: # are registered in the default registry before fork, so child-process
"""Initialise OTel metrics in each prefork worker process.""" # recordings are visible to the HTTP server started in the main process.
from api.metrics import init_metrics from api.metrics import init_metrics as _init_metrics # noqa: E402
init_metrics(os.getenv("SERVICE_NAME", "celery-worker")) _init_metrics(os.getenv("SERVICE_NAME", "celery-worker"))
@worker_ready.connect @worker_ready.connect
def _start_metrics_server(**kwargs: object) -> None: def _start_metrics_server(**kwargs: object) -> None:
"""Start a lightweight Prometheus HTTP server in the main worker process.""" """Start a lightweight Prometheus HTTP server in the main worker process."""
from api.metrics import init_metrics
init_metrics(os.getenv("SERVICE_NAME", "celery-worker"))
from prometheus_client import start_http_server from prometheus_client import start_http_server
start_http_server(CELERY_METRICS_PORT) start_http_server(CELERY_METRICS_PORT)

File diff suppressed because it is too large Load diff