Add structured JSON logging, OTel business metrics, and Grafana dashboard
Structured logging via JsonFormatter replaces uvicorn's default format so Loki can parse timestamps and fields. 14 business metrics (scrape stats, throttle events, circuit breaker state, cache hit rate, OCR success rate, Celery task lifecycle) are defined in a shared metrics module and instrumented across the scraper pipeline, API, and workers. Celery workers expose a Prometheus HTTP endpoint on configurable ports.
This commit is contained in:
parent
a1829957c1
commit
d6edb747d2
12 changed files with 742 additions and 49 deletions
157
api/metrics.py
157
api/metrics.py
|
|
@ -1,17 +1,152 @@
|
|||
# metrics.py
|
||||
from opentelemetry.metrics import set_meter_provider
|
||||
"""OpenTelemetry metrics with Prometheus export.
|
||||
|
||||
Provides ``init_metrics()`` to lazily initialise the MeterProvider and all
|
||||
business metric instruments. Safe to call from both the API and Celery
|
||||
workers — the provider is created at most once per process.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from opentelemetry.metrics import (
|
||||
Counter,
|
||||
Histogram,
|
||||
Meter,
|
||||
UpDownCounter,
|
||||
get_meter,
|
||||
set_meter_provider,
|
||||
)
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
||||
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
||||
from prometheus_client import make_asgi_app
|
||||
|
||||
# Set up Prometheus reader and meter provider
|
||||
reader = PrometheusMetricReader()
|
||||
provider = MeterProvider(
|
||||
metric_readers=[reader],
|
||||
resource=Resource.create({SERVICE_NAME: "fastapi-metrics-app"}),
|
||||
)
|
||||
set_meter_provider(provider)
|
||||
_reader: PrometheusMetricReader | None = None
|
||||
_meter: Meter | None = None
|
||||
|
||||
# Expose the Prometheus metrics endpoint
|
||||
metrics_app = make_asgi_app() # Exposes /metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scrape metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
scrape_listings_found: Counter
|
||||
scrape_listings_processed: Counter
|
||||
scrape_listings_failed: Counter
|
||||
scrape_duration_seconds: Histogram
|
||||
scrape_pages_fetched: Counter
|
||||
scrape_subqueries_total: Counter
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Throttle / circuit-breaker metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
throttle_events_total: Counter
|
||||
# circuit_breaker_state is registered as an ObservableGauge in circuit_breaker.py
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API / cache metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
geojson_cache_operations: Counter
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OCR metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
ocr_attempts: Counter
|
||||
ocr_successes: Counter
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Celery task metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
celery_tasks_total: Counter
|
||||
celery_task_duration_seconds: Histogram
|
||||
celery_tasks_active: UpDownCounter
|
||||
|
||||
|
||||
def init_metrics(service_name: str = "realestate-crawler") -> PrometheusMetricReader:
|
||||
"""Initialise the OTel MeterProvider and define all instruments.
|
||||
|
||||
Returns the ``PrometheusMetricReader`` so the API can mount the ASGI app.
|
||||
Calling this more than once is a no-op (returns the existing reader).
|
||||
"""
|
||||
global _reader, _meter
|
||||
global scrape_listings_found, scrape_listings_processed, scrape_listings_failed
|
||||
global scrape_duration_seconds, scrape_pages_fetched, scrape_subqueries_total
|
||||
global throttle_events_total
|
||||
global geojson_cache_operations
|
||||
global ocr_attempts, ocr_successes
|
||||
global celery_tasks_total, celery_task_duration_seconds, celery_tasks_active
|
||||
|
||||
if _reader is not None:
|
||||
return _reader
|
||||
|
||||
_reader = PrometheusMetricReader()
|
||||
provider = MeterProvider(
|
||||
metric_readers=[_reader],
|
||||
resource=Resource.create({SERVICE_NAME: service_name}),
|
||||
)
|
||||
set_meter_provider(provider)
|
||||
_meter = get_meter(__name__)
|
||||
|
||||
# -- Scrape --
|
||||
scrape_listings_found = _meter.create_counter(
|
||||
"scrape_listings_found_total",
|
||||
description="Total listings discovered during scrape runs",
|
||||
)
|
||||
scrape_listings_processed = _meter.create_counter(
|
||||
"scrape_listings_processed_total",
|
||||
description="Total listings successfully processed",
|
||||
)
|
||||
scrape_listings_failed = _meter.create_counter(
|
||||
"scrape_listings_failed_total",
|
||||
description="Total listings that failed processing",
|
||||
)
|
||||
scrape_duration_seconds = _meter.create_histogram(
|
||||
"scrape_duration_seconds",
|
||||
description="Duration of a full scrape run in seconds",
|
||||
)
|
||||
scrape_pages_fetched = _meter.create_counter(
|
||||
"scrape_pages_fetched_total",
|
||||
description="Total API pages fetched during scraping",
|
||||
)
|
||||
scrape_subqueries_total = _meter.create_counter(
|
||||
"scrape_subqueries_total",
|
||||
description="Total subqueries executed after query splitting",
|
||||
)
|
||||
|
||||
# -- Throttle --
|
||||
throttle_events_total = _meter.create_counter(
|
||||
"throttle_events_total",
|
||||
description="Total throttling events by type",
|
||||
)
|
||||
|
||||
# -- Cache --
|
||||
geojson_cache_operations = _meter.create_counter(
|
||||
"geojson_cache_operations_total",
|
||||
description="GeoJSON cache operations (hit/miss)",
|
||||
)
|
||||
|
||||
# -- OCR --
|
||||
ocr_attempts = _meter.create_counter(
|
||||
"ocr_attempts_total",
|
||||
description="Total OCR detection attempts",
|
||||
)
|
||||
ocr_successes = _meter.create_counter(
|
||||
"ocr_successes_total",
|
||||
description="Total OCR detections that found square meters",
|
||||
)
|
||||
|
||||
# -- Celery --
|
||||
celery_tasks_total = _meter.create_counter(
|
||||
"celery_tasks_total",
|
||||
description="Total Celery tasks by name and status",
|
||||
)
|
||||
celery_task_duration_seconds = _meter.create_histogram(
|
||||
"celery_task_duration_seconds",
|
||||
description="Duration of Celery tasks in seconds",
|
||||
)
|
||||
celery_tasks_active = _meter.create_up_down_counter(
|
||||
"celery_tasks_active",
|
||||
description="Currently active Celery tasks",
|
||||
)
|
||||
|
||||
return _reader
|
||||
|
||||
|
||||
def get_metrics_asgi_app(): # type: ignore[no-untyped-def]
|
||||
"""Return the Prometheus ASGI app for mounting at /metrics."""
|
||||
return make_asgi_app()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue