Add structured JSON logging, OTel business metrics, and Grafana dashboard
Structured logging via JsonFormatter replaces uvicorn's default format so Loki can parse timestamps and fields. 14 business metrics (scrape stats, throttle events, circuit breaker state, cache hit rate, OCR success rate, Celery task lifecycle) are defined in a shared metrics module and instrumented across the scraper pipeline, API, and workers. Celery workers expose a Prometheus HTTP endpoint on configurable ports.
This commit is contained in:
parent
a1829957c1
commit
d6edb747d2
12 changed files with 742 additions and 49 deletions
345
grafana/dashboard.json
Normal file
345
grafana/dashboard.json
Normal file
|
|
@ -0,0 +1,345 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 100,
|
||||
"title": "Scrape Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 300 },
|
||||
{ "color": "red", "value": 600 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Median Scrape Duration (24h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.5, sum(rate(scrape_duration_seconds_bucket[24h])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 1 },
|
||||
"id": 2,
|
||||
"title": "Listings Found vs Processed",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(scrape_listings_found_total[1h])",
|
||||
"legendFormat": "Found"
|
||||
},
|
||||
{
|
||||
"expr": "increase(scrape_listings_processed_total[1h])",
|
||||
"legendFormat": "Processed"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 1 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"title": "Failed Listings (Last 1h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(scrape_listings_failed_total[1h])",
|
||||
"legendFormat": "Failed"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 18, "y": 1 },
|
||||
"id": 4,
|
||||
"title": "Pages Fetched & Subqueries",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(scrape_pages_fetched_total[1h])",
|
||||
"legendFormat": "Pages"
|
||||
},
|
||||
{
|
||||
"expr": "increase(scrape_subqueries_total[1h])",
|
||||
"legendFormat": "Subqueries"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
|
||||
"id": 101,
|
||||
"title": "Throttle & Circuit Breaker",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"id": 5,
|
||||
"title": "Throttle Events by Type",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(throttle_events_total[5m])",
|
||||
"legendFormat": "{{ type }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "text": "CLOSED", "color": "green" } }, "type": "value" },
|
||||
{ "options": { "1": { "text": "HALF_OPEN", "color": "yellow" } }, "type": "value" },
|
||||
{ "options": { "2": { "text": "OPEN", "color": "red" } }, "type": "value" }
|
||||
]
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"id": 6,
|
||||
"title": "Circuit Breaker State",
|
||||
"type": "state-timeline",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "circuit_breaker_state",
|
||||
"legendFormat": "State"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 16 },
|
||||
"id": 102,
|
||||
"title": "API Performance",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "reqps" } },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 17 },
|
||||
"id": 7,
|
||||
"title": "Request Rate by Endpoint",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_server_duration_milliseconds_count[5m])) by (http_route)",
|
||||
"legendFormat": "{{ http_route }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "ms" } },
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 17 },
|
||||
"id": 8,
|
||||
"title": "Latency Percentiles",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_duration_milliseconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit" }
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 17 },
|
||||
"id": 9,
|
||||
"title": "GeoJSON Cache Hit Rate",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(geojson_cache_operations_total{result=\"hit\"}[5m])) / sum(rate(geojson_cache_operations_total[5m]))",
|
||||
"legendFormat": "Hit Rate"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
|
||||
"id": 103,
|
||||
"title": "Celery Tasks",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 26 },
|
||||
"id": 10,
|
||||
"title": "Active Tasks",
|
||||
"type": "stat",
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(celery_tasks_active)",
|
||||
"legendFormat": "Active"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 26 },
|
||||
"id": 11,
|
||||
"title": "Task Completion Rate",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(celery_tasks_total[5m])) by (status)",
|
||||
"legendFormat": "{{ status }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "s" } },
|
||||
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 26 },
|
||||
"id": 12,
|
||||
"title": "Task Duration (p50/p95)",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(celery_task_duration_seconds_bucket[5m])) by (le, task_name))",
|
||||
"legendFormat": "p50 {{ task_name }}"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(celery_task_duration_seconds_bucket[5m])) by (le, task_name))",
|
||||
"legendFormat": "p95 {{ task_name }}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": { "defaults": { "unit": "short" } },
|
||||
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 26 },
|
||||
"id": 13,
|
||||
"title": "OCR Attempts vs Successes",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(ocr_attempts_total[1h])",
|
||||
"legendFormat": "Attempts"
|
||||
},
|
||||
{
|
||||
"expr": "increase(ocr_successes_total[1h])",
|
||||
"legendFormat": "Successes"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 34 },
|
||||
"id": 104,
|
||||
"title": "Logs (Loki)",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "${DS_LOKI}" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 35 },
|
||||
"id": 14,
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending",
|
||||
"dedupStrategy": "none"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"realestate-crawler\"} | json | level = \"ERROR\"",
|
||||
"legendFormat": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["realestate-crawler", "monitoring"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"type": "datasource",
|
||||
"query": "prometheus"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"name": "DS_LOKI",
|
||||
"type": "datasource",
|
||||
"query": "loki"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Realestate Crawler",
|
||||
"uid": "realestate-crawler",
|
||||
"version": 1
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue