openclaw: realtime usage dashboard via Prometheus exporter sidecar
Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.
Metrics exported:
openclaw_codex_messages_total{provider,model,session_kind} counter
openclaw_codex_input/output/cache_read/cache_write_tokens_total
openclaw_codex_message_errors_total{reason}
openclaw_codex_active_sessions{kind} gauge
openclaw_codex_oauth_expiry_seconds{provider,account,plan} gauge
openclaw_codex_last_run_timestamp gauge
Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.
Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.
This commit is contained in:
parent
115ca184ff
commit
41655096c7
4 changed files with 814 additions and 0 deletions
476
stacks/monitoring/modules/monitoring/dashboards/openclaw.json
Normal file
476
stacks/monitoring/modules/monitoring/dashboards/openclaw.json
Normal file
|
|
@ -0,0 +1,476 @@
|
|||
{
|
||||
"annotations": {"list": []},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["openclaw", "ai", "codex"],
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "OpenClaw — Codex Usage",
|
||||
"uid": "openclaw-codex",
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"type": "row",
|
||||
"id": 100,
|
||||
"title": "Now",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 1,
|
||||
"title": "Messages last 5h — gpt-5.4-mini",
|
||||
"description": "Plus rate-card lower bound: 1,200 / 5h. Hard cap at the upper bound: 7,000 / 5h.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 960},
|
||||
{"color": "orange", "value": 1500},
|
||||
{"color": "red", "value": 5600}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 2,
|
||||
"title": "% of Plus 5h floor (1,200 cap)",
|
||||
"description": "Conservative gauge against the lower bound of the published rate-card. Real ceiling depends on dynamic allocation (1,200–7,000). Re-baseline if you observe throttling at <80%.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"decimals": 1,
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 60},
|
||||
{"color": "orange", "value": 80},
|
||||
{"color": "red", "value": 95}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "100 * sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h])) / 1200",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Tokens last 5h (input + output, codex)",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Cache hit ratio (codex, 5h)",
|
||||
"description": "cacheRead / (cacheRead + input). Higher is better — caching cuts effective Plus quota burn.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"decimals": 1,
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": null},
|
||||
{"color": "yellow", "value": 30},
|
||||
{"color": "green", "value": 60}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "100 * sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])) / clamp_min(sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "OAuth token expiry",
|
||||
"description": "Days until the openai-codex OAuth token expires. Re-run `openclaw models auth login --provider openai-codex` before this hits 0.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 1,
|
||||
"unit": "d",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": null},
|
||||
{"color": "orange", "value": 1},
|
||||
{"color": "yellow", "value": 3},
|
||||
{"color": "green", "value": 5}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "max(openclaw_codex_oauth_expiry_seconds{provider=\"openai-codex\"}) / 86400",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Active sessions",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": true},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "openclaw_codex_active_sessions",
|
||||
"legendFormat": "{{kind}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "Last assistant turn",
|
||||
"description": "Time since the latest assistant message landed in any session.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1800},
|
||||
{"color": "orange", "value": 7200},
|
||||
{"color": "red", "value": 86400}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "time() - openclaw_codex_last_run_timestamp",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 8,
|
||||
"title": "Errors last 24h",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 10}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_message_errors_total[24h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 200,
|
||||
"title": "Over time",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 11},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 10,
|
||||
"title": "Messages / min by model",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 12},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 60,
|
||||
"lineWidth": 1,
|
||||
"stacking": {"mode": "normal"}
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["sum"]},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model) (rate(openclaw_codex_messages_total[1m])) * 60",
|
||||
"legendFormat": "{{provider}}/{{model}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 11,
|
||||
"title": "Tokens / min by type (codex)",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 25,
|
||||
"lineWidth": 2,
|
||||
"stacking": {"mode": "none"}
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "input",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "output",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "cache_read",
|
||||
"refId": "C"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 12,
|
||||
"title": "Messages / 5h by model",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 100},
|
||||
{"color": "orange", "value": 500},
|
||||
{"color": "red", "value": 1000}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model) (increase(openclaw_codex_messages_total[5h]))",
|
||||
"legendFormat": "{{provider}}/{{model}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 300,
|
||||
"title": "Errors",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 28},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 20,
|
||||
"title": "Recent errors by model and reason",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 29},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {"align": "auto", "displayMode": "auto"}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Value"},
|
||||
"properties": [
|
||||
{"id": "displayName", "value": "Errors (24h)"},
|
||||
{"id": "custom.displayMode", "value": "color-background"},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 10}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model, reason) (increase(openclaw_codex_message_errors_total[24h])) > 0",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true, "namespace": true, "pod": true, "app": true},
|
||||
"indexByName": {"provider": 0, "model": 1, "reason": 2, "Value": 3},
|
||||
"renameByName": {}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -134,6 +134,7 @@ locals {
|
|||
# Applications
|
||||
"qbittorrent.json" = "Applications"
|
||||
"realestate-crawler.json" = "Applications"
|
||||
"openclaw.json" = "Applications"
|
||||
"uk-payslip.json" = "Finance (Personal)"
|
||||
"wealth.json" = "Finance (Personal)"
|
||||
"job-hunter.json" = "Finance"
|
||||
|
|
|
|||
264
stacks/openclaw/files/exporter.py
Normal file
264
stacks/openclaw/files/exporter.py
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
#!/usr/bin/env python3
|
||||
"""OpenClaw / Codex usage exporter.
|
||||
|
||||
Reads ~/.openclaw/agents/*/sessions/*.jsonl (assistant messages with usage)
|
||||
and ~/.openclaw/agents/*/agent/auth-profiles.json (OAuth profiles), then exposes
|
||||
Prometheus text-format metrics on :9099/metrics. Stdlib only — no pip install
|
||||
needed at startup.
|
||||
|
||||
Metrics (all cumulative-since-session-start; use Prometheus increase()/rate()
|
||||
for windowed views):
|
||||
|
||||
openclaw_codex_messages_total{provider,model,session_kind} counter
|
||||
openclaw_codex_input_tokens_total{provider,model} counter
|
||||
openclaw_codex_output_tokens_total{provider,model} counter
|
||||
openclaw_codex_cache_read_tokens_total{provider,model} counter
|
||||
openclaw_codex_cache_write_tokens_total{provider,model} counter
|
||||
openclaw_codex_message_errors_total{provider,model,reason} counter
|
||||
openclaw_codex_active_sessions{kind} gauge
|
||||
openclaw_codex_oauth_expiry_seconds{provider,account,plan} gauge
|
||||
openclaw_codex_last_run_timestamp gauge
|
||||
openclaw_codex_exporter_scrape_duration_ms gauge
|
||||
"""
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from threading import Lock
|
||||
|
||||
OPENCLAW_HOME = os.environ.get("OPENCLAW_HOME", "/home/node/.openclaw")
|
||||
PORT = int(os.environ.get("METRICS_PORT", "9099"))
|
||||
CACHE_SEC = float(os.environ.get("CACHE_SEC", "5"))
|
||||
SKIP_FRAGMENTS = (".broken.", ".reset.", ".deleted.", ".bak.")
|
||||
SESSION_RE = re.compile(r"^([0-9a-f-]{36})\.jsonl$")
|
||||
|
||||
_lock = Lock()
|
||||
_cache = {"text": "", "ts": 0.0}
|
||||
|
||||
|
||||
def _esc(value: str) -> str:
|
||||
return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
||||
|
||||
|
||||
def _line(name: str, labels: dict, value) -> str:
|
||||
if labels:
|
||||
rendered = ",".join(f'{k}="{_esc(v)}"' for k, v in sorted(labels.items()))
|
||||
return f"{name}{{{rendered}}} {value}"
|
||||
return f"{name} {value}"
|
||||
|
||||
|
||||
def _kind_for(session_id: str, sessions_index: dict) -> str:
|
||||
for key, val in sessions_index.items():
|
||||
if val.get("sessionId") != session_id:
|
||||
continue
|
||||
if key.startswith("agent:main:cron:"):
|
||||
return "cron"
|
||||
if key.startswith("telegram:slash:"):
|
||||
return "telegram-slash"
|
||||
if key.startswith("agent:main:"):
|
||||
return "main"
|
||||
surface = (val.get("origin") or {}).get("surface")
|
||||
if surface:
|
||||
return surface
|
||||
return key.split(":", 1)[0]
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _parse_ts(value):
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
|
||||
except ValueError:
|
||||
return 0.0
|
||||
return 0.0
|
||||
|
||||
|
||||
def _build_text() -> str:
|
||||
start = time.monotonic()
|
||||
out = []
|
||||
|
||||
sessions_index: dict = {}
|
||||
for sp in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/sessions.json")):
|
||||
try:
|
||||
with open(sp) as f:
|
||||
sessions_index.update(json.load(f))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
msg_count: dict = {}
|
||||
in_tok: dict = {}
|
||||
out_tok: dict = {}
|
||||
cr_tok: dict = {}
|
||||
cw_tok: dict = {}
|
||||
err_count: dict = {}
|
||||
latest_ts = 0.0
|
||||
|
||||
for jsonl in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/*.jsonl")):
|
||||
bn = os.path.basename(jsonl)
|
||||
if any(s in bn for s in SKIP_FRAGMENTS):
|
||||
continue
|
||||
m = SESSION_RE.match(bn)
|
||||
if not m:
|
||||
continue
|
||||
sid = m.group(1)
|
||||
kind = _kind_for(sid, sessions_index)
|
||||
try:
|
||||
with open(jsonl) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
if obj.get("type") != "message":
|
||||
continue
|
||||
msg = obj.get("message") or {}
|
||||
if msg.get("role") != "assistant":
|
||||
continue
|
||||
provider = msg.get("provider") or "unknown"
|
||||
model = msg.get("model") or "unknown"
|
||||
usage = msg.get("usage") or {}
|
||||
ts = _parse_ts(obj.get("timestamp"))
|
||||
if ts > latest_ts:
|
||||
latest_ts = ts
|
||||
if msg.get("stopReason") == "error":
|
||||
reason = (msg.get("errorMessage") or "unknown")[:80]
|
||||
ek = (provider, model, reason)
|
||||
err_count[ek] = err_count.get(ek, 0) + 1
|
||||
continue
|
||||
mk = (provider, model, kind)
|
||||
msg_count[mk] = msg_count.get(mk, 0) + 1
|
||||
pm = (provider, model)
|
||||
in_tok[pm] = in_tok.get(pm, 0) + (usage.get("input") or 0)
|
||||
out_tok[pm] = out_tok.get(pm, 0) + (usage.get("output") or 0)
|
||||
cr_tok[pm] = cr_tok.get(pm, 0) + (usage.get("cacheRead") or 0)
|
||||
cw_tok[pm] = cw_tok.get(pm, 0) + (usage.get("cacheWrite") or 0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
out.append("# HELP openclaw_codex_messages_total Cumulative assistant messages")
|
||||
out.append("# TYPE openclaw_codex_messages_total counter")
|
||||
for (p, mdl, k), c in msg_count.items():
|
||||
out.append(_line("openclaw_codex_messages_total",
|
||||
{"provider": p, "model": mdl, "session_kind": k}, c))
|
||||
|
||||
for name, src, hlp in [
|
||||
("openclaw_codex_input_tokens_total", in_tok, "Cumulative input tokens"),
|
||||
("openclaw_codex_output_tokens_total", out_tok, "Cumulative output tokens"),
|
||||
("openclaw_codex_cache_read_tokens_total", cr_tok, "Cumulative cache-read tokens"),
|
||||
("openclaw_codex_cache_write_tokens_total", cw_tok, "Cumulative cache-write tokens"),
|
||||
]:
|
||||
out.append(f"# HELP {name} {hlp}")
|
||||
out.append(f"# TYPE {name} counter")
|
||||
for (p, mdl), c in src.items():
|
||||
out.append(_line(name, {"provider": p, "model": mdl}, c))
|
||||
|
||||
out.append("# HELP openclaw_codex_message_errors_total Cumulative assistant errors")
|
||||
out.append("# TYPE openclaw_codex_message_errors_total counter")
|
||||
for (p, mdl, r), c in err_count.items():
|
||||
out.append(_line("openclaw_codex_message_errors_total",
|
||||
{"provider": p, "model": mdl, "reason": r}, c))
|
||||
|
||||
out.append("# HELP openclaw_codex_active_sessions Active sessions in sessions.json")
|
||||
out.append("# TYPE openclaw_codex_active_sessions gauge")
|
||||
kc: dict = {}
|
||||
for k in sessions_index:
|
||||
if k.startswith("agent:main:cron:"):
|
||||
kk = "cron"
|
||||
elif k.startswith("telegram:slash:"):
|
||||
kk = "telegram-slash"
|
||||
elif k.startswith("agent:main:"):
|
||||
kk = "main"
|
||||
else:
|
||||
kk = k.split(":", 1)[0]
|
||||
kc[kk] = kc.get(kk, 0) + 1
|
||||
for k, c in kc.items():
|
||||
out.append(_line("openclaw_codex_active_sessions", {"kind": k}, c))
|
||||
|
||||
if latest_ts:
|
||||
out.append("# HELP openclaw_codex_last_run_timestamp Unix ts of newest assistant message")
|
||||
out.append("# TYPE openclaw_codex_last_run_timestamp gauge")
|
||||
out.append(_line("openclaw_codex_last_run_timestamp", {}, latest_ts))
|
||||
|
||||
out.append("# HELP openclaw_codex_oauth_expiry_seconds Seconds until OAuth token expires")
|
||||
out.append("# TYPE openclaw_codex_oauth_expiry_seconds gauge")
|
||||
now = time.time()
|
||||
for af in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/agent/auth-profiles.json")):
|
||||
try:
|
||||
with open(af) as f:
|
||||
data = json.load(f)
|
||||
except Exception:
|
||||
continue
|
||||
# Schema: {"version": 1, "profiles": {"<id>": {...}}}.
|
||||
# `expires` is Unix milliseconds.
|
||||
for profile in (data.get("profiles") or {}).values():
|
||||
exp_ms = profile.get("expires")
|
||||
if not isinstance(exp_ms, (int, float)):
|
||||
continue
|
||||
exp_ts = exp_ms / 1000.0
|
||||
out.append(_line(
|
||||
"openclaw_codex_oauth_expiry_seconds",
|
||||
{
|
||||
"provider": profile.get("provider", "unknown"),
|
||||
"account": profile.get("email") or profile.get("account") or "unknown",
|
||||
"plan": profile.get("chatgptPlanType") or "unknown",
|
||||
},
|
||||
max(0, exp_ts - now),
|
||||
))
|
||||
|
||||
out.append("# HELP openclaw_codex_exporter_scrape_duration_ms Last scrape duration ms")
|
||||
out.append("# TYPE openclaw_codex_exporter_scrape_duration_ms gauge")
|
||||
out.append(_line("openclaw_codex_exporter_scrape_duration_ms", {},
|
||||
(time.monotonic() - start) * 1000))
|
||||
|
||||
return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
if self.path == "/healthz":
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain")
|
||||
self.end_headers()
|
||||
self.wfile.write(b"ok\n")
|
||||
return
|
||||
if self.path != "/metrics":
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
return
|
||||
with _lock:
|
||||
now = time.time()
|
||||
if now - _cache["ts"] > CACHE_SEC:
|
||||
try:
|
||||
_cache["text"] = _build_text()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_cache["text"] = (
|
||||
f'openclaw_codex_exporter_errors_total{{kind="scrape"}} 1\n'
|
||||
f'# scrape error: {_esc(str(exc))[:200]}\n'
|
||||
)
|
||||
_cache["ts"] = now
|
||||
body = _cache["text"].encode()
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
self.wfile.write(body)
|
||||
|
||||
def log_message(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
print(f"openclaw exporter listening on :{PORT}", flush=True)
|
||||
HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -261,6 +261,19 @@ resource "random_password" "gateway_token" {
|
|||
special = false
|
||||
}
|
||||
|
||||
# Prometheus exporter script — read by the openclaw-exporter sidecar.
|
||||
# Stdlib-only Python so no pip install at startup. Reads sessions JSONL +
|
||||
# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro).
|
||||
resource "kubernetes_config_map" "openclaw_exporter" {
|
||||
metadata {
|
||||
name = "openclaw-exporter"
|
||||
namespace = kubernetes_namespace.openclaw.metadata[0].name
|
||||
}
|
||||
data = {
|
||||
"exporter.py" = file("${path.module}/files/exporter.py")
|
||||
}
|
||||
}
|
||||
|
||||
module "nfs_tools_host" {
|
||||
source = "../../modules/kubernetes/nfs_volume"
|
||||
name = "openclaw-tools-host"
|
||||
|
|
@ -350,6 +363,11 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
# Prometheus auto-discovers pods with these annotations.
|
||||
# Scraped by the openclaw-exporter sidecar — exposes /metrics on :9099.
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/port" = "9099"
|
||||
"prometheus.io/path" = "/metrics"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -518,6 +536,54 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
}
|
||||
}
|
||||
|
||||
# Sidecar: openclaw-exporter — Prometheus exporter for Codex/OAuth usage.
|
||||
# Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099.
|
||||
# Stdlib-only Python; no pip install at startup.
|
||||
container {
|
||||
name = "openclaw-exporter"
|
||||
image = "docker.io/library/python:3.12-slim"
|
||||
command = ["python3", "/scripts/exporter.py"]
|
||||
port {
|
||||
container_port = 9099
|
||||
name = "metrics"
|
||||
}
|
||||
env {
|
||||
name = "OPENCLAW_HOME"
|
||||
value = "/home/node/.openclaw"
|
||||
}
|
||||
env {
|
||||
name = "METRICS_PORT"
|
||||
value = "9099"
|
||||
}
|
||||
volume_mount {
|
||||
name = "openclaw-exporter-script"
|
||||
mount_path = "/scripts"
|
||||
read_only = true
|
||||
}
|
||||
volume_mount {
|
||||
name = "openclaw-home"
|
||||
mount_path = "/home/node/.openclaw"
|
||||
read_only = true
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/healthz"
|
||||
port = 9099
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "64Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "128Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Sidecar: modelrelay — auto-routes to fastest healthy free model
|
||||
container {
|
||||
name = "modelrelay"
|
||||
|
|
@ -606,6 +672,13 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
name = kubernetes_config_map.openclaw_config.metadata[0].name
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "openclaw-exporter-script"
|
||||
config_map {
|
||||
name = kubernetes_config_map.openclaw_exporter.metadata[0].name
|
||||
default_mode = "0555"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue