openclaw: realtime usage dashboard via Prometheus exporter sidecar

Stdlib-only Python exporter (files/exporter.py) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.

Metrics exported:
  openclaw_codex_messages_total{provider,model,session_kind}    counter
  openclaw_codex_input/output/cache_read/cache_write_tokens_total
  openclaw_codex_message_errors_total{reason}
  openclaw_codex_active_sessions{kind}                          gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}    gauge
  openclaw_codex_last_run_timestamp                             gauge

Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.

Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.
This commit is contained in:
Viktor Barzin 2026-05-07 09:04:25 +00:00
parent 115ca184ff
commit 41655096c7
4 changed files with 814 additions and 0 deletions

View file

@ -0,0 +1,476 @@
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"refresh": "30s",
"schemaVersion": 38,
"tags": ["openclaw", "ai", "codex"],
"time": {"from": "now-6h", "to": "now"},
"timepicker": {},
"timezone": "",
"title": "OpenClaw — Codex Usage",
"uid": "openclaw-codex",
"version": 1,
"panels": [
{
"type": "row",
"id": 100,
"title": "Now",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"collapsed": false,
"panels": []
},
{
"type": "stat",
"id": 1,
"title": "Messages last 5h — gpt-5.4-mini",
"description": "Plus rate-card lower bound: 1,200 / 5h. Hard cap at the upper bound: 7,000 / 5h.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 960},
{"color": "orange", "value": 1500},
{"color": "red", "value": 5600}
]
},
"unit": "short"
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h]))",
"refId": "A"
}
]
},
{
"type": "gauge",
"id": 2,
"title": "% of Plus 5h floor (1,200 cap)",
"description": "Conservative gauge against the lower bound of the published rate-card. Real ceiling depends on dynamic allocation (1,2007,000). Re-baseline if you observe throttling at <80%.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
"options": {
"orientation": "auto",
"showThresholdLabels": false,
"showThresholdMarkers": true,
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"max": 100,
"decimals": 1,
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 60},
{"color": "orange", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "100 * sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h])) / 1200",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 3,
"title": "Tokens last 5h (input + output, codex)",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": "short",
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5h]))",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 4,
"title": "Cache hit ratio (codex, 5h)",
"description": "cacheRead / (cacheRead + input). Higher is better — caching cuts effective Plus quota burn.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"max": 100,
"decimals": 1,
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 30},
{"color": "green", "value": 60}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "100 * sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])) / clamp_min(sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])), 1)",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 5,
"title": "OAuth token expiry",
"description": "Days until the openai-codex OAuth token expires. Re-run `openclaw models auth login --provider openai-codex` before this hits 0.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 1,
"unit": "d",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": null},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 3},
{"color": "green", "value": 5}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "max(openclaw_codex_oauth_expiry_seconds{provider=\"openai-codex\"}) / 86400",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 6,
"title": "Active sessions",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 6},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": true},
"textMode": "value_and_name"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "openclaw_codex_active_sessions",
"legendFormat": "{{kind}}",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 7,
"title": "Last assistant turn",
"description": "Time since the latest assistant message landed in any session.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1800},
{"color": "orange", "value": 7200},
{"color": "red", "value": 86400}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "time() - openclaw_codex_last_run_timestamp",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 8,
"title": "Errors last 24h",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1},
{"color": "red", "value": 10}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_message_errors_total[24h]))",
"refId": "A"
}
]
},
{
"type": "row",
"id": 200,
"title": "Over time",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 11},
"collapsed": false,
"panels": []
},
{
"type": "timeseries",
"id": 10,
"title": "Messages / min by model",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 12},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "bars",
"fillOpacity": 60,
"lineWidth": 1,
"stacking": {"mode": "normal"}
},
"unit": "short"
}
},
"options": {
"legend": {"displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["sum"]},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model) (rate(openclaw_codex_messages_total[1m])) * 60",
"legendFormat": "{{provider}}/{{model}}",
"refId": "A"
}
]
},
{
"type": "timeseries",
"id": 11,
"title": "Tokens / min by type (codex)",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line",
"fillOpacity": 25,
"lineWidth": 2,
"stacking": {"mode": "none"}
},
"unit": "short"
}
},
"options": {
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "input",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "output",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "cache_read",
"refId": "C"
}
]
},
{
"type": "bargauge",
"id": 12,
"title": "Messages / 5h by model",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"showUnfilled": true,
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"decimals": 0,
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 100},
{"color": "orange", "value": 500},
{"color": "red", "value": 1000}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model) (increase(openclaw_codex_messages_total[5h]))",
"legendFormat": "{{provider}}/{{model}}",
"refId": "A"
}
]
},
{
"type": "row",
"id": 300,
"title": "Errors",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 28},
"collapsed": false,
"panels": []
},
{
"type": "table",
"id": 20,
"title": "Recent errors by model and reason",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 29},
"options": {
"showHeader": true
},
"fieldConfig": {
"defaults": {
"custom": {"align": "auto", "displayMode": "auto"}
},
"overrides": [
{
"matcher": {"id": "byName", "options": "Value"},
"properties": [
{"id": "displayName", "value": "Errors (24h)"},
{"id": "custom.displayMode", "value": "color-background"},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1},
{"color": "red", "value": 10}
]
}
}
]
}
]
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model, reason) (increase(openclaw_codex_message_errors_total[24h])) > 0",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true, "namespace": true, "pod": true, "app": true},
"indexByName": {"provider": 0, "model": 1, "reason": 2, "Value": 3},
"renameByName": {}
}
}
]
}
]
}

View file

@ -134,6 +134,7 @@ locals {
# Applications
"qbittorrent.json" = "Applications"
"realestate-crawler.json" = "Applications"
"openclaw.json" = "Applications"
"uk-payslip.json" = "Finance (Personal)"
"wealth.json" = "Finance (Personal)"
"job-hunter.json" = "Finance"

View file

@ -0,0 +1,264 @@
#!/usr/bin/env python3
"""OpenClaw / Codex usage exporter.

Reads ~/.openclaw/agents/*/sessions/*.jsonl (assistant messages with usage)
and ~/.openclaw/agents/*/agent/auth-profiles.json (OAuth profiles), then exposes
Prometheus text-format metrics on :9099/metrics. Stdlib only — no pip install
needed at startup.

Metrics (all cumulative-since-session-start; use Prometheus increase()/rate()
for windowed views):
  openclaw_codex_messages_total{provider,model,session_kind}   counter
  openclaw_codex_input_tokens_total{provider,model}            counter
  openclaw_codex_output_tokens_total{provider,model}           counter
  openclaw_codex_cache_read_tokens_total{provider,model}       counter
  openclaw_codex_cache_write_tokens_total{provider,model}      counter
  openclaw_codex_message_errors_total{provider,model,reason}   counter
  openclaw_codex_active_sessions{kind}                         gauge
  openclaw_codex_oauth_expiry_seconds{provider,account,plan}   gauge
  openclaw_codex_last_run_timestamp                            gauge
  openclaw_codex_exporter_scrape_duration_ms                   gauge
"""
import glob
import json
import os
import re
import time
from datetime import datetime
from http.server import BaseHTTPRequestHandler, HTTPServer
from threading import Lock

# Root of the OpenClaw state tree; overridable for containers/tests.
OPENCLAW_HOME = os.environ.get("OPENCLAW_HOME", "/home/node/.openclaw")
# TCP port the /metrics HTTP server listens on.
PORT = int(os.environ.get("METRICS_PORT", "9099"))
# Minimum seconds between filesystem rescans; scrapes inside this window
# are served from the cached text.
CACHE_SEC = float(os.environ.get("CACHE_SEC", "5"))
# Session files whose basenames contain any of these fragments are skipped.
SKIP_FRAGMENTS = (".broken.", ".reset.", ".deleted.", ".bak.")
# Session JSONL files are named <36 hex-and-dash chars>.jsonl (UUID-shaped).
SESSION_RE = re.compile(r"^([0-9a-f-]{36})\.jsonl$")
# _lock serializes cache rebuilds; _cache holds the last rendered metrics
# text plus the wall-clock time it was built.
_lock = Lock()
_cache = {"text": "", "ts": 0.0}
def _esc(value: str) -> str:
return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
def _line(name: str, labels: dict, value) -> str:
    """Render one Prometheus sample line; labels are sorted for stable output."""
    if not labels:
        return f"{name} {value}"
    pairs = [f'{key}="{_esc(val)}"' for key, val in sorted(labels.items())]
    return f"{name}{{{','.join(pairs)}}} {value}"
def _kind_for(session_id: str, sessions_index: dict) -> str:
for key, val in sessions_index.items():
if val.get("sessionId") != session_id:
continue
if key.startswith("agent:main:cron:"):
return "cron"
if key.startswith("telegram:slash:"):
return "telegram-slash"
if key.startswith("agent:main:"):
return "main"
surface = (val.get("origin") or {}).get("surface")
if surface:
return surface
return key.split(":", 1)[0]
return "unknown"
def _parse_ts(value):
if isinstance(value, (int, float)):
return float(value)
if isinstance(value, str):
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
except ValueError:
return 0.0
return 0.0
def _build_text() -> str:
    """Scan all session JSONL files and auth profiles; render Prometheus text.

    Returns the complete exposition-format payload (HELP/TYPE headers plus
    samples) as a single newline-terminated string. All counters are
    cumulative totals re-derived from disk on every call; Prometheus
    increase()/rate() turns them into windowed views.
    """
    start = time.monotonic()
    out = []
    # Merge every agent's sessions.json into one key -> session-record index,
    # used to classify each session's "kind". Unreadable files are skipped.
    sessions_index: dict = {}
    for sp in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/sessions.json")):
        try:
            with open(sp) as f:
                sessions_index.update(json.load(f))
        except Exception:
            pass
    # Aggregation maps: keys are (provider, model[, kind/reason]) tuples.
    msg_count: dict = {}
    in_tok: dict = {}
    out_tok: dict = {}
    cr_tok: dict = {}
    cw_tok: dict = {}
    err_count: dict = {}
    latest_ts = 0.0
    for jsonl in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/*.jsonl")):
        bn = os.path.basename(jsonl)
        # Ignore backup/reset artifacts and anything not named <uuid>.jsonl.
        if any(s in bn for s in SKIP_FRAGMENTS):
            continue
        m = SESSION_RE.match(bn)
        if not m:
            continue
        sid = m.group(1)
        kind = _kind_for(sid, sessions_index)
        try:
            with open(jsonl) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Tolerate truncated/corrupt JSONL lines.
                    try:
                        obj = json.loads(line)
                    except Exception:
                        continue
                    # Only assistant messages carry usage we care about.
                    if obj.get("type") != "message":
                        continue
                    msg = obj.get("message") or {}
                    if msg.get("role") != "assistant":
                        continue
                    provider = msg.get("provider") or "unknown"
                    model = msg.get("model") or "unknown"
                    usage = msg.get("usage") or {}
                    ts = _parse_ts(obj.get("timestamp"))
                    if ts > latest_ts:
                        latest_ts = ts
                    # Errored turns are counted separately (truncated reason
                    # keeps label cardinality bounded) and excluded from the
                    # message/token totals below.
                    if msg.get("stopReason") == "error":
                        reason = (msg.get("errorMessage") or "unknown")[:80]
                        ek = (provider, model, reason)
                        err_count[ek] = err_count.get(ek, 0) + 1
                        continue
                    mk = (provider, model, kind)
                    msg_count[mk] = msg_count.get(mk, 0) + 1
                    pm = (provider, model)
                    in_tok[pm] = in_tok.get(pm, 0) + (usage.get("input") or 0)
                    out_tok[pm] = out_tok.get(pm, 0) + (usage.get("output") or 0)
                    cr_tok[pm] = cr_tok.get(pm, 0) + (usage.get("cacheRead") or 0)
                    cw_tok[pm] = cw_tok.get(pm, 0) + (usage.get("cacheWrite") or 0)
        except Exception:
            # Best-effort: an unreadable session file must not kill the scrape.
            pass
    # --- Emit message counters ------------------------------------------------
    out.append("# HELP openclaw_codex_messages_total Cumulative assistant messages")
    out.append("# TYPE openclaw_codex_messages_total counter")
    for (p, mdl, k), c in msg_count.items():
        out.append(_line("openclaw_codex_messages_total",
                         {"provider": p, "model": mdl, "session_kind": k}, c))
    # --- Emit the four token counter families from their aggregation maps ----
    for name, src, hlp in [
        ("openclaw_codex_input_tokens_total", in_tok, "Cumulative input tokens"),
        ("openclaw_codex_output_tokens_total", out_tok, "Cumulative output tokens"),
        ("openclaw_codex_cache_read_tokens_total", cr_tok, "Cumulative cache-read tokens"),
        ("openclaw_codex_cache_write_tokens_total", cw_tok, "Cumulative cache-write tokens"),
    ]:
        out.append(f"# HELP {name} {hlp}")
        out.append(f"# TYPE {name} counter")
        for (p, mdl), c in src.items():
            out.append(_line(name, {"provider": p, "model": mdl}, c))
    out.append("# HELP openclaw_codex_message_errors_total Cumulative assistant errors")
    out.append("# TYPE openclaw_codex_message_errors_total counter")
    for (p, mdl, r), c in err_count.items():
        out.append(_line("openclaw_codex_message_errors_total",
                         {"provider": p, "model": mdl, "reason": r}, c))
    # --- Active sessions: bucket index keys by the same prefix rules as
    # _kind_for (kept inline here because there is no session id to match).
    out.append("# HELP openclaw_codex_active_sessions Active sessions in sessions.json")
    out.append("# TYPE openclaw_codex_active_sessions gauge")
    kc: dict = {}
    for k in sessions_index:
        if k.startswith("agent:main:cron:"):
            kk = "cron"
        elif k.startswith("telegram:slash:"):
            kk = "telegram-slash"
        elif k.startswith("agent:main:"):
            kk = "main"
        else:
            kk = k.split(":", 1)[0]
        kc[kk] = kc.get(kk, 0) + 1
    for k, c in kc.items():
        out.append(_line("openclaw_codex_active_sessions", {"kind": k}, c))
    # Only emitted once at least one timestamped assistant message was seen.
    if latest_ts:
        out.append("# HELP openclaw_codex_last_run_timestamp Unix ts of newest assistant message")
        out.append("# TYPE openclaw_codex_last_run_timestamp gauge")
        out.append(_line("openclaw_codex_last_run_timestamp", {}, latest_ts))
    # --- OAuth token expiry per auth profile ---------------------------------
    out.append("# HELP openclaw_codex_oauth_expiry_seconds Seconds until OAuth token expires")
    out.append("# TYPE openclaw_codex_oauth_expiry_seconds gauge")
    now = time.time()
    for af in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/agent/auth-profiles.json")):
        try:
            with open(af) as f:
                data = json.load(f)
        except Exception:
            continue
        # Schema: {"version": 1, "profiles": {"<id>": {...}}}.
        # `expires` is Unix milliseconds.
        for profile in (data.get("profiles") or {}).values():
            exp_ms = profile.get("expires")
            if not isinstance(exp_ms, (int, float)):
                continue
            exp_ts = exp_ms / 1000.0
            # Clamp at 0 so an already-expired token reads as 0, not negative.
            out.append(_line(
                "openclaw_codex_oauth_expiry_seconds",
                {
                    "provider": profile.get("provider", "unknown"),
                    "account": profile.get("email") or profile.get("account") or "unknown",
                    "plan": profile.get("chatgptPlanType") or "unknown",
                },
                max(0, exp_ts - now),
            ))
    out.append("# HELP openclaw_codex_exporter_scrape_duration_ms Last scrape duration ms")
    out.append("# TYPE openclaw_codex_exporter_scrape_duration_ms gauge")
    out.append(_line("openclaw_codex_exporter_scrape_duration_ms", {},
                     (time.monotonic() - start) * 1000))
    return "\n".join(out) + "\n"
class Handler(BaseHTTPRequestHandler):
    """HTTP handler: /healthz liveness probe and a cached /metrics endpoint."""

    def do_GET(self):
        # Cheap liveness endpoint for the Kubernetes readiness probe.
        if self.path == "/healthz":
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"ok\n")
            return
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            return
        with _lock:
            now = time.time()
            # Rebuild at most once per CACHE_SEC; scrapes inside the window
            # are served from the cached text.
            if now - _cache["ts"] > CACHE_SEC:
                try:
                    _cache["text"] = _build_text()
                except Exception as exc:  # noqa: BLE001
                    # Surface build failures as a metric + comment instead of
                    # returning an HTTP error to Prometheus.
                    _cache["text"] = (
                        f'openclaw_codex_exporter_errors_total{{kind="scrape"}} 1\n'
                        f'# scrape error: {_esc(str(exc))[:200]}\n'
                    )
                # ts advances even on failure, so a broken scan is not retried
                # on every scrape within the cache window.
                _cache["ts"] = now
            body = _cache["text"].encode()
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    def log_message(self, *args, **kwargs):
        # Silence per-request access logging.
        pass
def main():
    """Announce the listen port and run the blocking HTTP metrics server."""
    print(f"openclaw exporter listening on :{PORT}", flush=True)
    server = HTTPServer(("0.0.0.0", PORT), Handler)
    server.serve_forever()


if __name__ == "__main__":
    main()

View file

@ -261,6 +261,19 @@ resource "random_password" "gateway_token" {
special = false
}
# Prometheus exporter script read by the openclaw-exporter sidecar.
# Stdlib-only Python so no pip install at startup. Reads sessions JSONL +
# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro).
resource "kubernetes_config_map" "openclaw_exporter" {
metadata {
name = "openclaw-exporter"
namespace = kubernetes_namespace.openclaw.metadata[0].name
}
data = {
"exporter.py" = file("${path.module}/files/exporter.py")
}
}
module "nfs_tools_host" {
source = "../../modules/kubernetes/nfs_volume"
name = "openclaw-tools-host"
@ -350,6 +363,11 @@ resource "kubernetes_deployment" "openclaw" {
}
annotations = {
"reloader.stakater.com/search" = "true"
# Prometheus auto-discovers pods with these annotations.
# Scraped from the openclaw-exporter sidecar, which exposes /metrics on :9099.
"prometheus.io/scrape" = "true"
"prometheus.io/port" = "9099"
"prometheus.io/path" = "/metrics"
}
}
spec {
@ -518,6 +536,54 @@ resource "kubernetes_deployment" "openclaw" {
}
}
# Sidecar: openclaw-exporter Prometheus exporter for Codex/OAuth usage.
# Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099.
# Stdlib-only Python; no pip install at startup.
container {
name = "openclaw-exporter"
image = "docker.io/library/python:3.12-slim"
command = ["python3", "/scripts/exporter.py"]
port {
container_port = 9099
name = "metrics"
}
env {
name = "OPENCLAW_HOME"
value = "/home/node/.openclaw"
}
env {
name = "METRICS_PORT"
value = "9099"
}
volume_mount {
name = "openclaw-exporter-script"
mount_path = "/scripts"
read_only = true
}
volume_mount {
name = "openclaw-home"
mount_path = "/home/node/.openclaw"
read_only = true
}
readiness_probe {
http_get {
path = "/healthz"
port = 9099
}
initial_delay_seconds = 5
period_seconds = 30
}
resources {
requests = {
cpu = "10m"
memory = "64Mi"
}
limits = {
memory = "128Mi"
}
}
}
# Sidecar: modelrelay auto-routes to fastest healthy free model
container {
name = "modelrelay"
@ -606,6 +672,13 @@ resource "kubernetes_deployment" "openclaw" {
name = kubernetes_config_map.openclaw_config.metadata[0].name
}
}
volume {
name = "openclaw-exporter-script"
config_map {
name = kubernetes_config_map.openclaw_exporter.metadata[0].name
default_mode = "0555"
}
}
}
}
}