From 41655096c7baa55778ebb2484c27bb163bd7dd62 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 7 May 2026 09:04:25 +0000 Subject: [PATCH] openclaw: realtime usage dashboard via Prometheus exporter sidecar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl (assistant messages with usage) plus auth-profiles.json (OAuth expiry, Plus-tier label) and exposes Prometheus text format on :9099/metrics. Container is python:3.12-slim; pod template gets prometheus.io/scrape annotations so the existing kubernetes-pods job picks it up — no ServiceMonitor needed. Metrics exported: openclaw_codex_messages_total{provider,model,session_kind} counter openclaw_codex_input/output/cache_read/cache_write_tokens_total openclaw_codex_message_errors_total{reason} openclaw_codex_active_sessions{kind} gauge openclaw_codex_oauth_expiry_seconds{provider,account,plan} gauge openclaw_codex_last_run_timestamp gauge Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h, cache hit %, OAuth expiry days, active sessions, last-turn age, errors, plus per-model timeseries + bar gauge + error table. Plus rate-card thresholds in the gauge are conservative (1,200/5h floor; real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up below 80%. --- .../monitoring/dashboards/openclaw.json | 476 ++++++++++++++++++ .../monitoring/modules/monitoring/grafana.tf | 1 + stacks/openclaw/files/exporter.py | 264 ++++++++++ stacks/openclaw/main.tf | 73 +++ 4 files changed, 814 insertions(+) create mode 100644 stacks/monitoring/modules/monitoring/dashboards/openclaw.json create mode 100644 stacks/openclaw/files/exporter.py diff --git a/stacks/monitoring/modules/monitoring/dashboards/openclaw.json b/stacks/monitoring/modules/monitoring/dashboards/openclaw.json new file mode 100644 index 00000000..47657279 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/dashboards/openclaw.json @@ -0,0 +1,476 @@ +{ + "annotations": {"list": []}, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "refresh": "30s", + "schemaVersion": 38, + "tags": ["openclaw", "ai", "codex"], + "time": {"from": "now-6h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "OpenClaw — Codex Usage", + "uid": "openclaw-codex", + "version": 1, + "panels": [ + { + "type": "row", + "id": 100, + "title": "Now", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "id": 1, + "title": "Messages last 5h — gpt-5.4-mini", + "description": "Plus rate-card lower bound: 1,200 / 5h. 
Hard cap at the upper bound: 7,000 / 5h.", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 1}, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 960}, + {"color": "orange", "value": 1500}, + {"color": "red", "value": 5600} + ] + }, + "unit": "short" + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h]))", + "refId": "A" + } + ] + }, + { + "type": "gauge", + "id": 2, + "title": "% of Plus 5h floor (1,200 cap)", + "description": "Conservative gauge against the lower bound of the published rate-card. Real ceiling depends on dynamic allocation (1,200–7,000). Re-baseline if you observe throttling at <80%.", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 1}, + "options": { + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true, + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "decimals": 1, + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 60}, + {"color": "orange", "value": 80}, + {"color": "red", "value": 95} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "100 * sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h])) / 1200", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 3, + "title": "Tokens last 5h (input + output, codex)", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 1}, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]} + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5h]))", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 4, + "title": "Cache hit ratio (codex, 5h)", + "description": "cacheRead / (cacheRead + input). 
Higher is better — caching cuts effective Plus quota burn.", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 1}, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "decimals": 1, + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 30}, + {"color": "green", "value": 60} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "100 * sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])) / clamp_min(sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])), 1)", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 5, + "title": "OAuth token expiry", + "description": "Days until the openai-codex OAuth token expires. Re-run `openclaw models auth login --provider openai-codex` before this hits 0.", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 6}, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "decimals": 1, + "unit": "d", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 3}, + {"color": "green", "value": 5} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "max(openclaw_codex_oauth_expiry_seconds{provider=\"openai-codex\"}) / 86400", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 6, + "title": "Active sessions", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 6}, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": true}, + "textMode": "value_and_name" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]} + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "openclaw_codex_active_sessions", + "legendFormat": "{{kind}}", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 7, + "title": "Last assistant turn", + "description": "Time since the latest assistant message landed in any session.", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 6}, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1800}, + {"color": "orange", "value": 7200}, + {"color": "red", "value": 86400} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "time() - openclaw_codex_last_run_timestamp", + "refId": "A" + } + ] + }, + { + "type": "stat", + "id": 8, + "title": "Errors last 
24h", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 6}, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "decimals": 0, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 10} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(increase(openclaw_codex_message_errors_total[24h]))", + "refId": "A" + } + ] + }, + { + "type": "row", + "id": 200, + "title": "Over time", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 11}, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "id": 10, + "title": "Messages / min by model", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 12}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "bars", + "fillOpacity": 60, + "lineWidth": 1, + "stacking": {"mode": "normal"} + }, + "unit": "short" + } + }, + "options": { + "legend": {"displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["sum"]}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum by (provider, model) (rate(openclaw_codex_messages_total[1m])) * 60", + "legendFormat": "{{provider}}/{{model}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "id": 11, + "title": "Tokens / min by type (codex)", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "line", + "fillOpacity": 25, + "lineWidth": 2, + "stacking": {"mode": "none"} + }, + "unit": "short" + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(rate(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5m])) * 60", + "legendFormat": "input", + "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(rate(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5m])) * 60", + "legendFormat": "output", + "refId": "B" + }, + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum(rate(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5m])) * 60", + "legendFormat": "cache_read", + "refId": "C" + } + ] + }, + { + "type": "bargauge", + "id": 12, + "title": "Messages / 5h by model", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true, + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false} + }, + "fieldConfig": { + "defaults": { + "min": 0, + "decimals": 0, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 100}, + {"color": "orange", "value": 500}, + {"color": "red", "value": 
1000} + ] + } + } + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum by (provider, model) (increase(openclaw_codex_messages_total[5h]))", + "legendFormat": "{{provider}}/{{model}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "id": 300, + "title": "Errors", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 28}, + "collapsed": false, + "panels": [] + }, + { + "type": "table", + "id": 20, + "title": "Recent errors by model and reason", + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 29}, + "options": { + "showHeader": true + }, + "fieldConfig": { + "defaults": { + "custom": {"align": "auto", "displayMode": "auto"} + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Value"}, + "properties": [ + {"id": "displayName", "value": "Errors (24h)"}, + {"id": "custom.displayMode", "value": "color-background"}, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 10} + ] + } + } + ] + } + ] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "expr": "sum by (provider, model, reason) (increase(openclaw_codex_message_errors_total[24h])) > 0", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true, "namespace": true, "pod": true, "app": true}, + "indexByName": {"provider": 0, "model": 1, "reason": 2, "Value": 3}, + "renameByName": {} + } + } + ] + } + ] +} diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf index b5a5f249..e7abd8c6 100644 --- a/stacks/monitoring/modules/monitoring/grafana.tf +++ b/stacks/monitoring/modules/monitoring/grafana.tf @@ -134,6 +134,7 @@ locals { # Applications "qbittorrent.json" = "Applications" "realestate-crawler.json" = "Applications" + "openclaw.json" = "Applications" "uk-payslip.json" = "Finance (Personal)" "wealth.json" = "Finance (Personal)" "job-hunter.json" = "Finance" diff --git a/stacks/openclaw/files/exporter.py b/stacks/openclaw/files/exporter.py new file mode 100644 index 00000000..e8d06191 --- /dev/null +++ b/stacks/openclaw/files/exporter.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +"""OpenClaw / Codex usage exporter. + +Reads ~/.openclaw/agents/*/sessions/*.jsonl (assistant messages with usage) +and ~/.openclaw/agents/*/agent/auth-state.json (OAuth profiles), then exposes +Prometheus text-format metrics on :9099/metrics. Stdlib only — no pip install +needed at startup. 
+ +Metrics (all cumulative-since-session-start; use Prometheus increase()/rate() +for windowed views): + + openclaw_codex_messages_total{provider,model,session_kind} counter + openclaw_codex_input_tokens_total{provider,model} counter + openclaw_codex_output_tokens_total{provider,model} counter + openclaw_codex_cache_read_tokens_total{provider,model} counter + openclaw_codex_cache_write_tokens_total{provider,model} counter + openclaw_codex_message_errors_total{provider,model,reason} counter + openclaw_codex_active_sessions{kind} gauge + openclaw_codex_oauth_expiry_seconds{provider,account} gauge + openclaw_codex_last_run_timestamp gauge + openclaw_codex_exporter_scrape_duration_ms gauge +""" +import glob +import json +import os +import re +import time +from datetime import datetime +from http.server import BaseHTTPRequestHandler, HTTPServer +from threading import Lock + +OPENCLAW_HOME = os.environ.get("OPENCLAW_HOME", "/home/node/.openclaw") +PORT = int(os.environ.get("METRICS_PORT", "9099")) +CACHE_SEC = float(os.environ.get("CACHE_SEC", "5")) +SKIP_FRAGMENTS = (".broken.", ".reset.", ".deleted.", ".bak.") +SESSION_RE = re.compile(r"^([0-9a-f-]{36})\.jsonl$") + +_lock = Lock() +_cache = {"text": "", "ts": 0.0} + + +def _esc(value: str) -> str: + return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") + + +def _line(name: str, labels: dict, value) -> str: + if labels: + rendered = ",".join(f'{k}="{_esc(v)}"' for k, v in sorted(labels.items())) + return f"{name}{{{rendered}}} {value}" + return f"{name} {value}" + + +def _kind_for(session_id: str, sessions_index: dict) -> str: + for key, val in sessions_index.items(): + if val.get("sessionId") != session_id: + continue + if key.startswith("agent:main:cron:"): + return "cron" + if key.startswith("telegram:slash:"): + return "telegram-slash" + if key.startswith("agent:main:"): + return "main" + surface = (val.get("origin") or {}).get("surface") + if surface: + return surface + return key.split(":", 1)[0] + return "unknown" + + +def _parse_ts(value): + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp() + except ValueError: + return 0.0 + return 0.0 + + +def _build_text() -> str: + start = time.monotonic() + out = [] + + sessions_index: dict = {} + for sp in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/sessions.json")): + try: + with open(sp) as f: + sessions_index.update(json.load(f)) + except Exception: + pass + + msg_count: dict = {} + in_tok: dict = {} + out_tok: dict = {} + cr_tok: dict = {} + cw_tok: dict = {} + err_count: dict = {} + latest_ts = 0.0 + + for jsonl in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/*.jsonl")): + bn = os.path.basename(jsonl) + if any(s in bn for s in SKIP_FRAGMENTS): + continue + m = SESSION_RE.match(bn) + if not m: + continue + sid = m.group(1) + kind = _kind_for(sid, sessions_index) + try: + with open(jsonl) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except Exception: + continue + if obj.get("type") != "message": + continue + msg = obj.get("message") or {} + if msg.get("role") != "assistant": + continue + provider = msg.get("provider") or "unknown" + model = msg.get("model") or "unknown" + usage = msg.get("usage") or {} + ts = _parse_ts(obj.get("timestamp")) + if ts > latest_ts: + latest_ts = ts + if msg.get("stopReason") == "error": + reason = (msg.get("errorMessage") or 
"unknown")[:80] + ek = (provider, model, reason) + err_count[ek] = err_count.get(ek, 0) + 1 + continue + mk = (provider, model, kind) + msg_count[mk] = msg_count.get(mk, 0) + 1 + pm = (provider, model) + in_tok[pm] = in_tok.get(pm, 0) + (usage.get("input") or 0) + out_tok[pm] = out_tok.get(pm, 0) + (usage.get("output") or 0) + cr_tok[pm] = cr_tok.get(pm, 0) + (usage.get("cacheRead") or 0) + cw_tok[pm] = cw_tok.get(pm, 0) + (usage.get("cacheWrite") or 0) + except Exception: + pass + + out.append("# HELP openclaw_codex_messages_total Cumulative assistant messages") + out.append("# TYPE openclaw_codex_messages_total counter") + for (p, mdl, k), c in msg_count.items(): + out.append(_line("openclaw_codex_messages_total", + {"provider": p, "model": mdl, "session_kind": k}, c)) + + for name, src, hlp in [ + ("openclaw_codex_input_tokens_total", in_tok, "Cumulative input tokens"), + ("openclaw_codex_output_tokens_total", out_tok, "Cumulative output tokens"), + ("openclaw_codex_cache_read_tokens_total", cr_tok, "Cumulative cache-read tokens"), + ("openclaw_codex_cache_write_tokens_total", cw_tok, "Cumulative cache-write tokens"), + ]: + out.append(f"# HELP {name} {hlp}") + out.append(f"# TYPE {name} counter") + for (p, mdl), c in src.items(): + out.append(_line(name, {"provider": p, "model": mdl}, c)) + + out.append("# HELP openclaw_codex_message_errors_total Cumulative assistant errors") + out.append("# TYPE openclaw_codex_message_errors_total counter") + for (p, mdl, r), c in err_count.items(): + out.append(_line("openclaw_codex_message_errors_total", + {"provider": p, "model": mdl, "reason": r}, c)) + + out.append("# HELP openclaw_codex_active_sessions Active sessions in sessions.json") + out.append("# TYPE openclaw_codex_active_sessions gauge") + kc: dict = {} + for k in sessions_index: + if k.startswith("agent:main:cron:"): + kk = "cron" + elif k.startswith("telegram:slash:"): + kk = "telegram-slash" + elif k.startswith("agent:main:"): + kk = "main" + else: + kk = k.split(":", 1)[0] + kc[kk] = kc.get(kk, 0) + 1 + for k, c in kc.items(): + out.append(_line("openclaw_codex_active_sessions", {"kind": k}, c)) + + if latest_ts: + out.append("# HELP openclaw_codex_last_run_timestamp Unix ts of newest assistant message") + out.append("# TYPE openclaw_codex_last_run_timestamp gauge") + out.append(_line("openclaw_codex_last_run_timestamp", {}, latest_ts)) + + out.append("# HELP openclaw_codex_oauth_expiry_seconds Seconds until OAuth token expires") + out.append("# TYPE openclaw_codex_oauth_expiry_seconds gauge") + now = time.time() + for af in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/agent/auth-profiles.json")): + try: + with open(af) as f: + data = json.load(f) + except Exception: + continue + # Schema: {"version": 1, "profiles": {"": {...}}}. + # `expires` is Unix milliseconds. 
+ for profile in (data.get("profiles") or {}).values(): + exp_ms = profile.get("expires") + if not isinstance(exp_ms, (int, float)): + continue + exp_ts = exp_ms / 1000.0 + out.append(_line( + "openclaw_codex_oauth_expiry_seconds", + { + "provider": profile.get("provider", "unknown"), + "account": profile.get("email") or profile.get("account") or "unknown", + "plan": profile.get("chatgptPlanType") or "unknown", + }, + max(0, exp_ts - now), + )) + + out.append("# HELP openclaw_codex_exporter_scrape_duration_ms Last scrape duration ms") + out.append("# TYPE openclaw_codex_exporter_scrape_duration_ms gauge") + out.append(_line("openclaw_codex_exporter_scrape_duration_ms", {}, + (time.monotonic() - start) * 1000)) + + return "\n".join(out) + "\n" + + +class Handler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/healthz": + self.send_response(200) + self.send_header("Content-Type", "text/plain") + self.end_headers() + self.wfile.write(b"ok\n") + return + if self.path != "/metrics": + self.send_response(404) + self.end_headers() + return + with _lock: + now = time.time() + if now - _cache["ts"] > CACHE_SEC: + try: + _cache["text"] = _build_text() + except Exception as exc: # noqa: BLE001 + _cache["text"] = ( + f'openclaw_codex_exporter_errors_total{{kind="scrape"}} 1\n' + f'# scrape error: {_esc(str(exc))[:200]}\n' + ) + _cache["ts"] = now + body = _cache["text"].encode() + self.send_response(200) + self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, *args, **kwargs): + pass + + +def main(): + print(f"openclaw exporter listening on :{PORT}", flush=True) + HTTPServer(("0.0.0.0", PORT), Handler).serve_forever() + + +if __name__ == "__main__": + main() diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf index 3a49315a..665b86eb 100644 --- a/stacks/openclaw/main.tf +++ b/stacks/openclaw/main.tf @@ -261,6 +261,19 @@ resource "random_password" "gateway_token" { special = false } +# Prometheus exporter script — read by the openclaw-exporter sidecar. +# Stdlib-only Python so no pip install at startup. Reads sessions JSONL + +# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro). +resource "kubernetes_config_map" "openclaw_exporter" { + metadata { + name = "openclaw-exporter" + namespace = kubernetes_namespace.openclaw.metadata[0].name + } + data = { + "exporter.py" = file("${path.module}/files/exporter.py") + } +} + module "nfs_tools_host" { source = "../../modules/kubernetes/nfs_volume" name = "openclaw-tools-host" @@ -350,6 +363,11 @@ resource "kubernetes_deployment" "openclaw" { } annotations = { "reloader.stakater.com/search" = "true" + # Prometheus auto-discovers pods with these annotations. + # Scraped by the openclaw-exporter sidecar — exposes /metrics on :9099. + "prometheus.io/scrape" = "true" + "prometheus.io/port" = "9099" + "prometheus.io/path" = "/metrics" } } spec { @@ -518,6 +536,54 @@ resource "kubernetes_deployment" "openclaw" { } } + # Sidecar: openclaw-exporter — Prometheus exporter for Codex/OAuth usage. + # Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099. + # Stdlib-only Python; no pip install at startup. 
+ container { + name = "openclaw-exporter" + image = "docker.io/library/python:3.12-slim" + command = ["python3", "/scripts/exporter.py"] + port { + container_port = 9099 + name = "metrics" + } + env { + name = "OPENCLAW_HOME" + value = "/home/node/.openclaw" + } + env { + name = "METRICS_PORT" + value = "9099" + } + volume_mount { + name = "openclaw-exporter-script" + mount_path = "/scripts" + read_only = true + } + volume_mount { + name = "openclaw-home" + mount_path = "/home/node/.openclaw" + read_only = true + } + readiness_probe { + http_get { + path = "/healthz" + port = 9099 + } + initial_delay_seconds = 5 + period_seconds = 30 + } + resources { + requests = { + cpu = "10m" + memory = "64Mi" + } + limits = { + memory = "128Mi" + } + } + } + # Sidecar: modelrelay — auto-routes to fastest healthy free model container { name = "modelrelay" @@ -606,6 +672,13 @@ resource "kubernetes_deployment" "openclaw" { name = kubernetes_config_map.openclaw_config.metadata[0].name } } + volume { + name = "openclaw-exporter-script" + config_map { + name = kubernetes_config_map.openclaw_exporter.metadata[0].name + default_mode = "0555" + } + } } } }
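
Quick sanity check once the sidecar is running (a minimal sketch, not part of the
patch): port-forward :9099 from the pod and dump the per-model message counters
with stdlib Python. The port-forward target name below is an assumption —
substitute the real deployment/pod name.

  # kubectl -n openclaw port-forward deploy/openclaw 9099:9099   (assumed name)
  import urllib.request

  # Fetch the Prometheus text exposition and print the message counters.
  with urllib.request.urlopen("http://localhost:9099/metrics", timeout=5) as resp:
      for line in resp.read().decode().splitlines():
          if line.startswith("openclaw_codex_messages_total"):
              print(line)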