openclaw: realtime usage dashboard via Prometheus exporter sidecar
Stdlib-only Python exporter ($1) reads ~/.openclaw/agents/*/sessions/*.jsonl
(assistant messages with usage) plus auth-profiles.json (OAuth expiry,
Plus-tier label) and exposes Prometheus text format on :9099/metrics.
Container is python:3.12-slim; pod template gets prometheus.io/scrape
annotations so the existing kubernetes-pods job picks it up — no
ServiceMonitor needed.
Metrics exported:
openclaw_codex_messages_total{provider,model,session_kind} counter
openclaw_codex_input/output/cache_read/cache_write_tokens_total
openclaw_codex_message_errors_total{reason}
openclaw_codex_active_sessions{kind} gauge
openclaw_codex_oauth_expiry_seconds{provider,account,plan} gauge
openclaw_codex_last_run_timestamp gauge
Grafana dashboard "OpenClaw — Codex Usage" (Applications folder, 30s
refresh): messages/5h vs Plus rate-card, % of 1,200 floor, tokens/5h,
cache hit %, OAuth expiry days, active sessions, last-turn age, errors,
plus per-model timeseries + bar gauge + error table.
Plus rate-card thresholds in the gauge are conservative (1,200/5h floor;
real cap is dynamic 1,200–7,000). Re-baseline if throttling shows up
below 80%.
This commit is contained in:
parent
115ca184ff
commit
41655096c7
4 changed files with 814 additions and 0 deletions
476
stacks/monitoring/modules/monitoring/dashboards/openclaw.json
Normal file
476
stacks/monitoring/modules/monitoring/dashboards/openclaw.json
Normal file
|
|
@ -0,0 +1,476 @@
|
|||
{
|
||||
"annotations": {"list": []},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["openclaw", "ai", "codex"],
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "OpenClaw — Codex Usage",
|
||||
"uid": "openclaw-codex",
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"type": "row",
|
||||
"id": 100,
|
||||
"title": "Now",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 1,
|
||||
"title": "Messages last 5h — gpt-5.4-mini",
|
||||
"description": "Plus rate-card lower bound: 1,200 / 5h. Hard cap at the upper bound: 7,000 / 5h.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 960},
|
||||
{"color": "orange", "value": 1500},
|
||||
{"color": "red", "value": 5600}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "gauge",
|
||||
"id": 2,
|
||||
"title": "% of Plus 5h floor (1,200 cap)",
|
||||
"description": "Conservative gauge against the lower bound of the published rate-card. Real ceiling depends on dynamic allocation (1,200–7,000). Re-baseline if you observe throttling at <80%.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"decimals": 1,
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 60},
|
||||
{"color": "orange", "value": 80},
|
||||
{"color": "red", "value": 95}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "100 * sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h])) / 1200",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 3,
|
||||
"title": "Tokens last 5h (input + output, codex)",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 4,
|
||||
"title": "Cache hit ratio (codex, 5h)",
|
||||
"description": "cacheRead / (cacheRead + input). Higher is better — caching cuts effective Plus quota burn.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"decimals": 1,
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": null},
|
||||
{"color": "yellow", "value": 30},
|
||||
{"color": "green", "value": 60}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "100 * sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])) / clamp_min(sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])), 1)",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 5,
|
||||
"title": "OAuth token expiry",
|
||||
"description": "Days until the openai-codex OAuth token expires. Re-run `openclaw models auth login --provider openai-codex` before this hits 0.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 1,
|
||||
"unit": "d",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": null},
|
||||
{"color": "orange", "value": 1},
|
||||
{"color": "yellow", "value": 3},
|
||||
{"color": "green", "value": 5}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "max(openclaw_codex_oauth_expiry_seconds{provider=\"openai-codex\"}) / 86400",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 6,
|
||||
"title": "Active sessions",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": true},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "openclaw_codex_active_sessions",
|
||||
"legendFormat": "{{kind}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 7,
|
||||
"title": "Last assistant turn",
|
||||
"description": "Time since the latest assistant message landed in any session.",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1800},
|
||||
{"color": "orange", "value": 7200},
|
||||
{"color": "red", "value": 86400}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "time() - openclaw_codex_last_run_timestamp",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"id": 8,
|
||||
"title": "Errors last 24h",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 6},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 10}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(increase(openclaw_codex_message_errors_total[24h]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 200,
|
||||
"title": "Over time",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 11},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 10,
|
||||
"title": "Messages / min by model",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 12},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 60,
|
||||
"lineWidth": 1,
|
||||
"stacking": {"mode": "normal"}
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["sum"]},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model) (rate(openclaw_codex_messages_total[1m])) * 60",
|
||||
"legendFormat": "{{provider}}/{{model}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"id": 11,
|
||||
"title": "Tokens / min by type (codex)",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 25,
|
||||
"lineWidth": 2,
|
||||
"stacking": {"mode": "none"}
|
||||
},
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "input",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "output",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum(rate(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
|
||||
"legendFormat": "cache_read",
|
||||
"refId": "C"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "bargauge",
|
||||
"id": 12,
|
||||
"title": "Messages / 5h by model",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"showUnfilled": true,
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": 0,
|
||||
"decimals": 0,
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 100},
|
||||
{"color": "orange", "value": 500},
|
||||
{"color": "red", "value": 1000}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model) (increase(openclaw_codex_messages_total[5h]))",
|
||||
"legendFormat": "{{provider}}/{{model}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "row",
|
||||
"id": 300,
|
||||
"title": "Errors",
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 28},
|
||||
"collapsed": false,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"id": 20,
|
||||
"title": "Recent errors by model and reason",
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 29},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {"align": "auto", "displayMode": "auto"}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Value"},
|
||||
"properties": [
|
||||
{"id": "displayName", "value": "Errors (24h)"},
|
||||
{"id": "custom.displayMode", "value": "color-background"},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 10}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
|
||||
"expr": "sum by (provider, model, reason) (increase(openclaw_codex_message_errors_total[24h])) > 0",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true, "namespace": true, "pod": true, "app": true},
|
||||
"indexByName": {"provider": 0, "model": 1, "reason": 2, "Value": 3},
|
||||
"renameByName": {}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -134,6 +134,7 @@ locals {
|
|||
# Applications
|
||||
"qbittorrent.json" = "Applications"
|
||||
"realestate-crawler.json" = "Applications"
|
||||
"openclaw.json" = "Applications"
|
||||
"uk-payslip.json" = "Finance (Personal)"
|
||||
"wealth.json" = "Finance (Personal)"
|
||||
"job-hunter.json" = "Finance"
|
||||
|
|
|
|||
264
stacks/openclaw/files/exporter.py
Normal file
264
stacks/openclaw/files/exporter.py
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
#!/usr/bin/env python3
|
||||
"""OpenClaw / Codex usage exporter.
|
||||
|
||||
Reads ~/.openclaw/agents/*/sessions/*.jsonl (assistant messages with usage)
|
||||
and ~/.openclaw/agents/*/agent/auth-profiles.json (OAuth profiles), then exposes
|
||||
Prometheus text-format metrics on :9099/metrics. Stdlib only — no pip install
|
||||
needed at startup.
|
||||
|
||||
Metrics (all cumulative-since-session-start; use Prometheus increase()/rate()
|
||||
for windowed views):
|
||||
|
||||
openclaw_codex_messages_total{provider,model,session_kind} counter
|
||||
openclaw_codex_input_tokens_total{provider,model} counter
|
||||
openclaw_codex_output_tokens_total{provider,model} counter
|
||||
openclaw_codex_cache_read_tokens_total{provider,model} counter
|
||||
openclaw_codex_cache_write_tokens_total{provider,model} counter
|
||||
openclaw_codex_message_errors_total{provider,model,reason} counter
|
||||
openclaw_codex_active_sessions{kind} gauge
|
||||
openclaw_codex_oauth_expiry_seconds{provider,account,plan} gauge
|
||||
openclaw_codex_last_run_timestamp gauge
|
||||
openclaw_codex_exporter_scrape_duration_ms gauge
|
||||
"""
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from threading import Lock
|
||||
|
||||
OPENCLAW_HOME = os.environ.get("OPENCLAW_HOME", "/home/node/.openclaw")
|
||||
PORT = int(os.environ.get("METRICS_PORT", "9099"))
|
||||
CACHE_SEC = float(os.environ.get("CACHE_SEC", "5"))
|
||||
SKIP_FRAGMENTS = (".broken.", ".reset.", ".deleted.", ".bak.")
|
||||
SESSION_RE = re.compile(r"^([0-9a-f-]{36})\.jsonl$")
|
||||
|
||||
_lock = Lock()
|
||||
_cache = {"text": "", "ts": 0.0}
|
||||
|
||||
|
||||
def _esc(value: str) -> str:
|
||||
return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
||||
|
||||
|
||||
def _line(name: str, labels: dict, value) -> str:
|
||||
if labels:
|
||||
rendered = ",".join(f'{k}="{_esc(v)}"' for k, v in sorted(labels.items()))
|
||||
return f"{name}{{{rendered}}} {value}"
|
||||
return f"{name} {value}"
|
||||
|
||||
|
||||
def _kind_for(session_id: str, sessions_index: dict) -> str:
|
||||
for key, val in sessions_index.items():
|
||||
if val.get("sessionId") != session_id:
|
||||
continue
|
||||
if key.startswith("agent:main:cron:"):
|
||||
return "cron"
|
||||
if key.startswith("telegram:slash:"):
|
||||
return "telegram-slash"
|
||||
if key.startswith("agent:main:"):
|
||||
return "main"
|
||||
surface = (val.get("origin") or {}).get("surface")
|
||||
if surface:
|
||||
return surface
|
||||
return key.split(":", 1)[0]
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _parse_ts(value):
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
|
||||
except ValueError:
|
||||
return 0.0
|
||||
return 0.0
|
||||
|
||||
|
||||
def _build_text() -> str:
|
||||
start = time.monotonic()
|
||||
out = []
|
||||
|
||||
sessions_index: dict = {}
|
||||
for sp in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/sessions.json")):
|
||||
try:
|
||||
with open(sp) as f:
|
||||
sessions_index.update(json.load(f))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
msg_count: dict = {}
|
||||
in_tok: dict = {}
|
||||
out_tok: dict = {}
|
||||
cr_tok: dict = {}
|
||||
cw_tok: dict = {}
|
||||
err_count: dict = {}
|
||||
latest_ts = 0.0
|
||||
|
||||
for jsonl in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/sessions/*.jsonl")):
|
||||
bn = os.path.basename(jsonl)
|
||||
if any(s in bn for s in SKIP_FRAGMENTS):
|
||||
continue
|
||||
m = SESSION_RE.match(bn)
|
||||
if not m:
|
||||
continue
|
||||
sid = m.group(1)
|
||||
kind = _kind_for(sid, sessions_index)
|
||||
try:
|
||||
with open(jsonl) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
if obj.get("type") != "message":
|
||||
continue
|
||||
msg = obj.get("message") or {}
|
||||
if msg.get("role") != "assistant":
|
||||
continue
|
||||
provider = msg.get("provider") or "unknown"
|
||||
model = msg.get("model") or "unknown"
|
||||
usage = msg.get("usage") or {}
|
||||
ts = _parse_ts(obj.get("timestamp"))
|
||||
if ts > latest_ts:
|
||||
latest_ts = ts
|
||||
if msg.get("stopReason") == "error":
|
||||
reason = (msg.get("errorMessage") or "unknown")[:80]
|
||||
ek = (provider, model, reason)
|
||||
err_count[ek] = err_count.get(ek, 0) + 1
|
||||
continue
|
||||
mk = (provider, model, kind)
|
||||
msg_count[mk] = msg_count.get(mk, 0) + 1
|
||||
pm = (provider, model)
|
||||
in_tok[pm] = in_tok.get(pm, 0) + (usage.get("input") or 0)
|
||||
out_tok[pm] = out_tok.get(pm, 0) + (usage.get("output") or 0)
|
||||
cr_tok[pm] = cr_tok.get(pm, 0) + (usage.get("cacheRead") or 0)
|
||||
cw_tok[pm] = cw_tok.get(pm, 0) + (usage.get("cacheWrite") or 0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
out.append("# HELP openclaw_codex_messages_total Cumulative assistant messages")
|
||||
out.append("# TYPE openclaw_codex_messages_total counter")
|
||||
for (p, mdl, k), c in msg_count.items():
|
||||
out.append(_line("openclaw_codex_messages_total",
|
||||
{"provider": p, "model": mdl, "session_kind": k}, c))
|
||||
|
||||
for name, src, hlp in [
|
||||
("openclaw_codex_input_tokens_total", in_tok, "Cumulative input tokens"),
|
||||
("openclaw_codex_output_tokens_total", out_tok, "Cumulative output tokens"),
|
||||
("openclaw_codex_cache_read_tokens_total", cr_tok, "Cumulative cache-read tokens"),
|
||||
("openclaw_codex_cache_write_tokens_total", cw_tok, "Cumulative cache-write tokens"),
|
||||
]:
|
||||
out.append(f"# HELP {name} {hlp}")
|
||||
out.append(f"# TYPE {name} counter")
|
||||
for (p, mdl), c in src.items():
|
||||
out.append(_line(name, {"provider": p, "model": mdl}, c))
|
||||
|
||||
out.append("# HELP openclaw_codex_message_errors_total Cumulative assistant errors")
|
||||
out.append("# TYPE openclaw_codex_message_errors_total counter")
|
||||
for (p, mdl, r), c in err_count.items():
|
||||
out.append(_line("openclaw_codex_message_errors_total",
|
||||
{"provider": p, "model": mdl, "reason": r}, c))
|
||||
|
||||
out.append("# HELP openclaw_codex_active_sessions Active sessions in sessions.json")
|
||||
out.append("# TYPE openclaw_codex_active_sessions gauge")
|
||||
kc: dict = {}
|
||||
for k in sessions_index:
|
||||
if k.startswith("agent:main:cron:"):
|
||||
kk = "cron"
|
||||
elif k.startswith("telegram:slash:"):
|
||||
kk = "telegram-slash"
|
||||
elif k.startswith("agent:main:"):
|
||||
kk = "main"
|
||||
else:
|
||||
kk = k.split(":", 1)[0]
|
||||
kc[kk] = kc.get(kk, 0) + 1
|
||||
for k, c in kc.items():
|
||||
out.append(_line("openclaw_codex_active_sessions", {"kind": k}, c))
|
||||
|
||||
if latest_ts:
|
||||
out.append("# HELP openclaw_codex_last_run_timestamp Unix ts of newest assistant message")
|
||||
out.append("# TYPE openclaw_codex_last_run_timestamp gauge")
|
||||
out.append(_line("openclaw_codex_last_run_timestamp", {}, latest_ts))
|
||||
|
||||
out.append("# HELP openclaw_codex_oauth_expiry_seconds Seconds until OAuth token expires")
|
||||
out.append("# TYPE openclaw_codex_oauth_expiry_seconds gauge")
|
||||
now = time.time()
|
||||
for af in glob.glob(os.path.join(OPENCLAW_HOME, "agents/*/agent/auth-profiles.json")):
|
||||
try:
|
||||
with open(af) as f:
|
||||
data = json.load(f)
|
||||
except Exception:
|
||||
continue
|
||||
# Schema: {"version": 1, "profiles": {"<id>": {...}}}.
|
||||
# `expires` is Unix milliseconds.
|
||||
for profile in (data.get("profiles") or {}).values():
|
||||
exp_ms = profile.get("expires")
|
||||
if not isinstance(exp_ms, (int, float)):
|
||||
continue
|
||||
exp_ts = exp_ms / 1000.0
|
||||
out.append(_line(
|
||||
"openclaw_codex_oauth_expiry_seconds",
|
||||
{
|
||||
"provider": profile.get("provider", "unknown"),
|
||||
"account": profile.get("email") or profile.get("account") or "unknown",
|
||||
"plan": profile.get("chatgptPlanType") or "unknown",
|
||||
},
|
||||
max(0, exp_ts - now),
|
||||
))
|
||||
|
||||
out.append("# HELP openclaw_codex_exporter_scrape_duration_ms Last scrape duration ms")
|
||||
out.append("# TYPE openclaw_codex_exporter_scrape_duration_ms gauge")
|
||||
out.append(_line("openclaw_codex_exporter_scrape_duration_ms", {},
|
||||
(time.monotonic() - start) * 1000))
|
||||
|
||||
return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
if self.path == "/healthz":
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain")
|
||||
self.end_headers()
|
||||
self.wfile.write(b"ok\n")
|
||||
return
|
||||
if self.path != "/metrics":
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
return
|
||||
with _lock:
|
||||
now = time.time()
|
||||
if now - _cache["ts"] > CACHE_SEC:
|
||||
try:
|
||||
_cache["text"] = _build_text()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_cache["text"] = (
|
||||
f'openclaw_codex_exporter_errors_total{{kind="scrape"}} 1\n'
|
||||
f'# scrape error: {_esc(str(exc))[:200]}\n'
|
||||
)
|
||||
_cache["ts"] = now
|
||||
body = _cache["text"].encode()
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
self.wfile.write(body)
|
||||
|
||||
def log_message(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
def main():
|
||||
print(f"openclaw exporter listening on :{PORT}", flush=True)
|
||||
HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -261,6 +261,19 @@ resource "random_password" "gateway_token" {
|
|||
special = false
|
||||
}
|
||||
|
||||
# Prometheus exporter script — read by the openclaw-exporter sidecar.
|
||||
# Stdlib-only Python so no pip install at startup. Reads sessions JSONL +
|
||||
# auth-profiles.json from the NFS-backed openclaw home volume (mounted ro).
|
||||
resource "kubernetes_config_map" "openclaw_exporter" {
|
||||
metadata {
|
||||
name = "openclaw-exporter"
|
||||
namespace = kubernetes_namespace.openclaw.metadata[0].name
|
||||
}
|
||||
data = {
|
||||
"exporter.py" = file("${path.module}/files/exporter.py")
|
||||
}
|
||||
}
|
||||
|
||||
module "nfs_tools_host" {
|
||||
source = "../../modules/kubernetes/nfs_volume"
|
||||
name = "openclaw-tools-host"
|
||||
|
|
@ -350,6 +363,11 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
}
|
||||
annotations = {
|
||||
"reloader.stakater.com/search" = "true"
|
||||
# Prometheus auto-discovers pods with these annotations.
|
||||
# Scraped by the openclaw-exporter sidecar — exposes /metrics on :9099.
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/port" = "9099"
|
||||
"prometheus.io/path" = "/metrics"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -518,6 +536,54 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
}
|
||||
}
|
||||
|
||||
# Sidecar: openclaw-exporter — Prometheus exporter for Codex/OAuth usage.
|
||||
# Reads sessions JSONL files + auth-profiles.json, exposes /metrics on :9099.
|
||||
# Stdlib-only Python; no pip install at startup.
|
||||
container {
|
||||
name = "openclaw-exporter"
|
||||
image = "docker.io/library/python:3.12-slim"
|
||||
command = ["python3", "/scripts/exporter.py"]
|
||||
port {
|
||||
container_port = 9099
|
||||
name = "metrics"
|
||||
}
|
||||
env {
|
||||
name = "OPENCLAW_HOME"
|
||||
value = "/home/node/.openclaw"
|
||||
}
|
||||
env {
|
||||
name = "METRICS_PORT"
|
||||
value = "9099"
|
||||
}
|
||||
volume_mount {
|
||||
name = "openclaw-exporter-script"
|
||||
mount_path = "/scripts"
|
||||
read_only = true
|
||||
}
|
||||
volume_mount {
|
||||
name = "openclaw-home"
|
||||
mount_path = "/home/node/.openclaw"
|
||||
read_only = true
|
||||
}
|
||||
readiness_probe {
|
||||
http_get {
|
||||
path = "/healthz"
|
||||
port = 9099
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 30
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "64Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "128Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Sidecar: modelrelay — auto-routes to fastest healthy free model
|
||||
container {
|
||||
name = "modelrelay"
|
||||
|
|
@ -606,6 +672,13 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
name = kubernetes_config_map.openclaw_config.metadata[0].name
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "openclaw-exporter-script"
|
||||
config_map {
|
||||
name = kubernetes_config_map.openclaw_exporter.metadata[0].name
|
||||
default_mode = "0555"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue