fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-09 08:45:33 +00:00
parent 6d224861c4
commit fd0f4a0365
1166 changed files with 358546 additions and 0 deletions

42
stacks/monitoring/main.tf Normal file
View file

@ -0,0 +1,42 @@
# =============================================================================
# Monitoring Stack - Prometheus / Grafana / Loki
# =============================================================================
# Root of the monitoring stack: declares the stack inputs, reads two Vault KV v2
# secrets, and hands everything to the local ./modules/monitoring module.

# --- Stack inputs (all plain strings) ----------------------------------------
variable "tls_secret_name" {
  type = string
}

variable "nfs_server" {
  type = string
}

variable "mysql_host" {
  type = string
}

variable "monitoring_idrac_username" {
  type = string
}

# --- Vault secrets -----------------------------------------------------------
# secret/platform: source of the alertmanager, iDRAC, Tuya, HAOS, PVE and
# Grafana credentials passed to the module below.
data "vault_kv_secret_v2" "secrets" {
  mount = "secret"
  name  = "platform"
}

# secret/viktor: source of the registry credentials and the Forgejo pull token.
data "vault_kv_secret_v2" "viktor" {
  mount = "secret"
  name  = "viktor"
}

# --- Monitoring module -------------------------------------------------------
module "monitoring" {
  source = "./modules/monitoring"

  # Plain inputs forwarded from stack variables.
  tls_secret_name = var.tls_secret_name
  nfs_server      = var.nfs_server
  mysql_host      = var.mysql_host
  idrac_username  = var.monitoring_idrac_username

  # NOTE(review): var.kube_config_path and local.tiers are not declared in this
  # file - presumably supplied by a sibling/generated file; confirm.
  kube_config_path = var.kube_config_path
  tier             = local.tiers.cluster

  # Values read from secret/platform.
  alertmanager_account_password = data.vault_kv_secret_v2.secrets.data["alertmanager_account_password"]
  alertmanager_slack_api_url    = data.vault_kv_secret_v2.secrets.data["alertmanager_slack_api_url"]
  idrac_password                = data.vault_kv_secret_v2.secrets.data["monitoring_idrac_password"]
  tiny_tuya_service_secret      = data.vault_kv_secret_v2.secrets.data["tiny_tuya_service_secret"]
  haos_api_token                = data.vault_kv_secret_v2.secrets.data["haos_api_token"]
  pve_password                  = data.vault_kv_secret_v2.secrets.data["pve_password"]
  grafana_admin_password        = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"]

  # Values read from secret/viktor.
  registry_user     = data.vault_kv_secret_v2.viktor.data["registry_user"]
  registry_password = data.vault_kv_secret_v2.viktor.data["registry_password"]

  # try() so apply succeeds before the Vault key is populated during Phase 0
  # bootstrap (see docs/runbooks/forgejo-registry-setup.md). Empty token =
  # probe will report an auth failure and fire RegistryCatalogInaccessible -
  # that's the intended visible-broken state until the PAT is created.
  forgejo_pull_token = try(data.vault_kv_secret_v2.viktor.data["forgejo_pull_token"], "")
}

View file

@ -0,0 +1,27 @@
# dockerhub: viktorbarzin/redfish-exporter
# repo: https://pkg.go.dev/github.com/jenningsloy318/redfish_exporter#section-readme
#
# Two-stage build: stage 1 clones and builds redfish_exporter from source,
# stage 2 copies only the resulting binary into the runtime image.

# --- Stage 1: build ----------------------------------------------------------
FROM golang:rc-bullseye AS builder
LABEL maintainer="Viktor Barzin <me@viktorbarzin.me>"

# NOTE: ARCH and GO_VERSION are declared but not referenced by any instruction
# in this Dockerfile; kept so existing `--build-arg`/env consumers keep working.
ARG ARCH=amd64

# Use the `ENV key=value` form; the whitespace-separated legacy form is
# discouraged by the Dockerfile reference and flagged by build checks.
ENV GOROOT=/usr/local/go
ENV GOPATH=/go
ENV PATH="$GOROOT/bin:$GOPATH/bin:$PATH"
ENV GO_VERSION=1.15.2
ENV GO111MODULE=on

# Build dependencies: fetch the upstream sources into GOPATH and build them.
RUN mkdir -p /go/src/github.com/ && \
    git clone https://github.com/jenningsloy318/redfish_exporter /go/src/github.com/jenningsloy318/redfish_exporter && \
    cd /go/src/github.com/jenningsloy318/redfish_exporter && \
    make build

# --- Stage 2: runtime --------------------------------------------------------
FROM golang:rc-bullseye
COPY --from=builder /go/src/github.com/jenningsloy318/redfish_exporter/build/redfish_exporter /usr/local/bin/redfish_exporter

# -p keeps the step idempotent if the base image ever ships this directory.
RUN mkdir -p /etc/prometheus

# The config file is mounted at runtime (it is not baked into the image).
CMD ["/usr/local/bin/redfish_exporter", "--config.file", "/etc/prometheus/redfish_exporter.yml"]

View file

@ -0,0 +1,289 @@
alloy:
# Resource limits for the alloy container itself.
# Must be under `alloy.resources` (NOT `controller.resources`) — the chart
# only maps THIS key onto the alloy container. Without it, the container gets
# `resources: {}` and inherits Kyverno LimitRange `tier-defaults` (256Mi),
# which is below Alloy's 400-450Mi steady state and caused page-cache
# thrashing → 185 MB/s sdc reads → host IO saturation (2026-05-26).
# Burstable QoS (request < limit) — workers are at 97-99% memory-request
# saturation; a 1Gi request blocks scheduling on node2/node3.
resources:
requests:
cpu: 50m
memory: 512Mi
limits:
memory: 1Gi
configMap:
content: |-
// Write your Alloy config here:
logging {
level = "info"
format = "logfmt"
}
loki.write "default" {
endpoint {
url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
}
}
// discovery.kubernetes allows you to find scrape targets from Kubernetes resources.
// It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.
discovery.kubernetes "pod" {
role = "pod"
}
// discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
// If no rules are defined, then the input targets are exported as-is.
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pod.targets
// Drop high-volume, low-value producers from Loki to cut sdc write wear
// (the log PVC is on the contended sdc HDD). goflow2 emits one JSON line
// per NetFlow record to stdout (~8 GB/day, ~64% of all cluster logs) but
// we only use its Prometheus aggregate metrics, not the per-flow logs;
// vpa = Goldilocks/VPA recommender chatter (~1.3 GB/day). Both reversible
// — remove this rule to ship them again. (Added 2026-06-05.)
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_name"]
separator = "/"
regex = "monitoring/goflow2-.*|vpa/.*"
action = "drop"
}
// Label creation - "namespace" field from "__meta_kubernetes_namespace"
rule {
source_labels = ["__meta_kubernetes_namespace"]
action = "replace"
target_label = "namespace"
}
// Label creation - "pod" field from "__meta_kubernetes_pod_name"
rule {
source_labels = ["__meta_kubernetes_pod_name"]
action = "replace"
target_label = "pod"
}
// Label creation - "container" field from "__meta_kubernetes_pod_container_name"
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "container"
}
// Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
action = "replace"
target_label = "app"
}
// Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"
// Concatenate values __meta_kubernetes_namespace/__meta_kubernetes_pod_container_name
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "job"
separator = "/"
replacement = "$1"
}
// Label creation - "__path__" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"
// Builds the glob /var/log/pods/*<pod_uid>/<container_name>/*.log ($1 is the two source labels joined by "/")
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "__path__"
separator = "/"
replacement = "/var/log/pods/*$1/*.log"
}
// Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"
rule {
source_labels = ["__meta_kubernetes_pod_container_id"]
action = "replace"
target_label = "container_runtime"
regex = "^(\\S+):\\/\\/.+$"
replacement = "$1"
}
}
// local.file_match expands the /var/log/pods/*<uid>/<container>/*.log globs
// that discovery.relabel.pod_logs writes into __path__ (doublestar) into
// concrete file targets. loki.source.file does NOT expand globs itself, so
// feeding it the glob directly makes it stat() the literal `*` path and ship
// ZERO pod logs (regression found 2026-06-05 — this component was missing;
// only node/Pi journals were reaching Loki). See Grafana Alloy docs
// "local.file_match > Send Kubernetes Pod logs to Loki".
local.file_match "pod_logs" {
path_targets = discovery.relabel.pod_logs.output
}
// loki.source.file tails pod logs from /var/log/pods/* on the host filesystem.
// Previously used loki.source.kubernetes (apiserver streaming) which drove
// kube-apiserver `CONNECT pods/log` to ~13 req/s + ~2200 sec/s of streams.
loki.source.file "pod_logs" {
targets = local.file_match.pod_logs.targets
forward_to = [loki.process.pod_logs.receiver]
}
// loki.process receives log entries from other Loki components, applies one or more processing stages,
// and forwards the results to the list of receivers in the component's arguments.
loki.process "pod_logs" {
// Parse the containerd CRI wrapper ("<ts> <stream> <flags> <msg>") so Loki
// stores the clean message + the real timestamp instead of the raw prefixed
// line. All cluster nodes run containerd, so a bare stage.cri is correct.
stage.cri { }
// Drop benign public-SMTP scanner noise from the mailserver pod only:
// unknown[unknown] probes that never complete TLS/PROXY + postscreen
// half-open drops (~9k lines/hr, the cluster's #1 Loki error source).
// Real delivery logs and real-IP SASL failures are KEPT; CrowdSec bans
// these scanner IPs independently, so security posture is unchanged.
// Reversible — delete this stage to ship the lines again.
stage.match {
selector = `{namespace="mailserver"}`
stage.drop {
expression = `.*(getpeername: Transport endpoint is not connected -- dropping|SSL_accept error from unknown\[unknown\]|Connection rate limit exceeded: [0-9]+ from unknown\[unknown\]).*`
drop_counter_reason = "mailserver_scanner_noise"
}
}
// Drop the cosmetic k8s deprecation warning calico-typha emits because
// it still WATCHes the core v1 Endpoints API (~342 lines/hr across 3
// typha pods). The v1 Endpoints API will essentially never be removed
// (KEP-4974), so this is pure log noise — Calico is healthy and no
// Calico release has moved typha's watch to EndpointSlice. Real
// calico-system warnings/errors are KEPT. Reversible — delete to ship.
stage.match {
selector = `{namespace="calico-system"}`
stage.drop {
expression = `.*v1 Endpoints is deprecated in v1\.33\+; use discovery\.k8s\.io/v1 EndpointSlice.*`
drop_counter_reason = "calico_endpoints_deprecation_warning"
}
}
stage.static_labels {
values = {
cluster = "default",
}
}
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
loki.relabel "journal" {
forward_to = []
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
loki.process "journal" {
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit/audit.log
// (kube-apiserver --audit-log-path on k8s-master; rotated siblings stay in
// the audit/ subdir). Requires alloy.mounts.varlog=true to mount /var/log
// from the host. Enabled 2026-06-06 once apiserver audit actually started
// writing — see infra/scripts/k8s-apiserver-audit-policy.yaml.
local.file_match "audit_logs" {
path_targets = [{
__path__ = "/var/log/kubernetes/audit/audit.log",
job = "kubernetes-audit",
node = env("HOSTNAME"),
}]
}
loki.source.file "audit_logs" {
targets = local.file_match.audit_logs.targets
forward_to = [loki.write.default.receiver]
}
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
controller:
# Bump maxUnavailable above the chart default (1) so a 5-node DS finishes its
# rolling update inside the helm_release timeout. Log shipper tolerates the
# brief gap.
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 50%
volumes:
extra:
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Schedule on control-plane node too so we can tail
# /var/log/kubernetes/audit/audit.log from kube-apiserver. Without this, K8s
# audit log shipping (wave 1 K2-K9 alert rules) has no source. control-plane
# has the standard NoSchedule taint.
tolerations:
- key: "node-role.kubernetes.io/control-plane"
operator: "Exists"
effect: "NoSchedule"

View file

@ -0,0 +1,220 @@
# =============================================================================
# Authentik walling-off guard
# =============================================================================
# Detects regressions where a service that MUST work WITHOUT Authentik SSO gets
# accidentally walled off - i.e. an ingress that should be `auth = "none"` (or a
# path-scoped carve-out) starts returning an Authentik forward-auth 302.
#
# The "walled off" signature (captured live 2026-06-02): a request to a
# must-stay-public URL returns 301/302 whose `Location` header points at
# Authentik:
# https://authentik.viktorbarzin.me/application/o/authorize/?client_id=...
# A correctly-carved path returns a non-redirect (200/400/401/403/404/405/426/...)
# OR a redirect whose Location is NOT Authentik (e.g. a short-link 302).
#
# Mechanism: a tiny blackbox-exporter (below) probes each guarded URL with
# `no_follow_redirects: true` and FAILS the probe iff the `Location` header
# matches Authentik (`fail_if_header_matches`). Prometheus scrapes the probe
# (job `blackbox-authentik-walloff` in extraScrapeConfigs) and the
# `AuthentikWallingOffPublicPath` PrometheusRule (alerting_rules.yml, lane=security)
# routes a firing alert to the #security Slack receiver.
#
# Chosen over a CronJob+pushgateway probe (the apex-probe pattern) because that
# pattern's `pip install`/`apk add` per-run footprint is a known disk-write
# anti-pattern that got status-page-pusher disabled (memory id=559). blackbox is
# a single long-lived deployment - zero per-run disk writes, fully declarative.
#
# ---------------------------------------------------------------------------
# TARGET LIST - HOW TO ADD A NEW CARVE-OUT (one-line edit)
# ---------------------------------------------------------------------------
# When you add a new `auth = "none"` carve-out (or path-scoped carve-out) to any
# stack, add ONE representative GET-able URL here that returns a NON-Authentik
# response today. The map key becomes the `service` label on the probe metric
# and the alert. Verify with:
# curl -s -o /dev/null -w '%{http_code} %{redirect_url}\n' '<url>'
# It must NOT 302 to authentik.viktorbarzin.me before you add it.
# ---------------------------------------------------------------------------
locals {
  # Map of probe targets: key = service name, value = one representative URL for
  # that `auth = "none"` carve-out. Each URL MUST return a non-Authentik response
  # (200/3xx-non-authentik/400/404/426/...) when the carve-out is intact.
  # Probed every 60s; alert fires only on an Authentik 302.
  authentik_walloff_targets = {
    # meshcentral agent/relay paths (auth="none"): native mesh-cert clients.
    # /agent.ashx 404s without WebSocket upgrade headers - non-redirect = OK.
    "meshcentral-agent" = "https://meshcentral.viktorbarzin.me/agent.ashx"

    # uptime-kuma public status page (auth="none" on /status, /api/push, ...).
    "uptime-status" = "https://uptime.viktorbarzin.me/status/infra"

    # shlink REST API health (auth="none"): X-Api-Key self-gated, CORS XHR.
    "shlink-rest-health" = "https://url.viktorbarzin.me/rest/health"

    # rybbit analytics tracker beacon (auth="none"): public sites embed this JS.
    "rybbit-script" = "https://rybbit.viktorbarzin.me/api/script.js"

    # insta2spotify API (auth="none"): browser fetch() XHRs, CORS preflight.
    "insta2spotify-api-health" = "https://insta2spotify.viktorbarzin.me/api/health"

    # k8s-portal setup script (auth="none"): curl-ed by automation, no cookies.
    "k8s-portal-setup-script" = "https://k8s-portal.viktorbarzin.me/setup/script"

    # instagram-poster image derivative endpoint (auth="none"): Meta's fetcher.
    # /image 404s without a query param - non-redirect = OK.
    "instagram-poster-image" = "https://instagram-poster.viktorbarzin.me/image"

    # trading-bot app root (auth="app"): WebAuthn/JWT in-app; was walled, now 200.
    "trading-bot-app" = "https://trading.viktorbarzin.me/"

    # NOTE: openclaw task-webhook (auth="none") is intentionally NOT probed - it
    # has no public DNS record (NXDOMAIN, external_monitor=false), so there is no
    # externally GET-able URL to probe. Its carve-out is internal-only.
  }
}
# --- blackbox-exporter -------------------------------------------------------
# Single-purpose blackbox-exporter. The `http_no_authentik_redirect` module does
# NOT follow redirects and FAILS the probe ONLY when the Location header points
# at Authentik. The status code alone must NEVER fail the probe - carve-outs
# legitimately return 404 (meshcentral /agent.ashx without WS headers,
# instagram-poster /image without a query) or 400/401/403/426, all of which mean
# "carve-out intact". So `valid_status_codes` enumerates every plausible
# non-Authentik response INCLUDING the redirect codes - a redirect is
# status-valid, and the Authentik case is then singled out by
# `fail_if_header_matches` on Location (NOT empty: blackbox treats an empty list
# as "2xx only", which would false-fire on every 404 carve-out).
# probe_failed_due_to_regex isolates the Authentik match even further (used as a
# tie-break in the alert expr).
resource "kubernetes_config_map" "blackbox_exporter_config" {
  metadata {
    name      = "blackbox-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }

  data = {
    "blackbox.yml" = yamlencode({
      modules = {
        http_no_authentik_redirect = {
          prober  = "http"
          timeout = "10s"
          http = {
            method                = "GET"
            no_follow_redirects   = true
            preferred_ip_protocol = "ip4"
            ip_protocol_fallback  = false
            fail_if_not_ssl       = false
            valid_http_versions   = ["HTTP/1.1", "HTTP/2.0"]

            # Every non-Authentik response a carve-out may legitimately return.
            # ALL redirect codes (301/302/303/307/308) are INCLUDED so a
            # redirect passes the status check and is judged solely by the
            # Location header match below; previously 303/307/308 were missing,
            # so e.g. a non-Authentik 308 failed the probe on status alone.
            # 5xx is excluded: a backend 500 isn't a walling-off but is still
            # worth surfacing as a probe failure. The 2xx/3xx/4xx set keeps
            # probe_success==1 for all intact carve-outs (404s included).
            valid_status_codes = [200, 201, 202, 204, 301, 302, 303, 304, 307, 308, 400, 401, 403, 404, 405, 409, 410, 426, 429]

            # FAIL the probe if the response redirects to Authentik. This is the
            # walling-off signature: forward-auth 301/302 -> /application/o/authorize
            # on authentik.viktorbarzin.me (also matches /outpost.goauthentik.io).
            # allow_missing = true: a response with no Location header at all
            # (any non-redirect) must not fail the probe.
            fail_if_header_matches = [
              {
                header        = "Location"
                regexp        = "(authentik\\.viktorbarzin\\.me|/outpost\\.goauthentik\\.io|/application/o/authorize)"
                allow_missing = true
              },
            ]
          }
        }
      }
    })
  }
}
# Single-replica blackbox-exporter that serves the probe module defined in the
# blackbox-exporter-config ConfigMap (mounted at /etc/blackbox_exporter/).
resource "kubernetes_deployment" "blackbox_exporter" {
  metadata {
    name      = "blackbox-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "blackbox-exporter"
      tier = var.tier
    }
    annotations = {
      # Pairs with the "reloader.stakater.com/match" annotation on the ConfigMap.
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "blackbox-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "blackbox-exporter"
        }
      }

      spec {
        container {
          name  = "blackbox-exporter"
          image = "prom/blackbox-exporter:v0.25.0"
          args  = ["--config.file=/etc/blackbox_exporter/blackbox.yml"]

          port {
            container_port = 9115
            name           = "http"
          }

          # Health probes against the exporter's own /-/healthy endpoint so a
          # hung process is restarted and is not served through the Service
          # while unready (previously the container had no probes at all).
          liveness_probe {
            http_get {
              path = "/-/healthy"
              port = 9115
            }
            initial_delay_seconds = 5
            period_seconds        = 30
            timeout_seconds       = 5
            failure_threshold     = 3
          }

          readiness_probe {
            http_get {
              path = "/-/healthy"
              port = 9115
            }
            period_seconds    = 10
            timeout_seconds   = 5
            failure_threshold = 3
          }

          resources {
            requests = {
              cpu    = "5m"
              memory = "24Mi"
            }
            # Memory limit only; no CPU limit is set.
            limits = {
              memory = "48Mi"
            }
          }

          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/blackbox_exporter/"
          }
        }

        volume {
          name = "config-volume"
          config_map {
            name = kubernetes_config_map.blackbox_exporter_config.metadata[0].name
          }
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: monitoring ns is keel-enrolled (policy=patch) - Keel owns the image
    # tag and injects keel.sh annotations. Ignore so TF stops reverting Keel.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# Service in front of the blackbox-exporter pods: selects on app=blackbox-exporter
# and exposes the exporter's HTTP port 9115 under the port name "http".
resource "kubernetes_service" "blackbox_exporter" {
  metadata {
    name      = "blackbox-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    labels = {
      app = "blackbox-exporter"
    }
  }

  spec {
    # Must match the pod template labels of the blackbox-exporter deployment.
    selector = {
      app = "blackbox-exporter"
    }

    port {
      name        = "http"
      port        = 9115
      target_port = 9115
    }
  }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,540 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Backup health overview — K8s CronJob backups",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 0,
"links": [],
"panels": [
{
"title": "Time Since Last Successful Backup",
"type": "stat",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 90000 },
{ "color": "red", "value": 604800 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"orientation": "auto",
"textMode": "auto",
"colorMode": "background",
"graphMode": "none"
},
"targets": [
{
"expr": "time() - backup_last_success_timestamp",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "All Backups — Overview",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},
"overrides": [
  {
    "matcher": { "id": "byName", "options": "Duration (s)" },
    "properties": [{ "id": "unit", "value": "s" }]
  },
  {
    "matcher": { "id": "byName", "options": "Read (MiB)" },
    "properties": [{ "id": "unit", "value": "mbytes" }]
  },
  {
    "matcher": { "id": "byName", "options": "Written (MiB)" },
    "properties": [{ "id": "unit", "value": "mbytes" }]
  },
  {
    "matcher": { "id": "byName", "options": "Output (MiB)" },
    "properties": [{ "id": "unit", "value": "mbytes" }]
  },
  {
    "matcher": { "id": "byName", "options": "Last Success" },
    "properties": [{ "id": "unit", "value": "dateTimeFromNow" }]
  }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "Last Success", "desc": true }]
},
"transformations": [
{
"id": "merge",
"options": {}
},
{
"id": "organize",
"options": {
"renameByName": {
"Value #Duration": "Duration (s)",
"Value #Read": "Read (MiB)",
"Value #Written": "Written (MiB)",
"Value #Output": "Output (MiB)",
"Value #LastSuccess": "Last Success",
"job": "Backup"
},
"excludeByName": {
"Time": true,
"instance": true,
"__name__": true
}
}
}
],
"targets": [
{
"expr": "backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "Duration",
"instant": true,
"format": "table"
},
{
"expr": "backup_read_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Read",
"instant": true,
"format": "table"
},
{
"expr": "backup_written_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Written",
"instant": true,
"format": "table"
},
{
"expr": "backup_output_bytes / 1048576",
"legendFormat": "{{ job }}",
"refId": "Output",
"instant": true,
"format": "table"
},
{
"expr": "backup_last_success_timestamp * 1000",
"legendFormat": "{{ job }}",
"refId": "LastSuccess",
"instant": true,
"format": "table"
}
]
},
{
"title": "Backup Duration Trend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Backup IO Trend (Read + Written)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_read_bytes",
"legendFormat": "{{ job }} read",
"refId": "A"
},
{
"expr": "backup_written_bytes",
"legendFormat": "{{ job }} written",
"refId": "B"
}
]
},
{
"title": "Backup Output Size Trend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "bytes",
"custom": {
"drawStyle": "points",
"pointSize": 8,
"showPoints": "always"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_output_bytes",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "Write Throughput",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "Bps",
"custom": {
"drawStyle": "bars",
"lineWidth": 1,
"fillOpacity": 50,
"pointSize": 5,
"showPoints": "never"
}
},
"overrides": []
},
"options": {
"tooltip": { "mode": "multi", "sort": "desc" },
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
},
"targets": [
{
"expr": "backup_written_bytes / backup_duration_seconds",
"legendFormat": "{{ job }}",
"refId": "A"
}
]
},
{
"title": "LVM Thin Snapshots",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 },
"collapsed": false,
"panels": []
},
{
"title": "Time Since Last LVM Snapshot",
"type": "stat",
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50400 },
{ "color": "red", "value": 90000 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "time() - lvm_snapshot_last_run_timestamp{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "LVM Snapshots",
"refId": "A"
}
]
},
{
"title": "LVM Snapshot Status",
"type": "stat",
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": { "text": "OK", "color": "green" },
"1": { "text": "PARTIAL", "color": "yellow" },
"2": { "text": "ABORTED", "color": "red" }
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "lvm_snapshot_last_status{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Status",
"refId": "A"
}
]
},
{
"title": "Thin Pool Free Space",
"type": "gauge",
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 15 },
{ "color": "green", "value": 30 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"targets": [
{
"expr": "lvm_snapshot_thinpool_free_pct{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Free %",
"refId": "A"
}
]
},
{
"title": "Snapshots Created / Pruned (Last Run)",
"type": "stat",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 49 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "blue", "value": null }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "lvm_snapshot_created_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Created",
"refId": "A"
},
{
"expr": "lvm_snapshot_pruned_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Pruned",
"refId": "B"
},
{
"expr": "lvm_snapshot_failed_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Failed",
"refId": "C"
}
]
},
{
"title": "Active Backup & Snapshot Alerts",
"type": "alertlist",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 55 },
"datasource": { "type": "datasource", "uid": "grafana" },
"options": {
"showOptions": "current",
"maxItems": 20,
"sortOrder": 1,
"dashboardAlerts": false,
"alertName": "",
"stateFilter": {
"firing": true,
"pending": true,
"noData": true,
"normal": false,
"error": true
},
"alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*ThinPool.*\"}",
"folder": { "id": null, "title": "" },
"folderId": null
}
},
{
"title": "CronJob Last Schedule",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 61 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "Value" },
"properties": [{ "id": "unit", "value": "dateTimeAsIso" }]
}
]
},
"options": {
"showHeader": true,
"sortBy": [{ "displayName": "Value", "desc": true }]
},
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"cronjob": "CronJob",
"namespace": "Namespace",
"Value": "Last Scheduled"
},
"excludeByName": {
"Time": true,
"__name__": true,
"instance": true,
"job": true,
"uid": true
}
}
}
],
"targets": [
{
"expr": "kube_cronjob_status_last_schedule_time{cronjob=~\".*backup.*|.*etcd.*|.*raft.*\"} * 1000",
"legendFormat": "",
"refId": "A",
"instant": true,
"format": "table"
}
]
}
],
"templating": {
"list": [
{
"current": {
"text": "Prometheus",
"value": "PBFA97CFB590B2093"
},
"includeAll": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": { "from": "now-7d", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Backup Health",
"uid": "backup-health",
"version": 1,
"schemaVersion": 39
}

View file

@ -0,0 +1,138 @@
{
"annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "grafana", "uid": "-- Grafana --" }, "enable": true, "hide": true, "name": "Annotations & Alerts", "type": "dashboard" } ] },
"description": "Cluster-wide log observability over Loki. Pod logs are shipped by Grafana Alloy (labels namespace/pod/container/app); node + Sofia-Pi system logs come from the journald jobs. Filter with the namespace/app/pod dropdowns and the free-text search box; error/warn panels use case-insensitive regex line-filters so they work regardless of level-label availability.",
"editable": true,
"graphTooltip": 1,
"liveNow": false,
"schemaVersion": 39,
"tags": ["logs", "loki", "cluster"],
"time": { "from": "now-1h", "to": "now" },
"timezone": "",
"title": "Cluster Logs",
"uid": "cluster-logs",
"version": 1,
"templating": {
"list": [
{
"name": "namespace", "label": "Namespace", "type": "query",
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"query": "label_values(namespace)", "definition": "label_values(namespace)",
"multi": true, "includeAll": true, "allValue": ".+", "refresh": 2, "sort": 1,
"current": { "text": "All", "value": "$__all" }
},
{
"name": "app", "label": "App", "type": "query",
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"query": "label_values({namespace=~\"$namespace\"}, app)", "definition": "label_values({namespace=~\"$namespace\"}, app)",
"multi": true, "includeAll": true, "allValue": ".+", "refresh": 2, "sort": 1,
"current": { "text": "All", "value": "$__all" }
},
{
"name": "pod", "label": "Pod", "type": "query",
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"query": "label_values({namespace=~\"$namespace\"}, pod)", "definition": "label_values({namespace=~\"$namespace\"}, pod)",
"multi": true, "includeAll": true, "allValue": ".+", "refresh": 2, "sort": 1,
"current": { "text": "All", "value": "$__all" }
},
{
"name": "search", "label": "Search (regex, case-insensitive)", "type": "textbox",
"query": "", "current": { "text": "", "value": "" }
}
]
},
"panels": [
{ "type": "row", "title": "Cluster Pod Logs (Alloy)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 100, "collapsed": false },
{
"type": "stat", "title": "Lines (range)", "id": 1,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"options": { "colorMode": "value", "graphMode": "area", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } },
"fieldConfig": { "defaults": { "color": { "mode": "fixed", "fixedColor": "blue" }, "unit": "short" }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum(count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" [$__range]))", "queryType": "instant", "refId": "A" } ]
},
{
"type": "stat", "title": "Errors (range)", "id": 2,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"options": { "colorMode": "background", "graphMode": "area", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } },
"fieldConfig": { "defaults": { "unit": "short", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 50 } ] } }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum(count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" |~ \"(?i)(error|fatal|panic|exception)\" [$__range]))", "queryType": "instant", "refId": "A" } ]
},
{
"type": "stat", "title": "Warnings (range)", "id": 3,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"options": { "colorMode": "background", "graphMode": "area", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } },
"fieldConfig": { "defaults": { "unit": "short", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 } ] } }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum(count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" |~ \"(?i)(warn)\" [$__range]))", "queryType": "instant", "refId": "A" } ]
},
{
"type": "stat", "title": "Active namespaces", "id": 4,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } },
"fieldConfig": { "defaults": { "color": { "mode": "fixed", "fixedColor": "purple" }, "unit": "short" }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "count(sum by (namespace) (count_over_time({namespace=~\"$namespace\"} [$__range])))", "queryType": "instant", "refId": "A" } ]
},
{
"type": "timeseries", "title": "Log volume by namespace (top 5)", "id": 5,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"options": { "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "bars", "fillOpacity": 60, "lineWidth": 0, "stacking": { "mode": "normal", "group": "A" } }, "unit": "short" }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "topk(5, sum by (namespace) (count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" [$__auto])))", "legendFormat": "{{namespace}}", "queryType": "range", "refId": "A" } ]
},
{
"type": "timeseries", "title": "Error / Warning rate", "id": 6,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"options": { "legend": { "calcs": ["sum", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "fillOpacity": 15, "lineWidth": 2, "showPoints": "never" }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "errors" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "red" } } ] }, { "matcher": { "id": "byName", "options": "warnings" }, "properties": [ { "id": "color", "value": { "mode": "fixed", "fixedColor": "orange" } } ] } ] },
"targets": [
{ "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum(count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" |~ \"(?i)(error|fatal|panic|exception)\" [$__auto]))", "legendFormat": "errors", "queryType": "range", "refId": "A" },
{ "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum(count_over_time({namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\" |~ \"(?i)(warn)\" [$__auto]))", "legendFormat": "warnings", "queryType": "range", "refId": "B" }
]
},
{
"type": "table", "title": "Top namespaces by errors (range)", "id": 7,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
"options": { "showHeader": true, "sortBy": [ { "displayName": "Value", "desc": true } ] },
"fieldConfig": { "defaults": { "custom": { "align": "auto", "filterable": false } }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "topk(10, sum by (namespace) (count_over_time({namespace=~\"$namespace\"} |~ \"(?i)(error|fatal|panic|exception)\" [$__range])))", "queryType": "instant", "format": "table", "instant": true, "refId": "A" } ],
"transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true }, "renameByName": { "Value": "errors" } } } ]
},
{
"type": "table", "title": "Top pods by errors (range)", "id": 8,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
"options": { "showHeader": true, "sortBy": [ { "displayName": "Value", "desc": true } ] },
"fieldConfig": { "defaults": { "custom": { "align": "auto", "filterable": false } }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "topk(10, sum by (namespace, pod) (count_over_time({namespace=~\"$namespace\"} |~ \"(?i)(error|fatal|panic|exception)\" [$__range])))", "queryType": "instant", "format": "table", "instant": true, "refId": "A" } ],
"transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true }, "renameByName": { "Value": "errors" } } } ]
},
{
"type": "logs", "title": "Live logs {namespace=~$namespace, app=~$app, pod=~$pod} |~ search", "id": 9,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 21 },
"options": { "showTime": true, "showLabels": false, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "dedupStrategy": "none", "sortOrder": "Descending" },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "{namespace=~\"$namespace\", app=~\"$app\", pod=~\"$pod\"} |~ \"(?i)$search\"", "queryType": "range", "refId": "A" } ]
},
{ "type": "row", "title": "Node & Device Journals (systemd)", "gridPos": { "h": 1, "w": 24, "x": 0, "y": 33 }, "id": 200, "collapsed": false },
{
"type": "timeseries", "title": "Journal volume by level", "id": 10,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 },
"options": { "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "drawStyle": "bars", "fillOpacity": 60, "lineWidth": 0, "stacking": { "mode": "normal", "group": "A" } }, "unit": "short" }, "overrides": [] },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "sum by (level) (count_over_time({job=~\"node-journal|rpi-sofia-journal\"} [$__auto]))", "legendFormat": "{{level}}", "queryType": "range", "refId": "A" } ]
},
{
"type": "logs", "title": "Journal errors & warnings (nodes + rpi-sofia)", "id": 11,
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 },
"options": { "showTime": true, "showLabels": true, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "dedupStrategy": "none", "sortOrder": "Descending" },
"targets": [ { "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" }, "expr": "{job=~\"node-journal|rpi-sofia-journal\", level=~\"emerg|alert|crit|error|warning\"}", "queryType": "range", "refId": "A" } ]
}
]
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,271 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {"type": "datasource", "uid": "grafana"},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Cost-of-living per city — Numbeo + Expatistan snapshots cached in fire_planner.col_snapshot (1-year TTL). Powers the FIRE simulator's auto-COL adjustment. Use the city dropdown to drill into one city; the bottom panels rank all cities.",
"editable": true,
"fiscalYearStartMonth": 0,
"id": null,
"templating": {
"list": [
{
"name": "city",
"type": "query",
"label": "City",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"query": "SELECT DISTINCT city_slug AS __value, city_display AS __text FROM fire_planner.col_snapshot ORDER BY city_display",
"refresh": 1,
"includeAll": false,
"multi": false,
"current": {"selected": false, "text": "Sofia", "value": "sofia"}
},
{
"name": "baseline_city",
"type": "query",
"label": "Baseline city (for ratio)",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"query": "SELECT DISTINCT city_slug AS __value, city_display AS __text FROM fire_planner.col_snapshot ORDER BY city_display",
"refresh": 1,
"includeAll": false,
"multi": false,
"current": {"selected": false, "text": "London", "value": "london"}
}
]
},
"links": [],
"panels": [
{
"id": 1,
"title": "$city — Total monthly cost (with rent, single person)",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "currencyGBP",
"decimals": 0,
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1500},
{"color": "orange", "value": 2500},
{"color": "red", "value": 3500}
]
}
},
"overrides": []
},
"options": {"colorMode": "background", "graphMode": "none", "textMode": "value_and_name", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT total_with_rent_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1"
}
]
},
{
"id": 2,
"title": "$city — Without rent",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 0},
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"colorMode": "value", "graphMode": "none", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT total_no_rent_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1"
}
]
},
{
"id": 3,
"title": "$city — 1-bed rent (center)",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 0},
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"colorMode": "value", "graphMode": "none", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT rent_1bed_center_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1"
}
]
},
{
"id": 4,
"title": "COL ratio vs $baseline_city",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"description": "Multiplier the simulator applies to spending_gbp when moving from $baseline_city to $city. 0.5x = half-the-price; 1.0x = same; 2.0x = double.",
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "none",
"decimals": 2,
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 0.6},
{"color": "orange", "value": 0.85},
{"color": "red", "value": 1.0}
]
}
},
"overrides": []
},
"options": {"colorMode": "background", "graphMode": "none", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT (c.total_with_rent_gbp / b.total_with_rent_gbp)::numeric(8,3) AS ratio FROM (SELECT total_with_rent_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1) c, (SELECT total_with_rent_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$baseline_city' ORDER BY fetched_at DESC LIMIT 1) b"
}
]
},
{
"id": 5,
"title": "$city — Snapshot metadata",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"description": "Provenance for the current city. Snapshot date is when the source page was crawled; expires_at is when the cache row will be re-scraped (1-year TTL).",
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 5},
"fieldConfig": {
"defaults": {"custom": {"align": "left"}},
"overrides": [
{
"matcher": {"id": "byName", "options": "source_url"},
"properties": [{"id": "links", "value": [{"title": "Open source on ${__data.fields.source_name}", "url": "${__value.text}", "targetBlank": true}]}]
}
]
},
"options": {"showHeader": true},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT city_display, country, source_name, source_url, snapshot_date, fetched_at, expires_at, raw_currency, gbp_per_unit FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1"
}
]
},
{
"id": 6,
"title": "$city — Cost breakdown (single person, monthly)",
"type": "barchart",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"description": "Categories from baseline.py (Phase 1) — 0 for cities where the live scraper hasn't populated per-category yet. Rent and headline totals are always populated.",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 5},
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"orientation": "horizontal", "showValue": "auto", "stacking": "none", "legend": {"displayMode": "list"}, "xTickLabelRotation": 0},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "(SELECT 'rent_center' AS category, rent_1bed_center_gbp AS amount_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1) UNION ALL (SELECT 'rent_outside', rent_1bed_outside_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' AND rent_1bed_outside_gbp IS NOT NULL ORDER BY fetched_at DESC LIMIT 1) UNION ALL (SELECT 'all_other', total_no_rent_gbp FROM fire_planner.col_snapshot WHERE city_slug = '$city' ORDER BY fetched_at DESC LIMIT 1)"
}
]
},
{
"id": 7,
"title": "All cities — ranked by Total monthly (with rent)",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"description": "Sortable table of every city in the cache. Click a column header to sort. Lower = cheaper.",
"gridPos": {"h": 16, "w": 24, "x": 0, "y": 13},
"fieldConfig": {
"defaults": {
"custom": {"align": "left", "displayMode": "auto"},
"decimals": 0
},
"overrides": [
{"matcher": {"id": "byName", "options": "total_with_rent_gbp"}, "properties": [{"id": "unit", "value": "currencyGBP"}, {"id": "custom.displayMode", "value": "gradient-gauge"}, {"id": "color", "value": {"mode": "thresholds"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1500}, {"color": "orange", "value": 2500}, {"color": "red", "value": 3500}]}}]},
{"matcher": {"id": "byName", "options": "total_no_rent_gbp"}, "properties": [{"id": "unit", "value": "currencyGBP"}]},
{"matcher": {"id": "byName", "options": "rent_1bed_center_gbp"}, "properties": [{"id": "unit", "value": "currencyGBP"}]},
{"matcher": {"id": "byName", "options": "ratio_vs_baseline"}, "properties": [{"id": "decimals", "value": 2}, {"id": "custom.displayMode", "value": "color-text"}, {"id": "color", "value": {"mode": "thresholds"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.5}, {"color": "orange", "value": 0.85}, {"color": "red", "value": 1.0}]}}]},
{"matcher": {"id": "byName", "options": "source_url"}, "properties": [{"id": "links", "value": [{"title": "Open source", "url": "${__value.text}", "targetBlank": true}]}]}
]
},
"options": {"showHeader": true, "sortBy": [{"displayName": "total_with_rent_gbp", "desc": false}]},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "WITH latest AS (SELECT DISTINCT ON (city_slug) city_slug, city_display, country, total_no_rent_gbp, total_with_rent_gbp, rent_1bed_center_gbp, source_name, source_url, snapshot_date, fetched_at FROM fire_planner.col_snapshot ORDER BY city_slug, fetched_at DESC), baseline AS (SELECT total_with_rent_gbp AS b FROM fire_planner.col_snapshot WHERE city_slug = '$baseline_city' ORDER BY fetched_at DESC LIMIT 1) SELECT l.city_display, l.country, l.total_with_rent_gbp, l.total_no_rent_gbp, l.rent_1bed_center_gbp, (l.total_with_rent_gbp / b.b)::numeric(8,3) AS ratio_vs_baseline, l.source_name, l.snapshot_date, l.source_url FROM latest l CROSS JOIN baseline b ORDER BY l.total_with_rent_gbp ASC"
}
]
},
{
"id": 8,
"title": "Cache freshness — fetched_at across all cities",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"description": "When each city was last refreshed. Rows turning red are within 30 days of expiry (1-year TTL) — the refresh CronJob should pick them up before they go stale.",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 29},
"fieldConfig": {
"defaults": {"custom": {"align": "left"}},
"overrides": [
{"matcher": {"id": "byName", "options": "expires_in_days"}, "properties": [{"id": "unit", "value": "d"}, {"id": "decimals", "value": 0}, {"id": "custom.displayMode", "value": "color-background-solid"}, {"id": "color", "value": {"mode": "thresholds"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "orange", "value": 30}, {"color": "yellow", "value": 90}, {"color": "green", "value": 180}]}}]}
]
},
"options": {"showHeader": true, "sortBy": [{"displayName": "expires_in_days", "desc": false}]},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT DISTINCT ON (city_slug) city_display, country, source_name, snapshot_date, fetched_at, expires_at, EXTRACT(DAY FROM expires_at - NOW())::int AS expires_in_days FROM fire_planner.col_snapshot ORDER BY city_slug, fetched_at DESC"
}
]
}
],
"refresh": "",
"schemaVersion": 39,
"tags": ["fire-planner", "col", "cost-of-living"],
"time": {"from": "now-30d", "to": "now"},
"timepicker": {},
"timezone": "Europe/London",
"title": "Cost of Living",
"uid": "fire-col",
"version": 1
}

View file

@ -0,0 +1,226 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {"type": "datasource", "uid": "grafana"},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "FIRE Retirement Planner — risk-adjusted, tax-minimised Monte Carlo over jurisdictions, withdrawal strategies, and UK-departure years. Backed by fire_planner schema on pg-cluster-rw.",
"editable": true,
"fiscalYearStartMonth": 0,
"id": null,
"templating": {
"list": [
{
"name": "scenario",
"type": "query",
"label": "Scenario",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"query": "SELECT external_id FROM fire_planner.scenario ORDER BY external_id",
"refresh": 1,
"includeAll": false,
"multi": false,
"current": {"selected": false, "text": "cyprus-vpw-leave-y3-glide-rising", "value": "cyprus-vpw-leave-y3-glide-rising"}
}
]
},
"links": [],
"panels": [
{
"id": 1,
"title": "Net worth over time (real + nominal)",
"type": "timeseries",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0},
"fieldConfig": {
"defaults": {"unit": "currencyGBP", "decimals": 0},
"overrides": []
},
"options": {"legend": {"displayMode": "table", "showLegend": true}, "tooltip": {"mode": "multi"}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "time_series",
"rawSql": "SELECT snapshot_date AS time, account_name AS metric, SUM(market_value_gbp) AS value FROM fire_planner.account_snapshot WHERE snapshot_date >= NOW() - INTERVAL '10 years' GROUP BY snapshot_date, account_name ORDER BY snapshot_date"
}
]
},
{
"id": 2,
"title": "Monte Carlo fan chart — selected scenario",
"type": "timeseries",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 8},
"description": "P10/p25/p50/p75/p90 portfolio value across MC paths, for the scenario picked in the selector at the top.",
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"legend": {"displayMode": "table", "showLegend": true}, "tooltip": {"mode": "multi"}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "time_series",
"rawSql": "SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval) AS time, 'p10' AS metric, p10_portfolio_gbp AS value FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p25', p25_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p50', p50_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p75', p75_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p90', p90_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' ORDER BY time"
}
]
},
{
"id": 3,
"title": "Confidence heatmap — jurisdiction × strategy",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 18},
"description": "Success rate by (jurisdiction, strategy), averaged across leave-UK years and glide paths.",
"fieldConfig": {
"defaults": {"custom": {"align": "left", "displayMode": "auto"}, "unit": "percentunit", "decimals": 2},
"overrides": []
},
"options": {"showHeader": true},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT jurisdiction, strategy, AVG(success_rate) AS avg_success FROM fire_planner.scenario_summary GROUP BY jurisdiction, strategy ORDER BY jurisdiction, strategy"
}
]
},
{
"id": 4,
"title": "Median lifetime tax — by jurisdiction",
"type": "barchart",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 18},
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"orientation": "horizontal", "showValue": "auto", "stacking": "none", "legend": {"displayMode": "list"}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT jurisdiction, AVG(median_lifetime_tax_gbp) AS lifetime_tax FROM fire_planner.scenario_summary GROUP BY jurisdiction ORDER BY lifetime_tax DESC"
}
]
},
{
"id": 5,
"title": "Withdrawal runway — years to ruin (failing paths)",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 26},
"description": "Among scenarios where some MC paths failed, the median year-to-ruin. Empty where every path survives.",
"fieldConfig": {"defaults": {"unit": "y", "decimals": 1}, "overrides": []},
"options": {"showHeader": true},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT jurisdiction, strategy, leave_uk_year, glide_path, median_years_to_ruin FROM fire_planner.scenario_summary WHERE median_years_to_ruin IS NOT NULL ORDER BY median_years_to_ruin ASC LIMIT 20"
}
]
},
{
"id": 6,
"title": "Optimal leave-UK year",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 26},
"description": "leave_uk_year that maximises success_rate - lifetime_tax (tax in £M; small weighting).",
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT leave_uk_year FROM fire_planner.scenario_summary WHERE jurisdiction <> 'uk' ORDER BY (success_rate - median_lifetime_tax_gbp / 1000000.0) DESC LIMIT 1"
}
]
},
{
"id": 7,
"title": "Median ending wealth — selected scenario",
"type": "stat",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 26},
"fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []},
"options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT p50_ending_gbp FROM fire_planner.scenario_summary WHERE scenario_id = (SELECT id FROM fire_planner.scenario WHERE external_id = '$scenario')"
}
]
},
{
"id": 8,
"title": "Success rate vs spend (UK-stay)",
"type": "barchart",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 30},
"description": "Sanity gauge — UK success rate by strategy, helps anchor expectations against published cFIREsim numbers.",
"fieldConfig": {"defaults": {"unit": "percentunit", "decimals": 2}, "overrides": []},
"options": {"orientation": "horizontal", "showValue": "auto", "legend": {"displayMode": "list"}},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT strategy, AVG(success_rate) AS success FROM fire_planner.scenario_summary WHERE jurisdiction = 'uk' GROUP BY strategy ORDER BY success DESC"
}
]
},
{
"id": 9,
"title": "Sequence-of-returns sensitivity (top failing scenarios)",
"type": "table",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 30},
"description": "Pearson correlation between year-1 portfolio drawdown and overall success — strongly negative ⇒ scenario is sequence-of-returns sensitive (case for the rising-equity glide).",
"fieldConfig": {"defaults": {"unit": "none", "decimals": 4}, "overrides": []},
"options": {"showHeader": true},
"targets": [
{
"refId": "A",
"datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"},
"rawQuery": true,
"editorMode": "code",
"format": "table",
"rawSql": "SELECT s.external_id, r.sequence_risk_correlation, r.success_rate FROM fire_planner.mc_run r JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE r.id IN (SELECT MAX(id) FROM fire_planner.mc_run GROUP BY scenario_id) ORDER BY r.sequence_risk_correlation ASC LIMIT 15"
}
]
}
],
"schemaVersion": 39,
"tags": ["finance", "fire", "retirement", "monte-carlo"],
"title": "FIRE Planner",
"uid": "fire-planner",
"version": 1,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,465 @@
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Newly-ingested roles (by fetched_at).",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "bars",
"fillOpacity": 60,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"showPoints": "auto",
"spanNulls": false,
"stacking": {"mode": "none"}
},
"thresholds": {"mode": "absolute", "steps": []}
},
"overrides": []
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "single", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "time_series",
"rawQuery": true,
"rawSql": "SELECT date_trunc('day', fetched_at) AT TIME ZONE 'UTC' AS time, source, COUNT(*) AS value FROM job_hunter.roles WHERE $__timeFilter(fetched_at) AND primary_location IN (${location:sqlstring}) GROUP BY 1, 2 ORDER BY 1",
"refId": "A"
}
],
"title": "New roles per day by source",
"type": "timeseries"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Distinct open roles by source over the time window.",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"hideFrom": {"legend": false, "tooltip": false, "viz": false}},
"mappings": []
},
"overrides": []
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"options": {
"legend": {"displayMode": "table", "placement": "right", "showLegend": true, "values": ["value"]},
"pieType": "donut",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"tooltip": {"mode": "single", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT source AS metric, COUNT(DISTINCT dedup_key) AS value FROM job_hunter.roles WHERE $__timeFilter(fetched_at) AND primary_location IN (${location:sqlstring}) GROUP BY source ORDER BY value DESC",
"refId": "A"
}
],
"title": "Roles by source (deduplicated)",
"type": "piechart"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Top 20 companies by recent role volume.",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"axisPlacement": "auto",
"fillOpacity": 80,
"gradientMode": "none",
"lineWidth": 1
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": []}
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 8},
"id": 3,
"options": {
"barRadius": 0,
"barWidth": 0.85,
"fullHighlight": false,
"groupWidth": 0.7,
"legend": {"displayMode": "hidden", "placement": "bottom", "showLegend": false},
"orientation": "horizontal",
"showValue": "auto",
"stacking": "none",
"tooltip": {"mode": "single", "sort": "none"},
"xTickLabelRotation": 0,
"xTickLabelSpacing": 0
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT c.display_name, COUNT(*) AS roles FROM job_hunter.roles r JOIN job_hunter.companies c ON r.company_id = c.id WHERE $__timeFilter(r.fetched_at) AND r.primary_location IN (${location:sqlstring}) GROUP BY c.display_name ORDER BY roles DESC LIMIT 20",
"refId": "A"
}
],
"title": "Top companies by role volume",
"type": "barchart"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Normalised base salary distribution (£) for roles with explicit comp.",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"fillOpacity": 80, "lineWidth": 1},
"mappings": []
},
"overrides": []
},
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 8},
"id": 4,
"options": {
"bucketOffset": 0,
"combine": false,
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT parsed_base_gbp::float AS base_gbp FROM job_hunter.roles WHERE parsed_base_gbp IS NOT NULL AND $__timeFilter(fetched_at) AND primary_location IN (${location:sqlstring})",
"refId": "A"
}
],
"title": "Salary distribution (GBP)",
"type": "histogram"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Recent roles, ranked by salary-parse confidence then parsed base.",
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {
"align": "auto",
"cellOptions": {"type": "auto"},
"filterable": true,
"inspect": false
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": []}
},
"overrides": [
{
"matcher": {"id": "byName", "options": "apply_url"},
"properties": [
{"id": "custom.cellOptions", "value": {"type": "auto"}},
{"id": "links", "value": [{"targetBlank": true, "title": "Open", "url": "${__value.raw}"}]}
]
},
{
"matcher": {"id": "byName", "options": "base_gbp"},
"properties": [{"id": "unit", "value": "currencyGBP"}]
}
]
},
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 17},
"id": 5,
"options": {
"cellHeight": "sm",
"footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false},
"showHeader": true
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT r.posted_at, c.display_name AS company, r.title, r.location, r.remote_policy, r.parsed_base_gbp::float AS base_gbp, r.salary_parse_confidence, r.source, r.apply_url FROM job_hunter.roles r JOIN job_hunter.companies c ON r.company_id = c.id WHERE $__timeFilter(r.fetched_at) AND r.primary_location IN (${location:sqlstring}) ORDER BY r.salary_parse_confidence DESC NULLS LAST, r.parsed_base_gbp DESC NULLS LAST, r.posted_at DESC NULLS LAST LIMIT 100",
"refId": "A"
}
],
"title": "Top roles",
"type": "table"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Per-company median base salary broken out by seniority level (comp_points, GBP).",
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {
"align": "auto",
"cellOptions": {"type": "auto"},
"filterable": true,
"inspect": false
},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": []},
"unit": "currencyGBP"
},
"overrides": []
},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 29},
"id": 6,
"options": {
"cellHeight": "sm",
"footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false},
"showHeader": true
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT c.display_name AS company, l.slug AS level, percentile_cont(0.5) WITHIN GROUP (ORDER BY cp.base_gbp) AS p50_base_gbp, COUNT(*) AS n FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) AND (c.slug IN (${company:sqlstring}) OR 'all' IN (${company:sqlstring})) GROUP BY c.display_name, l.slug ORDER BY c.display_name, MIN(l.rank) NULLS LAST",
"refId": "A"
}
],
"title": "Per-company salary by level (p50 base)",
"type": "table"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "p50 total comp (base + bonus + RSU/year + sign-on/year) per (company, level).",
"fieldConfig": {
"defaults": {
"color": {"mode": "continuous-GrYlRd"},
"custom": {"align": "center", "cellOptions": {"type": "color-background"}},
"unit": "currencyGBP"
},
"overrides": []
},
"gridPos": {"h": 10, "w": 12, "x": 0, "y": 39},
"id": 7,
"options": {
"cellHeight": "sm",
"footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false},
"showHeader": true
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "table",
"rawQuery": true,
"rawSql": "SELECT c.display_name AS company, l.slug AS level, percentile_cont(0.5) WITHIN GROUP (ORDER BY COALESCE(cp.base_gbp, 0) + COALESCE(cp.bonus_gbp, 0) + COALESCE(cp.rsu_annual_gbp, 0) + COALESCE(cp.signon_gbp, 0)) AS p50_total_gbp FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) GROUP BY c.display_name, l.slug ORDER BY c.display_name",
"refId": "A"
}
],
"title": "Total comp heatmap (p50, GBP)",
"type": "table"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "Comp-datapoint ingestion volume by source.",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "bars",
"fillOpacity": 60,
"lineWidth": 1,
"stacking": {"mode": "normal"}
}
},
"overrides": []
},
"gridPos": {"h": 10, "w": 12, "x": 12, "y": 39},
"id": 8,
"options": {
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "single", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "time_series",
"rawQuery": true,
"rawSql": "SELECT date_trunc('day', fetched_at) AT TIME ZONE 'UTC' AS time, source, COUNT(*) AS value FROM job_hunter.comp_points WHERE $__timeFilter(fetched_at) GROUP BY 1, 2 ORDER BY 1",
"refId": "A"
}
],
"title": "Comp-point volume by source",
"type": "timeseries"
},
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"description": "p50 base salary trend by (company, level) for top 5 companies.",
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 6,
"showPoints": "auto"
},
"unit": "currencyGBP"
},
"overrides": []
},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 49},
"id": 9,
"options": {
"legend": {"displayMode": "table", "placement": "right", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"format": "time_series",
"rawQuery": true,
"rawSql": "WITH ranked AS (SELECT c.slug AS company_slug, COUNT(*) AS n FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) GROUP BY c.slug ORDER BY n DESC LIMIT 5) SELECT date_trunc('month', cp.effective_date)::timestamp AS time, c.display_name || ' / ' || COALESCE(l.slug, 'unknown') AS metric, percentile_cont(0.5) WITHIN GROUP (ORDER BY cp.base_gbp) AS value FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.effective_date IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) AND c.slug IN (SELECT company_slug FROM ranked) AND (l.slug = ${level:sqlstring} OR ${level:sqlstring} = 'all') GROUP BY 1, 2 ORDER BY 1",
"refId": "A"
}
],
"title": "Base-salary trend (p50) — top 5 companies",
"type": "timeseries"
},
{
"id": 10,
"type": "barchart",
"title": "Your comp vs the market \u2014 London p50 total comp (GBP)",
"description": "Per-company London median total comp (COALESCE total/base) ranked vs your current TC. 'Me (Meta IC5)' is a labeled data point in the DB (source='self'), not a hardcoded dashboard value.",
"datasource": {
"type": "grafana-postgresql-datasource",
"uid": "job-hunter-pg"
},
"gridPos": {
"h": 14,
"w": 24,
"x": 0,
"y": 59
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisPlacement": "auto",
"fillOpacity": 85,
"gradientMode": "none",
"lineWidth": 1,
"axisCenteredZero": false
},
"mappings": [],
"unit": "currencyGBP",
"thresholds": {
"mode": "absolute",
"steps": []
}
},
"overrides": []
},
"options": {
"orientation": "horizontal",
"xField": "company",
"colorByField": "who",
"showValue": "auto",
"stacking": "none",
"barRadius": 0,
"barWidth": 0.85,
"groupWidth": 0.9,
"fullHighlight": false,
"legend": {
"showLegend": true,
"displayMode": "list",
"placement": "bottom",
"calcs": []
},
"tooltip": {
"mode": "single",
"sort": "none"
},
"xTickLabelRotation": 0,
"xTickLabelSpacing": 0
},
"targets": [
{
"datasource": {
"type": "grafana-postgresql-datasource",
"uid": "job-hunter-pg"
},
"format": "table",
"rawQuery": true,
"refId": "A",
"rawSql": "SELECT c.display_name AS company, CASE WHEN bool_or(cp.source = 'self') THEN 'You' ELSE 'Market' END AS who, percentile_cont(0.5) WITHIN GROUP (ORDER BY COALESCE(cp.total_gbp, cp.base_gbp)) AS \"p50_gbp\" FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON c.id = cp.company_id WHERE cp.location_bucket IN (${location:sqlstring}) AND COALESCE(cp.total_gbp, cp.base_gbp) IS NOT NULL GROUP BY c.display_name ORDER BY \"p50_gbp\" DESC"
}
]
}
],
"refresh": "",
"schemaVersion": 39,
"tags": ["job-hunter", "jobs", "careers"],
"templating": {"list": [
{
"current": {"selected": true, "text": ["london"], "value": ["london"]},
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"definition": "SELECT DISTINCT location_bucket FROM job_hunter.comp_points ORDER BY 1",
"includeAll": false,
"label": "Location",
"multi": true,
"name": "location",
"options": [],
"query": "SELECT DISTINCT location_bucket FROM job_hunter.comp_points ORDER BY 1",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {"selected": true, "text": "senior", "value": "senior"},
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"definition": "SELECT slug FROM job_hunter.levels WHERE company_id IS NULL ORDER BY rank",
"includeAll": true,
"allValue": "all",
"label": "Level",
"multi": false,
"name": "level",
"options": [],
"query": "SELECT slug FROM job_hunter.levels WHERE company_id IS NULL ORDER BY rank",
"refresh": 1,
"regex": "",
"type": "query"
},
{
"current": {"selected": true, "text": "all", "value": "all"},
"datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"},
"definition": "SELECT slug FROM job_hunter.companies ORDER BY slug",
"includeAll": true,
"allValue": "all",
"label": "Company",
"multi": true,
"name": "company",
"options": [],
"query": "SELECT slug FROM job_hunter.companies ORDER BY slug",
"refresh": 1,
"regex": "",
"type": "query"
}
]},
"time": {"from": "now-30d", "to": "now"},
"timepicker": {},
"timezone": "browser",
"title": "Job Hunter",
"uid": "job-hunter",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,204 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Kubernetes API server audit logs from Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 0,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Recent Activity",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "Recent Kubernetes API actions from audit logs",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 1 },
"id": 1,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "{job=\"kubernetes-audit\"} | json | line_format \"{{.user_username}} {{.verb}} {{.objectRef_resource}} {{.objectRef_namespace}}\"",
"legendFormat": "",
"queryType": "range",
"refId": "A"
}
],
"title": "Recent Actions",
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"id": 101,
"panels": [],
"title": "Request Rates",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API request count by user over time",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
"id": 2,
"options": {
"legend": { "calcs": ["sum", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "sum by (user_username) (count_over_time({job=\"kubernetes-audit\"} | json [5m]))",
"legendFormat": "{{user_username}}",
"queryType": "range",
"refId": "A"
}
],
"title": "Request Count by User",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
"id": 102,
"panels": [],
"title": "Denied Requests",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API requests denied with HTTP 403+ status codes",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 403 }
]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 },
"id": 3,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "{job=\"kubernetes-audit\"} | json | responseStatus_code >= 403",
"legendFormat": "",
"queryType": "range",
"refId": "A"
}
],
"title": "Denied Requests (403+)",
"type": "table"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": ["kubernetes", "audit", "security"],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Kubernetes Audit Logs",
"uid": "k8s-audit",
"version": 1
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,288 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Logs collected from Kubernetes, stored in Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 15141,
"graphTooltip": 0,
"id": 25,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "sum(count_over_time({namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\" [$__interval]))",
"instant": false,
"legendFormat": "Log count",
"queryType": "range",
"range": true,
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "Logs from services running in Kubernetes",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "{namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\"",
"queryType": "range",
"refId": "A"
}
],
"type": "logs"
}
],
"refresh": "5s",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "",
"value": ""
},
"description": "String to search for",
"hide": 0,
"label": "Search Query",
"name": "query",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"skipUrlSync": false,
"type": "textbox"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"dbaas"
],
"value": [
"dbaas"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(stream)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "stream",
"options": [],
"query": "label_values(stream)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(container)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "container",
"options": [],
"query": "label_values(container)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Loki Kubernetes Logs",
"uid": "o6-BGgnnk",
"version": 2,
"weekStart": ""
}

View file

@ -0,0 +1,146 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Network traffic monitoring via GoFlow2 NetFlow + DNS anomaly detection + CrowdSec",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"id": 1, "title": "GoFlow2 Status", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, "mappings": [{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } }] } },
"targets": [{ "expr": "up{job=\"goflow2\"}", "legendFormat": "GoFlow2", "refId": "A" }]
},
{
"id": 2, "title": "NetFlow Bytes/s", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "Bps", "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 10485760 }, { "color": "red", "value": 104857600 }] } } },
"targets": [{ "expr": "sum(rate(goflow2_flow_traffic_bytes_total[5m]))", "legendFormat": "Total", "refId": "A" }]
},
{
"id": 3, "title": "Flows/s", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "short", "decimals": 1, "thresholds": { "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 0.1 }] } } },
"targets": [{ "expr": "sum(rate(goflow2_flow_process_nf_total[5m]))", "legendFormat": "flows/s", "refId": "A" }]
},
{
"id": 4, "title": "CrowdSec Decisions", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "red", "value": 200 }] } } },
"targets": [{ "expr": "cs_active_decisions", "legendFormat": "Decisions", "refId": "A" }]
},
{
"id": 5, "title": "DNS Queries (Last Hour)", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 5000 }, { "color": "red", "value": 20000 }] } } },
"targets": [{ "expr": "dns_anomaly_total_queries", "legendFormat": "Queries", "refId": "A" }]
},
{
"id": 6, "title": "DNS DGA Suspects", "type": "stat",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 5 }] } } },
"targets": [{ "expr": "dns_anomaly_dga_suspects", "legendFormat": "DGA Suspects", "refId": "A" }]
},
{
"id": 7, "title": "NetFlow Bytes/s Over Time", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"fieldConfig": { "defaults": { "unit": "Bps", "custom": { "fillOpacity": 20, "lineWidth": 2, "drawStyle": "line", "showPoints": "never" } } },
"targets": [{ "expr": "sum(rate(goflow2_flow_traffic_bytes_total[5m]))", "legendFormat": "NetFlow Bytes/s", "refId": "A" }]
},
{
"id": 8, "title": "NetFlow Flows/s (by version)", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 15, "lineWidth": 2, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "rate(goflow2_flow_process_nf_total[5m])", "legendFormat": "v{{version}} flows/s", "refId": "A" },
{ "expr": "rate(goflow2_flow_decoder_error_total[5m])", "legendFormat": "Decoder errors/s", "refId": "B" },
{ "expr": "rate(goflow2_flow_process_nf_errors_total[5m])", "legendFormat": "Errors/s ({{error}})", "refId": "C" }
]
},
{
"id": 9, "title": "NetFlow Processing Delay", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10, "lineWidth": 1, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "goflow2_flow_process_nf_delay_seconds{quantile=\"0.5\"}", "legendFormat": "p50 delay", "refId": "A" },
{ "expr": "goflow2_flow_process_nf_delay_seconds{quantile=\"0.9\"}", "legendFormat": "p90 delay", "refId": "B" },
{ "expr": "goflow2_flow_process_nf_delay_seconds{quantile=\"0.99\"}", "legendFormat": "p99 delay", "refId": "C" }
]
},
{
"id": 10, "title": "CrowdSec Alerts & Decisions", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, "lineWidth": 2, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "cs_active_decisions", "legendFormat": "Active Decisions", "refId": "A" },
{ "expr": "cs_alerts", "legendFormat": "Total Alerts", "refId": "B" },
{ "expr": "sum(rate(cs_lapi_route_requests_total[5m]))", "legendFormat": "LAPI req/s", "refId": "C" }
]
},
{
"id": 11, "title": "CrowdSec LAPI Latency", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"fieldConfig": { "defaults": { "unit": "s", "custom": { "fillOpacity": 10, "lineWidth": 1, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "histogram_quantile(0.50, sum(rate(cs_lapi_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" },
{ "expr": "histogram_quantile(0.99, sum(rate(cs_lapi_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p99", "refId": "B" }
]
},
{
"id": 12, "title": "NetFlow Flowset Records/s", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"fieldConfig": { "defaults": { "unit": "short", "custom": { "fillOpacity": 15, "lineWidth": 1, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "rate(goflow2_flow_process_nf_flowset_records_total[5m])", "legendFormat": "{{type}} (v{{version}})", "refId": "A" }
]
},
{
"id": 13, "title": "DNS Metrics Over Time", "type": "timeseries",
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"fieldConfig": { "defaults": { "custom": { "fillOpacity": 15, "lineWidth": 2, "drawStyle": "line", "showPoints": "never" } } },
"targets": [
{ "expr": "dns_anomaly_total_queries", "legendFormat": "Total Queries", "refId": "A" },
{ "expr": "dns_anomaly_nx_domain", "legendFormat": "NX Domain", "refId": "B" },
{ "expr": "dns_anomaly_server_failure", "legendFormat": "SERVFAIL", "refId": "C" },
{ "expr": "dns_anomaly_blocked", "legendFormat": "Blocked", "refId": "D" },
{ "expr": "dns_anomaly_dga_suspects", "legendFormat": "DGA Suspects", "refId": "E" }
]
}
],
"schemaVersion": 39,
"tags": ["network", "security", "goflow2", "dns", "crowdsec"],
"templating": { "list": [] },
"time": { "from": "now-6h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Network Traffic & Adversary Detection",
"uid": "network-traffic-adversary",
"version": 4
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,816 @@
{
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "This dashboard displays metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 0
},
"id": 12,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"instant": false,
"interval": "",
"legendFormat": "GPU 0",
"refId": "A"
}
],
"title": "GPU Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 14,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Current Temp",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Power Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 16,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE)",
"instant": true,
"interval": "",
"legendFormat": "",
"range": false,
"refId": "A"
}
],
"title": "GPU Power Total",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decmbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 18,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Framebuffer Mem Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 2,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_SM_CLOCK* 1000000",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU SM Clocks",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 19,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum by (namespace) (gpu_pod_memory_used_bytes)",
"instant": false,
"legendFormat": "{{namespace}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Memory per Application",
"type": "timeseries"
}
],
"preload": false,
"refresh": "auto",
"schemaVersion": 42,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-12h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "Oxed_c6Wz",
"version": 9
}

View file

@ -0,0 +1,476 @@
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"refresh": "30s",
"schemaVersion": 38,
"tags": ["openclaw", "ai", "codex"],
"time": {"from": "now-6h", "to": "now"},
"timepicker": {},
"timezone": "",
"title": "OpenClaw — Codex Usage",
"uid": "openclaw-codex",
"version": 1,
"panels": [
{
"type": "row",
"id": 100,
"title": "Now",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"collapsed": false,
"panels": []
},
{
"type": "stat",
"id": 1,
"title": "Messages last 5h — gpt-5.4-mini",
"description": "Plus rate-card lower bound: 1,200 / 5h. Hard cap at the upper bound: 7,000 / 5h.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 960},
{"color": "orange", "value": 1500},
{"color": "red", "value": 5600}
]
},
"unit": "short"
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h]))",
"refId": "A"
}
]
},
{
"type": "gauge",
"id": 2,
"title": "% of Plus 5h floor (1,200 cap)",
"description": "Conservative gauge against the lower bound of the published rate-card. Real ceiling depends on dynamic allocation (1,200–7,000). Re-baseline if you observe throttling at <80%.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 1},
"options": {
"orientation": "auto",
"showThresholdLabels": false,
"showThresholdMarkers": true,
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"max": 100,
"decimals": 1,
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 60},
{"color": "orange", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "100 * sum(increase(openclaw_codex_messages_total{provider=\"openai-codex\",model=\"gpt-5.4-mini\"}[5h])) / 1200",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 3,
"title": "Tokens last 5h (input + output, codex)",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": "short",
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5h]))",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 4,
"title": "Cache hit ratio (codex, 5h)",
"description": "cacheRead / (cacheRead + input). Higher is better — caching cuts effective Plus quota burn.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 1},
"options": {
"colorMode": "value",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"max": 100,
"decimals": 1,
"unit": "percent",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 30},
{"color": "green", "value": 60}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "100 * sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])) / clamp_min(sum(increase(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5h])) + sum(increase(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5h])), 1)",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 5,
"title": "OAuth token expiry",
"description": "Days until the openai-codex OAuth token expires. Re-run `openclaw models auth login --provider openai-codex` before this hits 0.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 1,
"unit": "d",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": null},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 3},
{"color": "green", "value": 5}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "max(openclaw_codex_oauth_expiry_seconds{provider=\"openai-codex\"}) / 86400",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 6,
"title": "Active sessions",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 6},
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": true},
"textMode": "value_and_name"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "openclaw_codex_active_sessions",
"legendFormat": "{{kind}}",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 7,
"title": "Last assistant turn",
"description": "Time since the latest assistant message landed in any session.",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 12, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1800},
{"color": "orange", "value": 7200},
{"color": "red", "value": 86400}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "time() - openclaw_codex_last_run_timestamp",
"refId": "A"
}
]
},
{
"type": "stat",
"id": 8,
"title": "Errors last 24h",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 5, "w": 6, "x": 18, "y": 6},
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1},
{"color": "red", "value": 10}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(increase(openclaw_codex_message_errors_total[24h]))",
"refId": "A"
}
]
},
{
"type": "row",
"id": 200,
"title": "Over time",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 11},
"collapsed": false,
"panels": []
},
{
"type": "timeseries",
"id": 10,
"title": "Messages / min by model",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 12},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "bars",
"fillOpacity": 60,
"lineWidth": 1,
"stacking": {"mode": "normal"}
},
"unit": "short"
}
},
"options": {
"legend": {"displayMode": "table", "placement": "right", "showLegend": true, "calcs": ["sum"]},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model) (rate(openclaw_codex_messages_total[1m])) * 60",
"legendFormat": "{{provider}}/{{model}}",
"refId": "A"
}
]
},
{
"type": "timeseries",
"id": 11,
"title": "Tokens / min by type (codex)",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {
"drawStyle": "line",
"fillOpacity": 25,
"lineWidth": 2,
"stacking": {"mode": "none"}
},
"unit": "short"
}
},
"options": {
"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "desc"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_input_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "input",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_output_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "output",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum(rate(openclaw_codex_cache_read_tokens_total{provider=\"openai-codex\"}[5m])) * 60",
"legendFormat": "cache_read",
"refId": "C"
}
]
},
{
"type": "bargauge",
"id": 12,
"title": "Messages / 5h by model",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"showUnfilled": true,
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}
},
"fieldConfig": {
"defaults": {
"min": 0,
"decimals": 0,
"unit": "short",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 100},
{"color": "orange", "value": 500},
{"color": "red", "value": 1000}
]
}
}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model) (increase(openclaw_codex_messages_total[5h]))",
"legendFormat": "{{provider}}/{{model}}",
"refId": "A"
}
]
},
{
"type": "row",
"id": 300,
"title": "Errors",
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 28},
"collapsed": false,
"panels": []
},
{
"type": "table",
"id": 20,
"title": "Recent errors by model and reason",
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 29},
"options": {
"showHeader": true
},
"fieldConfig": {
"defaults": {
"custom": {"align": "auto", "displayMode": "auto"}
},
"overrides": [
{
"matcher": {"id": "byName", "options": "Value"},
"properties": [
{"id": "displayName", "value": "Errors (24h)"},
{"id": "custom.displayMode", "value": "color-background"},
{
"id": "thresholds",
"value": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1},
{"color": "red", "value": 10}
]
}
}
]
}
]
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
"expr": "sum by (provider, model, reason) (increase(openclaw_codex_message_errors_total[24h])) > 0",
"format": "table",
"instant": true,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true, "namespace": true, "pod": true, "app": true},
"indexByName": {"provider": 0, "model": 1, "reason": 2, "Value": 3},
"renameByName": {}
}
}
]
}
]
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,681 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"title": "MAM Tracker",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 0.8 },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 1 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_ratio{tracker=\"mam\"}",
"legendFormat": "MAM Ratio"
}
],
"title": "MAM Ratio",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "decbytes" }
},
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_uploaded_bytes{tracker=\"mam\"}",
"legendFormat": "Uploaded"
},
{
"expr": "qbt_tracker_downloaded_bytes{tracker=\"mam\"}",
"legendFormat": "Downloaded"
}
],
"title": "MAM Transfer",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_torrents_total{tracker=\"mam\"}",
"legendFormat": "Total"
},
{
"expr": "qbt_tracker_seeding{tracker=\"mam\"}",
"legendFormat": "Seeding"
},
{
"expr": "qbt_tracker_downloading{tracker=\"mam\"}",
"legendFormat": "Downloading"
}
],
"title": "MAM Torrents",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 15 },
{ "color": "red", "value": 20 }
]
},
"unit": "short"
}
},
"gridPos": { "h": 6, "w": 4, "x": 14, "y": 1 },
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_unsatisfied{tracker=\"mam\"}",
"legendFormat": "Unsatisfied (<72h seed)"
}
],
"title": "MAM Unsatisfied",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "red", "text": "Disconnected" }, "1": { "color": "green", "text": "Connected" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 6, "w": 3, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_connected",
"legendFormat": "Connection"
}
],
"title": "Connection",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 3, "x": 21, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_dht_nodes",
"legendFormat": "DHT Nodes"
}
],
"title": "DHT Nodes",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true,
"thresholdsStyle": { "mode": "line" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
"id": 7,
"options": {
"legend": { "calcs": ["lastNotNull", "min"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_ratio{tracker=\"mam\"}",
"legendFormat": "MAM Ratio"
}
],
"title": "MAM Ratio Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true
},
"unit": "decbytes"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
"id": 8,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_uploaded_bytes{tracker=\"mam\"}",
"legendFormat": "MAM Uploaded"
},
{
"expr": "qbt_tracker_downloaded_bytes{tracker=\"mam\"}",
"legendFormat": "MAM Downloaded"
}
],
"title": "MAM Cumulative Transfer",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 },
"id": 101,
"title": "All Trackers Breakdown",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 0.8 },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 },
"id": 9,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_ratio",
"legendFormat": "{{tracker}}"
}
],
"title": "Ratio by Tracker",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true,
"thresholdsStyle": { "mode": "line" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"id": 10,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_ratio",
"legendFormat": "{{tracker}}"
}
],
"title": "Ratio by Tracker Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "bars",
"fillOpacity": 80,
"lineWidth": 1,
"stacking": { "mode": "normal" }
},
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"id": 11,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_torrents_total",
"legendFormat": "{{tracker}} total"
},
{
"expr": "qbt_tracker_seeding",
"legendFormat": "{{tracker}} seeding"
}
],
"title": "Torrents by Tracker",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
"id": 102,
"title": "Transfer Speeds",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Download" },
"properties": [
{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } },
{ "id": "custom.transform", "value": "negative-Y" }
]
},
{
"matcher": { "id": "byName", "options": "Upload" },
"properties": [
{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 },
"id": 12,
"options": {
"legend": { "calcs": ["mean", "max", "lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_ul_speed_bytes",
"legendFormat": "Upload"
},
{
"expr": "qbt_dl_speed_bytes",
"legendFormat": "Download"
}
],
"title": "Transfer Speed (Global)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 },
"id": 103,
"title": "MAM Profile (from jsonLoad.php)",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "type": "value", "options": {
"0": { "color": "red", "text": "Mouse" },
"1": { "color": "orange", "text": "Vole" },
"2": { "color": "yellow", "text": "User" },
"3": { "color": "green", "text": "Power User" },
"4": { "color": "green", "text": "Elite" },
"5": { "color": "blue", "text": "Torrent Master" },
"6": { "color": "blue", "text": "Power TM" },
"7": { "color": "purple", "text": "Elite TM" },
"8": { "color": "purple", "text": "VIP" }
} }
],
"thresholds": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 2 }
] }
}
},
"gridPos": { "h": 6, "w": 4, "x": 0, "y": 40 },
"id": 20,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "expr": "mam_class_code", "legendFormat": "Class" }],
"title": "MAM Class",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 0.8 },
{ "color": "green", "value": 1.2 }
] },
"decimals": 3
}
},
"gridPos": { "h": 6, "w": 4, "x": 4, "y": 40 },
"id": 21,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "expr": "mam_ratio", "legendFormat": "Ratio" }],
"title": "MAM Ratio (profile)",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "short",
"thresholds": { "mode": "absolute", "steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 5000 }
] }
}
},
"gridPos": { "h": 6, "w": 4, "x": 8, "y": 40 },
"id": 22,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [{ "expr": "mam_bp_balance", "legendFormat": "BP" }],
"title": "MAM Bonus Points",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": { "defaults": { "unit": "decbytes" } },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 40 },
"id": 23,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{ "expr": "mam_downloaded_bytes", "legendFormat": "Downloaded" },
{ "expr": "mam_uploaded_bytes", "legendFormat": "Uploaded" }
],
"title": "MAM Transfer (profile)",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true,
"thresholdsStyle": { "mode": "line" }
},
"thresholds": { "mode": "absolute", "steps": [
{ "color": "transparent", "value": null },
{ "color": "orange", "value": 500 }
] },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 46 },
"id": 24,
"options": {
"legend": { "calcs": ["lastNotNull", "min"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{ "expr": "mam_bp_balance", "legendFormat": "BP Balance" },
{ "expr": "mam_bp_needed_gib * 500", "legendFormat": "Next-run cost (BP)" }
],
"title": "BP Balance vs Reserve",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "bars",
"fillOpacity": 80,
"lineWidth": 1,
"stacking": { "mode": "normal" }
},
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 46 },
"id": 25,
"options": {
"legend": { "calcs": ["lastNotNull", "sum"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "mam_janitor_deleted_per_run",
"legendFormat": "{{reason}}"
}
],
"title": "Janitor Deletions per Run (by reason)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 54 },
"id": 26,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{ "expr": "mam_janitor_preserved_hnr", "legendFormat": "Preserved (H&R <72h)" },
{ "expr": "mam_janitor_skipped_active", "legendFormat": "Skipped (in-progress)" },
{ "expr": "mam_janitor_dry_run", "legendFormat": "Dry-run mode" }
],
"title": "Janitor State",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 54 },
"id": 27,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{ "expr": "mam_farming_grabbed", "legendFormat": "Last run grabbed" },
{ "expr": "mam_farming_total_seeding", "legendFormat": "Total in farming" },
{ "expr": "sum by (reason) (mam_grabber_skipped_reason)", "legendFormat": "Grabber skipped: {{reason}}" }
],
"title": "Grabber State",
"type": "stat"
}
],
"refresh": "1m",
"schemaVersion": 39,
"tags": ["qbittorrent", "torrents", "mam"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "Prometheus", "value": "prometheus" },
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
}
]
},
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "qBittorrent - Seeding & Ratio",
"uid": "qbittorrent-mam",
"version": 1
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,230 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "rpi-sofia (Raspberry Pi 3, Sofia home site) — health + forensic signals. Frigate camera DNAT passthrough + solar inverter path + HA MQTT sensors run on this Pi. The rpi_* metrics come from a vcgencmd textfile collector; the rest from node_exporter.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"id": 1,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Status",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "up{job=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "OK" }, "1": { "color": "red", "text": "YES" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"id": 2,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Under-voltage (since boot)",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "No" }, "1": { "color": "red", "text": "THROTTLED" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"id": 3,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Throttled now",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "rw" }, "1": { "color": "red", "text": "READ-ONLY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"id": 4,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Rootfs mount state",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_readonly{instance=\"rpi-sofia\", mountpoint=\"/\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "celsius",
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 80 } ] }
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"id": 5,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "SoC Temperature",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_soc_temp_celsius{instance=\"rpi-sofia\"}", "legendFormat": "vcgencmd temp", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_thermal_zone_temp{instance=\"rpi-sofia\"}", "legendFormat": "thermal zone", "refId": "B" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 20, "lineWidth": 2, "showPoints": "never", "lineInterpolation": "stepAfter" },
"max": 1,
"min": 0
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"id": 6,
"options": { "legend": { "calcs": ["max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Throttle / Under-voltage events (1 = active)",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_now{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage now", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage since boot", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "legendFormat": "throttled now", "refId": "C" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_occurred{instance=\"rpi-sofia\"}", "legendFormat": "throttled since boot", "refId": "D" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"id": 7,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "CPU load average",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load1{instance=\"rpi-sofia\"}", "legendFormat": "load1", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load5{instance=\"rpi-sofia\"}", "legendFormat": "load5", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load15{instance=\"rpi-sofia\"}", "legendFormat": "load15", "refId": "C" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "bytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"id": 8,
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Memory",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemAvailable_bytes{instance=\"rpi-sofia\"}", "legendFormat": "available", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemTotal_bytes{instance=\"rpi-sofia\"}", "legendFormat": "total", "refId": "B" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "bytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"id": 9,
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Root filesystem free space",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_avail_bytes{instance=\"rpi-sofia\", mountpoint=\"/\"}", "legendFormat": "/ available", "refId": "A" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "Bps"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"id": 10,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Network throughput per interface",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_receive_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "rx {{device}}", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_transmit_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "tx {{device}}", "refId": "B" }
]
}
],
"refresh": "1m",
"schemaVersion": 39,
"tags": ["rpi-sofia", "hardware", "sofia"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "RPi Sofia",
"uid": "rpi-sofia",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,488 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Technitium DNS query logs from PostgreSQL",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"title": "Total Queries",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT COUNT(*) as total_queries FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Cached %",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.3 },
{ "color": "green", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END)::float / NULLIF(COUNT(*), 0) as cached_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Blocked %",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 0.3 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END)::float / NULLIF(COUNT(*), 0) as blocked_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "NxDomain %",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.2 },
{ "color": "red", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END)::float / NULLIF(COUNT(*), 0) as nxdomain_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Avg Response Time",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "ms",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "red", "value": 200 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT AVG(response_rtt) as avg_rtt_ms FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries by Protocol",
"type": "stat",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN protocol = 0 THEN 1 ELSE 0 END) as UDP, SUM(CASE WHEN protocol = 1 THEN 1 ELSE 0 END) as TCP, SUM(CASE WHEN protocol = 3 THEN 1 ELSE 0 END) as DoH, SUM(CASE WHEN protocol = 4 THEN 1 ELSE 0 END) as DoT FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries Over Time",
"type": "timeseries",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["sum"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as Authoritative, SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as Recursive, SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as Cached, SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as Blocked, SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as Dropped FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Response Codes",
"type": "piechart",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 0 THEN 1 ELSE 0 END) as \"NOERROR\", SUM(CASE WHEN rcode = 2 THEN 1 ELSE 0 END) as \"SERVFAIL\", SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END) as \"NXDOMAIN\", SUM(CASE WHEN rcode = 5 THEN 1 ELSE 0 END) as \"REFUSED\", SUM(CASE WHEN rcode NOT IN (0,2,3,5) THEN 1 ELSE 0 END) as \"Other\" FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Response Types",
"type": "piechart",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "Cached" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Blocked" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Recursive" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Authoritative" }, "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as \"Authoritative\", SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as \"Recursive\", SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as \"Cached\", SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as \"Blocked\", SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as \"Dropped\" FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Query Types",
"type": "piechart",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN qtype = 1 THEN 1 ELSE 0 END) as A, SUM(CASE WHEN qtype = 28 THEN 1 ELSE 0 END) as AAAA, SUM(CASE WHEN qtype = 5 THEN 1 ELSE 0 END) as CNAME, SUM(CASE WHEN qtype = 15 THEN 1 ELSE 0 END) as MX, SUM(CASE WHEN qtype = 16 THEN 1 ELSE 0 END) as TXT, SUM(CASE WHEN qtype = 33 THEN 1 ELSE 0 END) as SRV, SUM(CASE WHEN qtype = 12 THEN 1 ELSE 0 END) as PTR, SUM(CASE WHEN qtype = 6 THEN 1 ELSE 0 END) as SOA, SUM(CASE WHEN qtype = 2 THEN 1 ELSE 0 END) as NS, SUM(CASE WHEN qtype = 65 THEN 1 ELSE 0 END) as HTTPS, SUM(CASE WHEN qtype NOT IN (1,2,5,6,12,15,16,28,33,65) THEN 1 ELSE 0 END) as Other FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Queried Domains",
"type": "table",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Clients",
"type": "table",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT client_ip, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY client_ip ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Average Response Time Over Time",
"type": "timeseries",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "ms",
"custom": {
"axisBorderShow": false,
"axisLabel": "Response Time (ms)",
"axisPlacement": "auto",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, AVG(response_rtt) as avg_rtt, MAX(response_rtt) as max_rtt FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Top 20 NxDomain Domains",
"type": "table",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND rcode = 3 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Blocked Domains",
"type": "table",
"datasource": { "type": "postgres", "uid": "technitium-pg" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND response_type = 4 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
}
],
"refresh": "5m",
"schemaVersion": 39,
"tags": ["dns", "technitium", "postgresql"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Technitium DNS",
"uid": "technitium-dns",
"version": 1
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,303 @@
# HELP snmpEnableAuthenTraps Indicates whether the SNMP entity is permitted to generate authenticationFailure traps - 1.3.6.1.2.1.11.30
# TYPE snmpEnableAuthenTraps gauge
snmpEnableAuthenTraps 2
# HELP snmpInASNParseErrs The total number of ASN.1 or BER errors encountered by the SNMP entity when decoding received SNMP messages. - 1.3.6.1.2.1.11.6
# TYPE snmpInASNParseErrs counter
snmpInASNParseErrs 0
# HELP snmpInBadCommunityNames The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which used an SNMP community name not known to said entity - 1.3.6.1.2.1.11.4
# TYPE snmpInBadCommunityNames counter
snmpInBadCommunityNames 184
# HELP snmpInBadCommunityUses The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which represented an SNMP operation that was not allowed for the SNMP community named in the message - 1.3.6.1.2.1.11.5
# TYPE snmpInBadCommunityUses counter
snmpInBadCommunityUses 0
# HELP snmpInBadValues The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.10
# TYPE snmpInBadValues counter
snmpInBadValues 0
# HELP snmpInBadVersions The total number of SNMP messages which were delivered to the SNMP entity and were for an unsupported SNMP version. - 1.3.6.1.2.1.11.3
# TYPE snmpInBadVersions counter
snmpInBadVersions 0
# HELP snmpInGenErrs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.12
# TYPE snmpInGenErrs counter
snmpInGenErrs 0
# HELP snmpInGetNexts The total number of SNMP Get-Next PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.16
# TYPE snmpInGetNexts counter
snmpInGetNexts 2940
# HELP snmpInGetRequests The total number of SNMP Get-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.15
# TYPE snmpInGetRequests counter
snmpInGetRequests 9
# HELP snmpInGetResponses The total number of SNMP Get-Response PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.18
# TYPE snmpInGetResponses counter
snmpInGetResponses 0
# HELP snmpInNoSuchNames The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `noSuchName'. - 1.3.6.1.2.1.11.9
# TYPE snmpInNoSuchNames counter
snmpInNoSuchNames 0
# HELP snmpInPkts The total number of messages delivered to the SNMP entity from the transport service. - 1.3.6.1.2.1.11.1
# TYPE snmpInPkts counter
snmpInPkts 5928
# HELP snmpInReadOnlys The total number valid SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `readOnly' - 1.3.6.1.2.1.11.11
# TYPE snmpInReadOnlys counter
snmpInReadOnlys 0
# HELP snmpInSetRequests The total number of SNMP Set-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.17
# TYPE snmpInSetRequests counter
snmpInSetRequests 0
# HELP snmpInTooBigs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `tooBig'. - 1.3.6.1.2.1.11.8
# TYPE snmpInTooBigs counter
snmpInTooBigs 0
# HELP snmpInTotalReqVars The total number of MIB objects which have been retrieved successfully by the SNMP protocol entity as the result of receiving valid SNMP Get-Request and Get-Next PDUs. - 1.3.6.1.2.1.11.13
# TYPE snmpInTotalReqVars counter
snmpInTotalReqVars 72699
# HELP snmpInTotalSetVars The total number of MIB objects which have been altered successfully by the SNMP protocol entity as the result of receiving valid SNMP Set-Request PDUs. - 1.3.6.1.2.1.11.14
# TYPE snmpInTotalSetVars counter
snmpInTotalSetVars 0
# HELP snmpInTraps The total number of SNMP Trap PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.19
# TYPE snmpInTraps counter
snmpInTraps 0
# HELP snmpOutBadValues The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.22
# TYPE snmpOutBadValues counter
snmpOutBadValues 0
# HELP snmpOutGenErrs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.24
# TYPE snmpOutGenErrs counter
snmpOutGenErrs 0
# HELP snmpOutGetNexts The total number of SNMP Get-Next PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.26
# TYPE snmpOutGetNexts counter
snmpOutGetNexts 0
# HELP snmpOutGetRequests The total number of SNMP Get-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.25
# TYPE snmpOutGetRequests counter
snmpOutGetRequests 0
# HELP snmpOutGetResponses The total number of SNMP Get-Response PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.28
# TYPE snmpOutGetResponses counter
snmpOutGetResponses 5740
# HELP snmpOutNoSuchNames The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status was `noSuchName'. - 1.3.6.1.2.1.11.21
# TYPE snmpOutNoSuchNames counter
snmpOutNoSuchNames 0
# HELP snmpOutPkts The total number of SNMP Messages which were passed from the SNMP protocol entity to the transport service. - 1.3.6.1.2.1.11.2
# TYPE snmpOutPkts counter
snmpOutPkts 5739
# HELP snmpOutSetRequests The total number of SNMP Set-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.27
# TYPE snmpOutSetRequests counter
snmpOutSetRequests 0
# HELP snmpOutTooBigs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `tooBig.' - 1.3.6.1.2.1.11.20
# TYPE snmpOutTooBigs counter
snmpOutTooBigs 0
# HELP snmpOutTraps The total number of SNMP Trap PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.29
# TYPE snmpOutTraps counter
snmpOutTraps 0
# HELP snmpProxyDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the transmission of the (possibly translated) message to a proxy target failed in a manner (other than a time-out) such that no Response Class PDU (such as a Response-PDU) could be returned. - 1.3.6.1.2.1.11.32
# TYPE snmpProxyDrops counter
snmpProxyDrops 0
# HELP snmpSilentDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the size of a reply containing an alternate Response Class PDU (such as a Response-PDU) with an empty variable-bindings field was greater than either a local constraint or the maximum message size associated with the originator of the request. - 1.3.6.1.2.1.11.31
# TYPE snmpSilentDrops counter
snmpSilentDrops 0
# HELP snmp_scrape_duration_seconds Total SNMP time scrape took (walk and processing).
# TYPE snmp_scrape_duration_seconds gauge
snmp_scrape_duration_seconds{module="huawei"} 0.39253882
# HELP snmp_scrape_packets_retried Packets retried for get, bulkget, and walk.
# TYPE snmp_scrape_packets_retried gauge
snmp_scrape_packets_retried{module="huawei"} 0
# HELP snmp_scrape_packets_sent Packets sent for get, bulkget, and walk; including retries.
# TYPE snmp_scrape_packets_sent gauge
snmp_scrape_packets_sent{module="huawei"} 6
# HELP snmp_scrape_pdus_returned PDUs returned from get, bulkget, and walk.
# TYPE snmp_scrape_pdus_returned gauge
snmp_scrape_pdus_returned{module="huawei"} 104
# HELP snmp_scrape_walk_duration_seconds Time SNMP walk/bulkwalk took.
# TYPE snmp_scrape_walk_duration_seconds gauge
snmp_scrape_walk_duration_seconds{module="huawei"} 0.391760524
# HELP sysContact The textual identification of the contact person for this managed node, together with information on how to contact this person - 1.3.6.1.2.1.1.4
# TYPE sysContact gauge
sysContact{sysContact="Not Configure System Contact"} 1
# HELP sysDescr A textual description of the entity - 1.3.6.1.2.1.1.1
# TYPE sysDescr gauge
sysDescr{sysDescr="Linux GSE200M 2.6.27-SPEAr310 #80 Fri Jan 13 11:22:09 CST 2017 armv5tejl"} 1
# HELP sysLocation The physical location of this node (e.g., 'telephone closet, 3rd floor') - 1.3.6.1.2.1.1.6
# TYPE sysLocation gauge
sysLocation{sysLocation="Garage G03"} 1
# HELP sysName An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5
# TYPE sysName gauge
sysName{sysName="ups2000"} 1
# HELP sysORDescr A textual description of the capabilities identified by the corresponding instance of sysORID. - 1.3.6.1.2.1.1.9.1.3
# TYPE sysORDescr gauge
sysORDescr{sysORDescr="The MIB for Message Processing and Dispatching.",sysORIndex="3"} 1
sysORDescr{sysORDescr="The MIB module for SNMPv2 entities",sysORIndex="1"} 1
sysORDescr{sysORDescr="The SNMP Management Architecture MIB.",sysORIndex="5"} 1
sysORDescr{sysORDescr="The management information definitions for the SNMP User-based Security Model.",sysORIndex="4"} 1
sysORDescr{sysORDescr="View-based Access Control Model for SNMP.",sysORIndex="2"} 1
# HELP sysORID An authoritative identification of a capabilities statement with respect to various MIB modules supported by the local SNMP application acting as a command responder. - 1.3.6.1.2.1.1.9.1.2
# TYPE sysORID gauge
sysORID{sysORID="1.3.6.1.6.3.1",sysORIndex="1"} 1
sysORID{sysORID="1.3.6.1.6.3.10.3.1.1",sysORIndex="5"} 1
sysORID{sysORID="1.3.6.1.6.3.11.3.1.1",sysORIndex="3"} 1
sysORID{sysORID="1.3.6.1.6.3.15.2.1.1",sysORIndex="4"} 1
sysORID{sysORID="1.3.6.1.6.3.16.2.2.1",sysORIndex="2"} 1
# HELP sysORLastChange The value of sysUpTime at the time of the most recent change in state or value of any instance of sysORID. - 1.3.6.1.2.1.1.8
# TYPE sysORLastChange gauge
sysORLastChange 8
# HELP sysORUpTime The value of sysUpTime at the time this conceptual row was last instantiated. - 1.3.6.1.2.1.1.9.1.4
# TYPE sysORUpTime gauge
sysORUpTime{sysORIndex="1"} 7
sysORUpTime{sysORIndex="2"} 8
sysORUpTime{sysORIndex="3"} 8
sysORUpTime{sysORIndex="4"} 8
sysORUpTime{sysORIndex="5"} 8
# HELP sysObjectID The vendor's authoritative identification of the network management subsystem contained in the entity - 1.3.6.1.2.1.1.2
# TYPE sysObjectID gauge
sysObjectID{sysObjectID="1.3.6.1.4.1.8072.3.2.10"} 1
# HELP sysUpTime The time (in hundredths of a second) since the network management portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3
# TYPE sysUpTime gauge
sysUpTime 5.3264032e+07
# HELP upsAlarmsPresent The present number of active alarm conditions. - 1.3.6.1.2.1.33.1.6.1
# TYPE upsAlarmsPresent gauge
upsAlarmsPresent 0
# HELP upsAutoRestart Setting this object to 'on' will cause the UPS system to restart after a shutdown if the shutdown occurred during a power loss as a result of either a upsShutdownAfterDelay or an internal battery depleted condition - 1.3.6.1.2.1.33.1.8.5
# TYPE upsAutoRestart gauge
upsAutoRestart 0
# HELP upsBatteryCurrent The present battery current. - 1.3.6.1.2.1.33.1.2.6
# TYPE upsBatteryCurrent gauge
upsBatteryCurrent 2.147483647e+09
# HELP upsBatteryStatus The indication of the capacity remaining in the UPS system's batteries - 1.3.6.1.2.1.33.1.2.1
# TYPE upsBatteryStatus gauge
upsBatteryStatus 2
# HELP upsBatteryTemperature The ambient temperature at or near the UPS Battery casing. - 1.3.6.1.2.1.33.1.2.7
# TYPE upsBatteryTemperature gauge
upsBatteryTemperature 2.147483647e+09
# HELP upsBatteryVoltage The magnitude of the present battery voltage. - 1.3.6.1.2.1.33.1.2.5
# TYPE upsBatteryVoltage gauge
upsBatteryVoltage 821
# HELP upsBypassFrequency The present bypass frequency. - 1.3.6.1.2.1.33.1.5.1
# TYPE upsBypassFrequency gauge
upsBypassFrequency 500
# HELP upsBypassLineIndex The bypass line identifier. - 1.3.6.1.2.1.33.1.5.3.1.1
# TYPE upsBypassLineIndex gauge
upsBypassLineIndex{upsBypassLineIndex="1"} 1
# HELP upsBypassNumLines The number of bypass lines utilized in this device - 1.3.6.1.2.1.33.1.5.2
# TYPE upsBypassNumLines gauge
upsBypassNumLines 1
# HELP upsBypassVoltage The present bypass voltage. - 1.3.6.1.2.1.33.1.5.3.1.2
# TYPE upsBypassVoltage gauge
upsBypassVoltage{upsBypassLineIndex="1"} 220
# HELP upsConfigAudibleStatus The requested state of the audible alarm - 1.3.6.1.2.1.33.1.9.8
# TYPE upsConfigAudibleStatus gauge
upsConfigAudibleStatus 0
# HELP upsConfigHighVoltageTransferPoint The maximum line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.10
# TYPE upsConfigHighVoltageTransferPoint gauge
upsConfigHighVoltageTransferPoint 0
# HELP upsConfigInputFreq The nominal input frequency - 1.3.6.1.2.1.33.1.9.2
# TYPE upsConfigInputFreq gauge
upsConfigInputFreq 0
# HELP upsConfigInputVoltage The magnitude of the nominal input voltage - 1.3.6.1.2.1.33.1.9.1
# TYPE upsConfigInputVoltage gauge
upsConfigInputVoltage 0
# HELP upsConfigLowBattTime The value of upsEstimatedMinutesRemaining at which a lowBattery condition is declared - 1.3.6.1.2.1.33.1.9.7
# TYPE upsConfigLowBattTime gauge
upsConfigLowBattTime 0
# HELP upsConfigLowVoltageTransferPoint The minimum input line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.9
# TYPE upsConfigLowVoltageTransferPoint gauge
upsConfigLowVoltageTransferPoint 0
# HELP upsConfigOutputFreq The nominal output frequency - 1.3.6.1.2.1.33.1.9.4
# TYPE upsConfigOutputFreq gauge
upsConfigOutputFreq 0
# HELP upsConfigOutputPower The magnitude of the nominal true power rating. - 1.3.6.1.2.1.33.1.9.6
# TYPE upsConfigOutputPower gauge
upsConfigOutputPower 0
# HELP upsConfigOutputVA The magnitude of the nominal Volt-Amp rating. - 1.3.6.1.2.1.33.1.9.5
# TYPE upsConfigOutputVA gauge
upsConfigOutputVA 0
# HELP upsConfigOutputVoltage The magnitude of the nominal output voltage - 1.3.6.1.2.1.33.1.9.3
# TYPE upsConfigOutputVoltage gauge
upsConfigOutputVoltage 0
# HELP upsEstimatedChargeRemaining An estimate of the battery charge remaining expressed as a percent of full charge. - 1.3.6.1.2.1.33.1.2.4
# TYPE upsEstimatedChargeRemaining gauge
upsEstimatedChargeRemaining 91
# HELP upsEstimatedMinutesRemaining An estimate of the time to battery charge depletion under the present load conditions if the utility power is off and remains off, or if it were to be lost and remain off. - 1.3.6.1.2.1.33.1.2.3
# TYPE upsEstimatedMinutesRemaining gauge
upsEstimatedMinutesRemaining 34
# HELP upsIdentAgentSoftwareVersion The UPS agent software version - 1.3.6.1.2.1.33.1.1.4
# TYPE upsIdentAgentSoftwareVersion gauge
upsIdentAgentSoftwareVersion{upsIdentAgentSoftwareVersion="V200R001C31B016"} 1
# HELP upsIdentAttachedDevices A string identifying the devices attached to the output(s) of the UPS - 1.3.6.1.2.1.33.1.1.6
# TYPE upsIdentAttachedDevices gauge
upsIdentAttachedDevices{upsIdentAttachedDevices="None"} 1
# HELP upsIdentManufacturer The name of the UPS manufacturer. - 1.3.6.1.2.1.33.1.1.1
# TYPE upsIdentManufacturer gauge
upsIdentManufacturer{upsIdentManufacturer="HUAWEI"} 1
# HELP upsIdentModel The UPS Model designation. - 1.3.6.1.2.1.33.1.1.2
# TYPE upsIdentModel gauge
upsIdentModel{upsIdentModel="UPS2000 2kVA"} 1
# HELP upsIdentName A string identifying the UPS - 1.3.6.1.2.1.33.1.1.5
# TYPE upsIdentName gauge
upsIdentName{upsIdentName="ups2000"} 1
# HELP upsIdentUPSSoftwareVersion The UPS firmware/software version(s) - 1.3.6.1.2.1.33.1.1.3
# TYPE upsIdentUPSSoftwareVersion gauge
upsIdentUPSSoftwareVersion{upsIdentUPSSoftwareVersion="V2R1C1SPC40"} 1
# HELP upsInputFrequency The present input frequency. - 1.3.6.1.2.1.33.1.3.3.1.2
# TYPE upsInputFrequency gauge
upsInputFrequency{upsInputLineIndex="1"} 500
# HELP upsInputLineBads A count of the number of times the input entered an out-of-tolerance condition as defined by the manufacturer - 1.3.6.1.2.1.33.1.3.1
# TYPE upsInputLineBads counter
upsInputLineBads 0
# HELP upsInputLineIndex The input line identifier. - 1.3.6.1.2.1.33.1.3.3.1.1
# TYPE upsInputLineIndex gauge
upsInputLineIndex{upsInputLineIndex="1"} 1
# HELP upsInputNumLines The number of input lines utilized in this device - 1.3.6.1.2.1.33.1.3.2
# TYPE upsInputNumLines gauge
upsInputNumLines 1
# HELP upsInputVoltage The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
# TYPE upsInputVoltage gauge
upsInputVoltage{upsInputLineIndex="1"} 218
# HELP upsOutputCurrent The present output current. - 1.3.6.1.2.1.33.1.4.4.1.3
# TYPE upsOutputCurrent gauge
upsOutputCurrent{upsOutputLineIndex="1"} 56
# HELP upsOutputFrequency The present output frequency. - 1.3.6.1.2.1.33.1.4.2
# TYPE upsOutputFrequency gauge
upsOutputFrequency 500
# HELP upsOutputLineIndex The output line identifier. - 1.3.6.1.2.1.33.1.4.4.1.1
# TYPE upsOutputLineIndex gauge
upsOutputLineIndex{upsOutputLineIndex="1"} 1
# HELP upsOutputNumLines The number of output lines utilized in this device - 1.3.6.1.2.1.33.1.4.3
# TYPE upsOutputNumLines gauge
upsOutputNumLines 1
# HELP upsOutputPercentLoad The percentage of the UPS power capacity presently being used on this output line, i.e., the greater of the percent load of true power capacity and the percent load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
# TYPE upsOutputPercentLoad gauge
upsOutputPercentLoad{upsOutputLineIndex="1"} 66
# HELP upsOutputPower The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
# TYPE upsOutputPower gauge
upsOutputPower{upsOutputLineIndex="1"} 1
# HELP upsOutputSource The present source of output power - 1.3.6.1.2.1.33.1.4.1
# TYPE upsOutputSource gauge
upsOutputSource 3
# HELP upsOutputVoltage The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
# TYPE upsOutputVoltage gauge
upsOutputVoltage{upsOutputLineIndex="1"} 230
# HELP upsRebootWithDuration Setting this object will immediately shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) for a period equal to the indicated number of seconds, after which time the output will be started, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.4
# TYPE upsRebootWithDuration gauge
upsRebootWithDuration 0
# HELP upsSecondsOnBattery If the unit is on battery power, the elapsed time since the UPS last switched to battery power, or the time since the network management subsystem was last restarted, whichever is less - 1.3.6.1.2.1.33.1.2.2
# TYPE upsSecondsOnBattery gauge
upsSecondsOnBattery 0
# HELP upsShutdownAfterDelay Setting this object will shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) after the indicated number of seconds, or less if the UPS batteries become depleted - 1.3.6.1.2.1.33.1.8.2
# TYPE upsShutdownAfterDelay gauge
upsShutdownAfterDelay 0
# HELP upsShutdownType This object determines the nature of the action to be taken at the time when the countdown of the upsShutdownAfterDelay and upsRebootWithDuration objects reaches zero - 1.3.6.1.2.1.33.1.8.1
# TYPE upsShutdownType gauge
upsShutdownType 0
# HELP upsStartupAfterDelay Setting this object will start the output after the indicated number of seconds, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.3
# TYPE upsStartupAfterDelay gauge
upsStartupAfterDelay 0
# HELP upsTestElapsedTime The amount of time, in TimeTicks, since the test in progress was initiated, or, if no test is in progress, the previous test took to complete - 1.3.6.1.2.1.33.1.7.6
# TYPE upsTestElapsedTime gauge
upsTestElapsedTime 0
# HELP upsTestId The test is named by an OBJECT IDENTIFIER which allows a standard mechanism for the initiation of tests, including the well known tests identified in this document as well as those introduced by a particular implementation, i.e., as documented in the private enterprise MIB definition for the device - 1.3.6.1.2.1.33.1.7.1
# TYPE upsTestId gauge
upsTestId{upsTestId="0"} 1
# HELP upsTestResultsDetail Additional information about upsTestResultsSummary - 1.3.6.1.2.1.33.1.7.4
# TYPE upsTestResultsDetail gauge
upsTestResultsDetail{upsTestResultsDetail="0"} 1
# HELP upsTestResultsSummary The results of the current or last UPS diagnostics test performed - 1.3.6.1.2.1.33.1.7.3
# TYPE upsTestResultsSummary gauge
upsTestResultsSummary 0
# HELP upsTestSpinLock A spin lock on the test subsystem - 1.3.6.1.2.1.33.1.7.2
# TYPE upsTestSpinLock gauge
upsTestSpinLock 0
# HELP upsTestStartTime The value of sysUpTime at the time the test in progress was initiated, or, if no test is in progress, the time the previous test was initiated - 1.3.6.1.2.1.33.1.7.5
# TYPE upsTestStartTime gauge
upsTestStartTime 0

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,113 @@
# goflow2 flow collector: one replica that listens for NetFlow on UDP/2055 and
# declares a second TCP port (8080) named "metrics".
resource "kubernetes_deployment" "goflow2" {
  metadata {
    name      = "goflow2"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "goflow2"
      tier = var.tier
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "goflow2"
      }
    }

    template {
      metadata {
        labels = {
          app = "goflow2"
        }
      }

      spec {
        container {
          name  = "goflow2"
          image = "netsampler/goflow2:v2.2.1"
          args  = ["-listen", "netflow://:2055"]

          # Flow ingest socket (matches the -listen argument above).
          port {
            name           = "netflow"
            container_port = 2055
            protocol       = "UDP"
          }

          # TCP port named "metrics"; exposed by the goflow2 Service.
          port {
            name           = "metrics"
            container_port = 8080
            protocol       = "TCP"
          }

          # Memory request equals the memory limit; no CPU limit is set.
          resources {
            requests = {
              cpu    = "50m"
              memory = "128Mi"
            }
            limits = {
              memory = "128Mi"
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: the monitoring namespace is keel-enrolled (policy=patch), so Keel owns
    # the image tag and injects keel.sh annotations. They are ignored here so
    # Terraform stops reverting Keel on every plan (completes the cdb7d9a8 KEEL
    # sweep that missed these exporters and was tripping drift-detection exit 2
    # every run). 2026-05-31.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# In-cluster Service for goflow2's "metrics" port (TCP/8080). No `type` is set,
# so the provider/Kubernetes default Service type applies.
resource "kubernetes_service" "goflow2" {
  metadata {
    name      = "goflow2"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "goflow2"
    }
  }

  spec {
    # Selects the pods created by kubernetes_deployment.goflow2 (app=goflow2).
    selector = {
      app = "goflow2"
    }

    port {
      name        = "metrics"
      protocol    = "TCP"
      port        = 8080
      target_port = 8080
    }
  }
}
# NodePort Service that exposes goflow2's NetFlow listener (UDP/2055) on every
# node at the fixed port 32055, so exporters outside the cluster can send flows.
# NOTE(review): no external_traffic_policy is set; if the exporter's source IP
# matters to goflow2, confirm it survives kube-proxy forwarding.
resource "kubernetes_service" "goflow2-netflow" {
  metadata {
    name      = "goflow2-netflow"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app = "goflow2"
    }
  }

  spec {
    type = "NodePort"

    selector = {
      app = "goflow2"
    }

    port {
      name        = "netflow"
      protocol    = "UDP"
      port        = 2055
      target_port = 2055
      node_port   = 32055
    }
  }
}

View file

@ -0,0 +1,242 @@
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
# metadata {
# name = "grafana-pv"
# }
# spec {
# capacity = {
# "storage" = "2Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# nfs {
# path = "/mnt/main/grafana"
# server = var.nfs_server
# }
# # iscsi {
# # target_portal = "iscsi.viktorbarzin.lan:3260"
# # iqn = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
# # lun = 0
# # fs_type = "ext4"
# # }
# }
# }
# }
# Statically provisioned 2Gi PersistentVolume for Alertmanager, backed by an
# NFS export mounted through the nfs.csi.k8s.io CSI driver. Reclaim policy is
# Retain, so the data outlives any claim bound to it.
# NOTE(review): the NFS server address is hard-coded here rather than taken
# from a variable; confirm that is intentional before reusing this block.
resource "kubernetes_persistent_volume" "alertmanager_pv" {
  metadata {
    name = "alertmanager-pv"
  }

  spec {
    storage_class_name               = "nfs-truenas"
    persistent_volume_reclaim_policy = "Retain"
    access_modes                     = ["ReadWriteOnce"]

    capacity = {
      "storage" = "2Gi"
    }

    persistent_volume_source {
      csi {
        driver        = "nfs.csi.k8s.io"
        volume_handle = "alertmanager-pv"
        volume_attributes = {
          server = "192.168.1.127"
          share  = "/srv/nfs/alertmanager"
        }
      }
    }

    # NFS client options passed through to the mount.
    mount_options = [
      "soft",
      "timeo=30",
      "retrans=3",
      "actimeo=5",
    ]
  }
}
# resource "kubernetes_persistent_volume_claim" "grafana_pvc" {
# metadata {
# name = "grafana-pvc"
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# }
# spec {
# access_modes = ["ReadWriteOnce"]
# resources {
# requests = {
# "storage" = "2Gi"
# }
# }
# }
# }
# Grafana DB credentials from the Vault database engine (rotated automatically).
# This ExternalSecret reads `password` from static-creds/mysql-grafana through
# the vault-database ClusterSecretStore every 15 minutes and renders it into a
# Secret named grafana-db-creds under the key GF_DATABASE_PASSWORD, so the
# value follows password rotations.
resource "kubernetes_manifest" "grafana_db_creds" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "grafana-db-creds"
      namespace = kubernetes_namespace.monitoring.metadata[0].name
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-database"
      }

      # Source: one remote property, exposed to the template as `.password`.
      data = [{
        secretKey = "password"
        remoteRef = {
          key      = "static-creds/mysql-grafana"
          property = "password"
        }
      }]

      # Destination Secret and the key layout it is rendered with.
      target = {
        name = "grafana-db-creds"
        template = {
          data = {
            GF_DATABASE_PASSWORD = "{{ .password }}"
          }
        }
      }
    }
  }
}
locals {
  # Maps a dashboard file name (as found under dashboards/) to the Grafana
  # folder it is filed in. Files missing from this map fall back to "General"
  # via the lookup() in kubernetes_config_map.grafana_dashboards.
  dashboard_folders = {
    # Cluster & Kubernetes
    "api_server.json"         = "Cluster"
    "cluster_health.json"     = "Cluster"
    "nodes.json"              = "Cluster"
    "pods.json"               = "Cluster"
    "kube-state-metrics.json" = "Cluster"

    # Networking & DNS
    "core_dns.json"        = "Networking"
    "technitium-dns.json"  = "Networking"
    "nginx_ingress.json"   = "Networking"
    "network_traffic.json" = "Networking"

    # Hardware & Host
    "node_exporter_full.json"    = "Hardware"
    "proxmox_node_exporter.json" = "Hardware"
    "idrac.json"                 = "Hardware"
    "ups.json"                   = "Hardware"
    "nvidia.json"                = "Hardware"
    "rpi-sofia.json"             = "Hardware"

    # Operations
    "backup_health.json" = "Operations"
    "registry.json"      = "Operations"
    "loki.json"          = "Operations"
    "k8s-audit.json"     = "Operations"

    # Logs
    "cluster-logs.json" = "Logs"

    # Applications
    "qbittorrent.json"        = "Applications"
    "realestate-crawler.json" = "Applications"
    "openclaw.json"           = "Applications"

    # Finance: "Finance (Personal)" is the admin-only folder listed below;
    # plain "Finance" is a separate, unrestricted folder.
    "uk-payslip.json"     = "Finance (Personal)"
    "wealth.json"         = "Finance (Personal)"
    "job-hunter.json"     = "Finance"
    "fire-planner.json"   = "Finance"
    "cost-of-living.json" = "Finance"
  }

  # Folders restricted to the Grafana admin user (anonymous Viewer and any
  # future non-admin users are denied). The ACL is applied by the null_resource
  # below through the Grafana folder permissions API, after the dashboard
  # sidecar has auto-created the folder. Server-admin always retains access
  # regardless of folder ACL.
  admin_only_folders = [
    "Finance (Personal)",
  ]
}
# One ConfigMap per dashboard JSON file in dashboards/. The grafana_dashboard
# label is what the Grafana sidecar selects on; the grafana_folder annotation
# carries the target folder (default "General" when the file is not listed in
# local.dashboard_folders).
resource "kubernetes_config_map" "grafana_dashboards" {
  # fileset() returns a set, so each.key and each.value are the same file name.
  for_each = fileset("${path.module}/dashboards", "*.json")

  metadata {
    # File name without ".json", underscores turned into dashes.
    name      = "grafana-dashboard-${replace(trimsuffix(each.key, ".json"), "_", "-")}"
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    labels = {
      grafana_dashboard = "1"
    }

    annotations = {
      grafana_folder = lookup(local.dashboard_folders, each.key, "General")
    }
  }

  # Single entry: key is the original file name, value is the file content.
  data = {
    (each.key) = file("${path.module}/dashboards/${each.key}")
  }
}
# Lock down "admin only" folders via the Grafana folder permissions API.
# Default org-role inheritance gives Viewer + Editor read access to every
# folder; explicitly setting the folder ACL to {Admin: 4} overrides that
# inheritance so Viewer/Editor (incl. anonymous-Viewer) get no access.
# The Grafana super-admin (`admin` user) always retains access regardless.
#
# One instance per entry in local.admin_only_folders. The script needs
# kubectl, jq and base64 on the machine running Terraform, and curl inside the
# grafana container.
resource "null_resource" "grafana_admin_only_folder_acl" {
  for_each = toset(local.admin_only_folders)

  # `always = timestamp()` changes on every run, so the provisioner re-runs on
  # each apply (cheap, idempotent API call). Catches drift if anyone edits
  # permissions via the UI or the folder is rebuilt.
  triggers = {
    folder = each.value
    always = timestamp()
  }

  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<-EOT
      set -euo pipefail
      FOLDER='${each.value}'

      # kubectl wrapper: keeps the kubeconfig path as ONE argument, so a path
      # containing spaces is not word-split.
      k() { kubectl --kubeconfig '${var.kube_config_path}' "$@"; }

      POD=$(k get pod -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')
      # NOTE(review): the password is passed to `kubectl exec` as a command
      # argument below, so it is visible in the local process list and
      # presumably in API-server audit records of the exec request. Confirm
      # whether that is acceptable.
      ADMIN_PW=$(k get secret -n monitoring grafana -o jsonpath='{.data.admin-password}' | base64 -d)

      # Wait up to 60s for the dashboard sidecar to materialise the folder.
      # Lookup failures are swallowed (|| true) so the loop retries.
      FOLDER_UID=""
      for i in $(seq 1 12); do
        FOLDER_UID=$(k exec -n monitoring "$POD" -c grafana -- \
          curl -sf -u "admin:$ADMIN_PW" "http://localhost:3000/api/folders" \
          | jq -r --arg t "$FOLDER" 'first(.[] | select(.title == $t) | .uid) // ""' || true)
        if [ -n "$FOLDER_UID" ]; then break; fi
        sleep 5
      done

      if [ -z "$FOLDER_UID" ]; then
        echo "ERROR: folder '$FOLDER' not found in Grafana after 60s" >&2
        exit 1
      fi

      # Admin-only ACL. permission codes: 1=View, 2=Edit, 4=Admin.
      k exec -n monitoring "$POD" -c grafana -- \
        curl -sf -u "admin:$ADMIN_PW" -X POST \
        -H "Content-Type: application/json" \
        -d '{"items":[{"role":"Admin","permission":4}]}' \
        "http://localhost:3000/api/folders/$FOLDER_UID/permissions" >/dev/null

      echo "set admin-only ACL on folder '$FOLDER' (uid=$FOLDER_UID)"
    EOT
  }

  # The folder only exists once Grafana is up and the dashboard ConfigMaps
  # have been created for the sidecar to load.
  depends_on = [
    helm_release.grafana,
    kubernetes_config_map.grafana_dashboards,
  ]
}
# Grafana installed from the upstream grafana Helm chart. Values come from
# grafana_chart_values.yaml rendered with the admin password and MySQL host.
# The release waits for the grafana-db-creds ExternalSecret because the pod
# loads that Secret as environment (envFromSecrets in the values file).
# NOTE(review): no chart `version` is pinned in this block.
resource "helm_release" "grafana" {
  name             = "grafana"
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"

  # atomic: a failed install/upgrade is rolled back; allow up to 600s.
  atomic  = true
  timeout = 600

  values = [
    templatefile("${path.module}/grafana_chart_values.yaml", {
      grafana_admin_password = var.grafana_admin_password
      mysql_host             = var.mysql_host
    }),
  ]

  depends_on = [kubernetes_manifest.grafana_db_creds]
}

View file

@ -0,0 +1,132 @@
# Rollout: with a single replica, maxSurge 0 / maxUnavailable 1 means the old
# pod is removed before its replacement is started.
deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 0
    maxUnavailable: 1
replicas: 1

# Filled in by Terraform templatefile().
adminPassword: "${grafana_admin_password}"

plugins:
  - netsage-sankey-panel

# Memory request equals the limit; no CPU limit is set.
resources:
  requests:
    cpu: 50m
    memory: 512Mi
  limits:
    memory: 512Mi

# Soft spreading of grafana pods across nodes (ScheduleAnyway = best effort).
topologySpreadConstraints:
  - maxSkew: 1
    topologyKey: kubernetes.io/hostname
    whenUnsatisfiable: ScheduleAnyway
    labelSelector:
      matchLabels:
        app.kubernetes.io/name: grafana

podAnnotations:
  dependency.kyverno.io/wait-for: "mysql.dbaas:3306"
  reloader.stakater.com/auto: "true"

podDisruptionBudget:
  maxUnavailable: 1

# Persistence is off; existingClaim is only consulted when it is enabled.
persistence:
  enabled: false # using external mysql
  existingClaim: "grafana-pvc"
# Traefik ingress for grafana.viktorbarzin.me on the websecure entrypoint,
# behind the rate-limit, CSP, CrowdSec and Authentik forward-auth middlewares.
ingress:
  # Boolean, not the string "true": Helm treats any non-empty string as
  # truthy, so a quoted "false" would still enable the ingress.
  enabled: true
  ingressClassName: "traefik"
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd,traefik-authentik-forward-auth@kubernetescrd"
    traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
    # gethomepage.dev/* annotations configure the Homepage dashboard tile/widget.
    gethomepage.dev/enabled: "true"
    gethomepage.dev/name: "Grafana"
    gethomepage.dev/description: "Dashboards & observability"
    gethomepage.dev/icon: "grafana.png"
    gethomepage.dev/group: "Core Platform"
    gethomepage.dev/pod-selector: ""
    gethomepage.dev/widget.type: "grafana"
    gethomepage.dev/widget.url: "http://grafana.monitoring.svc.cluster.local"
    gethomepage.dev/widget.username: "admin"
    # NOTE(review): this stores the Grafana admin password in plaintext in the
    # Ingress object's annotations, readable by anyone who can read Ingresses
    # in this namespace. Confirm this is acceptable.
    gethomepage.dev/widget.password: "${grafana_admin_password}"
  tls:
    - secretName: "tls-secret"
      hosts:
        - "grafana.viktorbarzin.me"
  hosts:
    - "grafana.viktorbarzin.me"
# Sidecars that load provisioning content from labelled ConfigMaps/Secrets.
sidecar:
  datasources:
    # Boolean, not the string "true": Helm treats any non-empty string as
    # truthy, so a quoted "false" would still enable the sidecar.
    enabled: true
  dashboards:
    enabled: true
    # ConfigMaps carrying this label are picked up as dashboards.
    label: "grafana_dashboard"
    # The value of this annotation on the ConfigMap names the target folder;
    # it is used together with provider.foldersFromFilesStructure below.
    folderAnnotation: "grafana_folder"
    provider:
      foldersFromFilesStructure: true
# File-based dashboard provider definition.
# NOTE(review): Grafana's provisioning format (and the chart's example values)
# put name/orgId/type/options inside a `providers:` list. No such list key is
# present here, so this section presumably defines no provider and dashboards
# are served by the sidecar provider only. Left structurally unchanged to
# avoid altering runtime behaviour; confirm and either add `providers:` or
# drop the section.
dashboardProviders:
  dashboardproviders.yaml:
    apiVersion: 1
    name: default
    # Was misspelled "ordId"; the provisioning field is orgId.
    orgId: 1
    # folder: ""
    type: "file"
    # disableDeletion: "false"
    # editable: "true"
    options:
      path: "/var/lib/grafana/dashboards/default"
# Secrets whose keys are injected into the Grafana container as env vars.
envFromSecrets:
  # Required: provides GF_DATABASE_PASSWORD for the grafana.ini database
  # section (created by the grafana-db-creds ExternalSecret).
  - name: grafana-db-creds
    optional: false
  # Cross-namespace passwords for provisioned datasources backed by
  # rotating Vault static-roles. Each source stack creates the secret
  # via its own ExternalSecret in `monitoring`. `optional: true` lets
  # Grafana boot if a stack hasn't applied yet; reloader (podAnnotation
  # above) restarts Grafana when any of these secrets is created or
  # rotated, so $__env{...} substitution in datasource ConfigMaps stays
  # current.
  - name: grafana-wealth-pg-creds
    optional: true
  - name: grafana-payslips-pg-creds
    optional: true
  - name: grafana-job-hunter-pg-creds
    optional: true

# Plain environment variables.
env:
  GF_SERVER_ROOT_URL: https://grafana.viktorbarzin.me
# Rendered into grafana.ini; each child key is an ini section.
grafana.ini:
  # External MySQL backend. The host is filled in by Terraform templatefile();
  # the password is resolved by Grafana at start-up from the environment
  # variable supplied through envFromSecrets.
  database:
    type: mysql
    host: ${mysql_host}:3306
    name: grafana
    user: grafana
    password: $__env{GF_DATABASE_PASSWORD}
    ssl_mode: disable

  auth.anonymous:
    enabled: false

  # Header-based login: the user name is taken from X-authentik-username and
  # unknown users are created on first sight (auto_sign_up).
  # NOTE(review): whitelist is empty, so the header is presumably trusted from
  # any client that can reach Grafana directly (e.g. via the in-cluster
  # Service, bypassing the forward-auth middleware). Confirm this is intended.
  auth.proxy:
    enabled: true
    header_name: X-authentik-username
    header_property: username
    auto_sign_up: true
    sync_ttl: 60
    whitelist: ""
    enable_login_token: false

  # New users join the default org as Viewer.
  users:
    auto_assign_org: true
    auto_assign_org_role: Viewer

  # auth.google:
  #   enabled: true

  analytics:
    check_for_updates: "true"

  grafana_net:
    url: "https://grafana.net"

  log:
    mode: "console"

  paths:
    data: "/var/lib/grafana/data"
    logs: "/var/log/grafana"
    plugins: "/var/lib/grafana/plugins"
    provisioning: "/etc/grafana/provisioning"

  security:
    allow_embedding: true # Allow to be iframed
# url: https://grafana.com/api/dashboards/11074/revisions/2/download
# datasources:
# - name: Prometheus
# url: http://prometheus-server

View file

@ -0,0 +1,162 @@
# Renders the iDRAC Redfish exporter configuration (`config.yml`) that the
# idrac-redfish-exporter Deployment mounts at /etc/prometheus/idrac.yml.
#
# The host, username and password are emitted with jsonencode() so they become
# double-quoted YAML scalars: a password containing `#`, `:`, quotes or leading
# YAML indicators can no longer corrupt the rendered file.
#
# NOTE(review): the iDRAC credentials end up in a ConfigMap, which is readable
# by anything with configmap access in the namespace. Consider moving this to
# a Secret in a follow-up (requires a state move of this resource).
resource "kubernetes_config_map" "redfish-config" {
  metadata {
    name      = "redfish-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      # Marks this ConfigMap for Stakater Reloader; the Deployment carries the
      # matching "search" annotation.
      "reloader.stakater.com/match" = "true"
    }
  }

  data = {
    "config.yml" = <<-EOF
      address: 0.0.0.0
      port: 9610
      hosts:
        ${jsonencode(var.idrac_host)}:
          username: ${jsonencode(var.idrac_username)}
          password: ${jsonencode(var.idrac_password)}
        default:
          username: root
          password: calvin
      metrics:
        # SNMP (snmp-idrac job, dell_idrac module) is the FAST primary source
        # for dynamic + health metrics since 2026-06-05. This Redfish exporter
        # is the slow remnant (10m Prometheus scrape) serving only what SNMP
        # cannot: indicator LED, NIC link-speed Mbps, SSD life %, machine/BIOS
        # info, per-DIMM / per-NIC inventory, PSU input-watts/capacity.
        # NOTE: HA Sofia's sensor.r730_fan_speed reads idrac_sensors_fan_speed
        # from THIS exporter directly, so `sensors` MUST stay enabled.
        # events (SEL empty on this box), processors (cpu count via SNMP),
        # manager, extra -> left disabled (default false) to trim the walk.
        all: false
        system: true
        sensors: true
        power: true
        storage: true
        network: true
        memory: true
    EOF
  }
}
# Single-replica Deployment of the iDRAC Redfish exporter. Its config file is
# mounted from the redfish-exporter-config ConfigMap.
resource "kubernetes_deployment" "idrac-redfish" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "idrac-redfish-exporter"

    annotations = {
      "reloader.stakater.com/search" = "true"
    }

    labels = {
      tier = var.tier
      app  = "idrac-redfish-exporter"
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "idrac-redfish-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "idrac-redfish-exporter"
        }
      }

      spec {
        priority_class_name = "tier-1-cluster"

        # Source of the exporter's config file (key "config.yml").
        volume {
          name = "redfish-exporter-config"
          config_map {
            name = "redfish-exporter-config"
          }
        }

        container {
          name = "redfish-exporter"
          # https://github.com/mrlhansen/idrac_exporter?tab=readme-ov-file
          # Patched v2.4.1 - restored missing idrac_power_supply_input_voltage metric
          # See: https://github.com/mrlhansen/idrac_exporter/issues/176
          image = "viktorbarzin/idrac-redfish-exporter:2.4.1-voltage-fix"

          port {
            container_port = 9610
          }

          # Only the single "config.yml" key is projected, as a file.
          volume_mount {
            name       = "redfish-exporter-config"
            sub_path   = "config.yml"
            mount_path = "/etc/prometheus/idrac.yml"
          }
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: the monitoring namespace is keel-enrolled (policy=patch), so Keel
    # owns the image tag and injects keel.sh annotations. They are ignored here
    # so Terraform stops reverting Keel on every plan (completes the cdb7d9a8
    # KEEL sweep that missed these exporters and was tripping drift-detection
    # exit 2 every run). 2026-05-31.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# Service in front of the exporter pods: listens on 9090 and forwards to the
# container's 9610.
resource "kubernetes_service" "idrac-redfish-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "idrac-redfish-exporter"

    labels = {
      app = "idrac-redfish-exporter"
    }

    # Annotation-based scrape discovery is intentionally left commented out.
    # annotations = {
    #   "prometheus.io/scrape" = "true"
    #   "prometheus.io/path"   = "/metrics"
    #   "prometheus.io/port"   = "9090"
    # }
  }

  spec {
    selector = {
      app = "idrac-redfish-exporter"
    }

    port {
      name        = "http"
      port        = 9090
      target_port = "9610"
    }
  }
}
# LAN-only ingress for the exporter Service (port 9090) on viktorbarzin.lan.
module "idrac-redfish-exporter-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "idrac-redfish-exporter"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  root_domain     = "viktorbarzin.lan"
  port            = 9090
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false

  # No SSO on this endpoint: HA Sofia and Prometheus scrape it
  # programmatically (no browser, no SSO cookie). With Authentik in front, the
  # HA Sofia REST sensor gets a 302 to authentik.viktorbarzin.me and parses
  # HTML instead of metrics. External access is already gated by the
  # allow_local_access_only middleware (192.168.0.0/16 + 10.0.0.0/8).
  allow_local_access_only = true
  # auth = "none": HA Sofia REST sensors poll programmatically without cookies; Authentik OIDC flow incompatible with automation.
  auth = "none"
}

View file

@ -0,0 +1,76 @@
---
cluster:
  name: default
destinations:
  - name: loki
    type: loki
    # Loki is deployed SingleBinary with the chart gateway disabled, so there
    # is no loki-gateway Service to push through. Send straight to the `loki`
    # Service on 3100, the same address the Grafana Loki datasource uses.
    url: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
# Cluster event collection is disabled; collector and namespace list are kept
# configured alongside the flag.
clusterEvents:
  enabled: false
  collector: alloy-logs
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - monitoring
    - ingress-nginx
    - vaultwarden
nodeLogs:
  enabled: false
# Pod logs are the only feature enabled in this file. They are gathered through
# the Kubernetes API by the alloy-logs collector.
podLogs:
  enabled: true
  gatherMethod: kubernetesApi
  collector: alloy-logs
  labelsToKeep:
    [
      "app_kubernetes_io_name",
      "container",
      "instance",
      "job",
      "level",
      "namespace",
      "service_name",
      "service_namespace",
      "deployment_environment",
      "deployment_environment_name",
    ]
  structuredMetadata:
    pod: pod # Set structured metadata "pod" from label "pod"
  # Same namespace list as clusterEvents.
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - monitoring
    - ingress-nginx
    - vaultwarden
# Collectors: only alloy-logs is enabled.
alloy-singleton:
  enabled: false
alloy-metrics:
  enabled: false
alloy-logs:
  enabled: true
  # Required when using the Kubernetes API to collect pod logs
  alloy:
    mounts:
      varlog: false
    clustering:
      enabled: true
alloy-profiles:
  enabled: false
alloy-receiver:
  enabled: false

View file

@ -0,0 +1,434 @@
# NFS server address passed in by the calling stack.
variable "nfs_server" {
  type = string
}
# Loki + Alloy re-enabled 2026-05-18 for wave 1 security audit logging
# (beads code-8ywc + code-146x). Original disable rationale was "operational
# overhead vs benefit after node2 incident" — re-evaluated because the wave 1
# detection layer (K8s audit, Vault audit, source-IP anomaly rules) needs Loki.
# Resource budget: SingleBinary mode, 2-4Gi memory, 50Gi proxmox-lvm PVC,
# 30-day retention, ruler enabled pointed at prometheus-alertmanager.
# Loki release from the Grafana chart repository, configured by loki.yaml.
resource "helm_release" "loki" {
  name       = "loki"
  chart      = "loki"
  repository = "https://grafana.github.io/helm-charts"

  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # No template variables are passed; the file is rendered as-is.
  values  = [templatefile("${path.module}/loki.yaml", {})]
  timeout = 600

  # The alert-rules ConfigMap has to exist before the release is installed.
  depends_on = [kubernetes_config_map.loki_alert_rules]
}
# Alloy release from the Grafana chart repository, installed after Loki.
resource "helm_release" "alloy" {
  name       = "alloy"
  chart      = "alloy"
  repository = "https://grafana.github.io/helm-charts"

  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  values = [file("${path.module}/alloy.yaml")]

  # atomic: a failed install/upgrade is rolled back rather than left half-applied.
  atomic = true
  # 5-pod DS rolling update + occasional runc-stuck-Terminating on k8s-master
  # needs more than the 300s default.
  timeout = 900

  depends_on = [helm_release.loki]
}
# Raises the inotify limits for Alloy pod log tailing (one watch per
# container). A privileged init container applies the sysctls, then a tiny
# pause container keeps the pod alive.
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "sysctl-inotify"

    labels = {
      app = "sysctl-inotify"
    }
  }

  spec {
    selector {
      match_labels = {
        app = "sysctl-inotify"
      }
    }

    template {
      metadata {
        labels = {
          app = "sysctl-inotify"
        }
      }

      spec {
        host_pid = true

        # Tolerate every taint.
        toleration {
          operator = "Exists"
        }

        init_container {
          name  = "sysctl"
          image = "busybox:1.37"

          security_context {
            privileged = true
          }

          command = [
            "sh", "-c",
            "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
          ]
        }

        # Placeholder container with a minimal, fixed resource footprint.
        container {
          name  = "pause"
          image = "registry.k8s.io/pause:3.10"

          resources {
            limits = {
              cpu    = "1m"
              memory = "4Mi"
            }
            requests = {
              cpu    = "1m"
              memory = "4Mi"
            }
          }
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: the monitoring namespace is keel-enrolled, so Keel owns the pause
    # image tag and injects keel.sh annotations. They are ignored here so
    # Terraform stops reverting Keel on every plan (completes the cdb7d9a8 KEEL
    # sweep that missed this daemonset and was tripping drift-detection exit 2
    # every run). 2026-05-31.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
      metadata[0].labels["tier"], # tier stamped live by tier-labeling; TF doesn't declare it here
    ]
  }
}
# resource "helm_release" "k8s-monitoring" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "k8s-monitoring"
# repository = "https://grafana.github.io/helm-charts"
# chart = "k8s-monitoring"
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
# atomic = true
# }
resource "kubernetes_config_map" "loki_alert_rules" {
metadata {
name = "loki-alert-rules"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"rules.yaml" = yamlencode({
groups = [
{
name = "Node Health"
rules = [
{
alert = "KernelOOMKiller"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOM killer active on {{ $labels.node }}"
}
},
{
alert = "KernelPanic"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Kernel panic on {{ $labels.node }}"
}
},
{
alert = "KernelHungTask"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
for = "0m"
labels = {
severity = "warning"
}
annotations = {
summary = "Hung task detected on {{ $labels.node }}"
}
},
{
alert = "KernelSoftLockup"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Soft lockup on {{ $labels.node }}"
}
},
{
alert = "ContainerdDown"
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "containerd service unhealthy on {{ $labels.node }}"
}
},
]
},
{
# Wave 1 security alerts (beads code-8ywc). Routed via Loki ruler
# prometheus-alertmanager #security Slack receiver. Allowlist CIDRs:
# 10.0.20.0/22, 192.168.1.0/24, K8s pod CIDR 10.10.0.0/16, K8s service
# CIDR 10.96.0.0/12. Identity allowlist: me@viktorbarzin.me only.
# NOTE: K1 (cluster-admin grant) intentionally skipped.
name = "Security Wave 1"
rules = [
# V1: Root token created (Vault audit, vault-tail sidecar stream)
{
alert = "VaultRootTokenCreated"
expr = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | request_path=\"auth/token/create\" |~ \"\\\"policies\\\":\\\\[\\\"root\\\"\\\\]\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Vault root token created"
description = "A token with policies=[root] was issued via auth/token/create. Verify this is a planned bootstrap or break-glass; otherwise treat as critical compromise."
runbook = "docs/runbooks/security-incident.md#v1-root-token-created"
}
},
# V2: Audit device disabled/modified
{
alert = "VaultAuditDeviceModified"
expr = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | request_path=~\"sys/audit/.+\" | operation=~\"(create|delete|update)\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Vault audit device modified — attacker may be silencing visibility"
runbook = "docs/runbooks/security-incident.md#v2-audit-device-disabledmodified"
}
},
# V3: Seal status changed
{
alert = "VaultSealChanged"
expr = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | request_path=\"sys/seal\" | operation=\"update\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Vault seal status changed via API — confirm planned operation"
runbook = "docs/runbooks/security-incident.md#v3-seal-status-changed"
}
},
# V4: Policy modified
{
alert = "VaultPolicyModified"
expr = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | request_path=~\"sys/policies/acl/.+\" | operation=~\"(create|update|delete)\" [5m])) > 0"
for = "0m"
labels = { severity = "warning", lane = "security" }
annotations = {
summary = "Vault policy modified — verify Terraform-driven change"
runbook = "docs/runbooks/security-incident.md#v4-policy-modified"
}
},
# V5: Auth failure spike
{
alert = "VaultAuthFailureSpike"
expr = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | type=\"response\" |~ \"\\\"error\\\":\\\"permission denied\\\"\" [1m])) > 10"
for = "1m"
labels = { severity = "warning", lane = "security" }
annotations = {
summary = "Vault permission-denied spike >10/min — possible brute force or CI rotation glitch"
runbook = "docs/runbooks/security-incident.md#v5-auth-failure-spike"
}
},
# V7: Viktor identity from non-allowlist source IP
# XFF trust enabled, so request.remote_address is the real client IP.
# Allowlist regex covers: 10.0.20-23.x, 192.168.1.x, pod CIDR 10.10.x.x,
# service CIDR 10.96.0.0/12 (second octet 96-111 and nothing above),
# Headscale tailnet 100.64-127.x.x.
# The service-CIDR alternation is (9[6-9]|10[0-9]|11[01]); the earlier
# 1[01][0-9] form also let 10.112-10.119 through.
{
  alert = "VaultViktorFromUnexpectedIP"
  expr  = "sum(count_over_time({namespace=\"vault\",container=\"audit-tail\"} | json | auth_metadata_username=\"me@viktorbarzin.me\" | request_remote_address!~\"^(10\\\\.0\\\\.2[0-3]\\\\.|192\\\\.168\\\\.1\\\\.|10\\\\.10\\\\.|10\\\\.(9[6-9]|10[0-9]|11[01])\\\\.|100\\\\.(6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])\\\\.).*\" [5m])) > 0"
  for   = "0m"
  labels = { severity = "critical", lane = "security" }
  annotations = {
    summary = "Vault auth as me@viktorbarzin.me from non-allowlist source IP — possible stolen OIDC token"
    runbook = "docs/runbooks/security-incident.md#v7-viktors-vault-identity-from-unexpected-source-ip"
  }
},
# K2: ServiceAccount token used from outside cluster.
# Allowlist = LAN (10.0.20-23.x, 192.168.1.x) + pod CIDR 10.10.x.x + service
# CIDR 10.96.0.0/12 (second octet 96-111) + Headscale tailnet 100.64-127.x.x.
# Anything else = likely stolen SA token used externally.
# The service-CIDR alternation is (9[6-9]|10[0-9]|11[01]); the earlier
# 1[01][0-9] form also let 10.112-10.119 through.
{
  alert = "K8sSATokenFromUnexpectedIP"
  expr  = "sum(count_over_time({job=\"kubernetes-audit\"} | json | user_username=~\"system:serviceaccount:.+\" | sourceIPs_0!~\"^(10\\\\.0\\\\.2[0-3]\\\\.|192\\\\.168\\\\.1\\\\.|10\\\\.10\\\\.|10\\\\.(9[6-9]|10[0-9]|11[01])\\\\.|100\\\\.(6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])\\\\.).*\" [5m])) > 0"
  for   = "0m"
  labels = { severity = "critical", lane = "security" }
  annotations = {
    summary = "K8s ServiceAccount token used from non-allowlist source IP — possible stolen SA token"
    runbook = "docs/runbooks/security-incident.md#k2-serviceaccount-token-used-from-outside-cluster"
  }
},
# K3: Secret read in sensitive namespace by unexpected actor.
# Allowlisted readers: ESO controller, sealed-secrets controller,
# Vault SA, me@viktorbarzin.me. Anyone else = alert.
{
alert = "K8sSensitiveSecretReadByUnexpectedActor"
expr = "sum(count_over_time({job=\"kubernetes-audit\"} | json | verb=~\"get|list\" | objectRef_resource=\"secrets\" | objectRef_namespace=~\"vault|sealed-secrets|external-secrets\" | user_username!~\"^(me@viktorbarzin\\\\.me|system:serviceaccount:external-secrets:.+|system:serviceaccount:sealed-secrets:.+|system:serviceaccount:vault:.+)$\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Sensitive Secret read in vault/sealed-secrets/external-secrets by non-allowlisted actor"
runbook = "docs/runbooks/security-incident.md#k3-secret-read-in-sensitive-namespace-by-unexpected-actor"
}
},
# K4: Exec into pod in sensitive namespace.
{
alert = "K8sExecIntoSensitiveNamespace"
expr = "sum(count_over_time({job=\"kubernetes-audit\"} | json | verb=\"create\" | objectRef_resource=\"pods\" | objectRef_subresource=\"exec\" | objectRef_namespace=~\"vault|kube-system|dbaas|cnpg-system\" | user_username!=\"me@viktorbarzin.me\" [5m])) > 0"
for = "0m"
labels = { severity = "warning", lane = "security" }
annotations = {
summary = "kubectl exec into sensitive namespace (vault/kube-system/dbaas/cnpg-system) by non-Viktor actor"
runbook = "docs/runbooks/security-incident.md#k4-exec-into-sensitive-pod"
}
},
# K5: Mass delete of pods/secrets/configmaps in 60s by single actor.
{
alert = "K8sMassDelete"
expr = "sum by (user_username) (count_over_time({job=\"kubernetes-audit\"} | json | verb=\"delete\" | objectRef_resource=~\"pods|secrets|configmaps\" [1m])) > 5"
for = "1m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Mass delete (>5 Pod/Secret/ConfigMap in 60s) by {{ $labels.user_username }}"
runbook = "docs/runbooks/security-incident.md#k5-mass-delete"
}
},
# K6: Audit policy or audit-log path modified attacker silencing
# visibility. The audit policy file is /etc/kubernetes/policies/audit-policy.yaml
# on master; changes go via kubeadm reconfig. Detect via API access
# to apiserver kubeadm-config ConfigMap.
{
alert = "K8sAuditPolicyModified"
expr = "sum(count_over_time({job=\"kubernetes-audit\"} | json | verb=~\"update|patch\" | objectRef_resource=\"configmaps\" | objectRef_name=\"kubeadm-config\" | objectRef_namespace=\"kube-system\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "kubeadm-config ConfigMap modified — could be audit policy change"
runbook = "docs/runbooks/security-incident.md#k6-audit-policy-modified"
}
},
# K7: New ClusterRole created with verbs=* and resources=*.
# Allowlist excludes calico-system, kyverno, nvidia, etc. which legitimately
# create such ClusterRoles via Helm.
{
alert = "K8sClusterRoleWildcardCreated"
expr = "sum(count_over_time({job=\"kubernetes-audit\"} | json | verb=\"create\" | objectRef_resource=\"clusterroles\" |~ \"\\\"verbs\\\":\\\\[\\\"\\\\*\\\"\\\\]\" |~ \"\\\"resources\\\":\\\\[\\\"\\\\*\\\"\\\\]\" [5m])) > 0"
for = "0m"
labels = { severity = "warning", lane = "security" }
annotations = {
summary = "New ClusterRole with verbs=[*]+resources=[*] created — privilege escalation primitive"
runbook = "docs/runbooks/security-incident.md#k7-new-clusterrole-with-full-wildcards"
}
},
# K8: Anonymous binding granted catastrophic.
{
alert = "K8sAnonymousBindingGranted"
expr = "sum(count_over_time({job=\"kubernetes-audit\"} | json | verb=\"create\" | objectRef_resource=~\"rolebindings|clusterrolebindings\" |~ \"system:(anonymous|unauthenticated)\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "Binding granted to system:anonymous or system:unauthenticated — full cluster compromise risk"
runbook = "docs/runbooks/security-incident.md#k8-anonymous-binding"
}
},
# K9: Viktor's identity from non-allowlist source IP.
# Allowlist: 10.0.20-23.x, 192.168.1.x, pod CIDR 10.10.x.x, service CIDR
# 10.96.0.0/12 (second octet 96-111), Headscale tailnet 100.64-127.x.x.
# The service-CIDR alternation is (9[6-9]|10[0-9]|11[01]); the earlier
# 1[01][0-9] form also let 10.112-10.119 through.
{
  alert = "K8sViktorFromUnexpectedIP"
  expr  = "sum(count_over_time({job=\"kubernetes-audit\"} | json | user_username=\"me@viktorbarzin.me\" | sourceIPs_0!~\"^(10\\\\.0\\\\.2[0-3]\\\\.|192\\\\.168\\\\.1\\\\.|10\\\\.10\\\\.|10\\\\.(9[6-9]|10[0-9]|11[01])\\\\.|100\\\\.(6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])\\\\.).*\" [5m])) > 0"
  for   = "0m"
  labels = { severity = "critical", lane = "security" }
  annotations = {
    summary = "K8s API request as me@viktorbarzin.me from non-allowlist source IP — possible stolen kubeconfig/OIDC token"
    runbook = "docs/runbooks/security-incident.md#k9-viktors-identity-from-unexpected-source-ip"
  }
},
# S1: PVE sshd auth success from non-allowlist IP.
# Conditional on the pve-sshd promtail unit being live on PVE host
# (deployed via stacks/infra/scripts out of scope until W1.3 host
# piece lands). Rule is defined so it fires automatically once logs
# flow with job=sshd-pve.
{
alert = "PVEsshLoginFromUnexpectedIP"
expr = "sum(count_over_time({job=\"sshd-pve\"} |~ \"Accepted (publickey|password|keyboard-interactive)\" | regexp \"Accepted (?P<method>\\\\S+) for (?P<user>\\\\S+) from (?P<ip>\\\\S+) port\" | ip!~\"^(10\\\\.0\\\\.2[0-3]\\\\.|192\\\\.168\\\\.1\\\\.|100\\\\.(6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])\\\\.).*\" [5m])) > 0"
for = "0m"
labels = { severity = "critical", lane = "security" }
annotations = {
summary = "PVE sshd login from non-allowlist source IP — possible stolen SSH key"
runbook = "docs/runbooks/security-incident.md#s1-pve-sshd-auth-success-from-unexpected-ip"
}
},
]
},
{
# Matrix (tuwunel) open registration is ON, so notify on every new
# signup. tuwunel logs `... New user "@x:..." registered on this server`
# only on SUCCESS (the disabled-path logs "Rejecting ... registration is
# disabled"), so this matcher never false-fires on rejected attempts.
# lane=security routes it to the existing #security Slack receiver.
name = "Matrix"
rules = [
{
alert = "MatrixNewUserRegistered"
expr = "sum(count_over_time({namespace=\"matrix\",container=\"matrix\"} |= \"registered on this server\" [10m])) > 0"
for = "0m"
labels = { severity = "info", lane = "security" }
annotations = {
summary = "New user registered on Matrix (tuwunel) — open registration is ON"
description = "A new account was created on matrix.viktorbarzin.me. See who with: kubectl -n matrix logs deploy/matrix | grep 'New user'. If unexpected/abuse, revert to token-gated registration in stacks/matrix."
}
},
]
}
]
})
}
}
# Grafana datasource definition for Loki, shipped as a labelled ConfigMap.
resource "kubernetes_config_map" "grafana_loki_datasource" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "grafana-loki-datasource"

    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    "loki-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [
        {
          name      = "Loki"
          type      = "loki"
          url       = "http://loki.monitoring.svc.cluster.local:3100"
          access    = "proxy"
          isDefault = false
        },
      ]
    })
  }
}

View file

@ -0,0 +1,115 @@
# Loki server configuration: single replica, TSDB index and chunks on the
# local filesystem, no multi-tenancy.
loki:
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: "2025-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_idle_period: 12h
    max_chunk_age: 24h
    chunk_retain_period: 1m
    # 1572864 bytes = 1.5 MiB
    chunk_target_size: 1572864
    wal:
      # Matches the mountPath of the `wal` volume under
      # singleBinary.extraVolumeMounts.
      dir: /loki-wal
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    # 720h = 30 days
    retention_period: 720h
  compactor:
    retention_enabled: true
    working_directory: /var/loki/compactor
    compaction_interval: 1h
    delete_request_store: filesystem
  # NOTE(review): the grafana/loki chart documents ruler settings under
  # `loki.rulerConfig`. Confirm this `loki.ruler` block is actually rendered
  # into the Loki config, otherwise alertmanager_url is never applied.
  ruler:
    enable_api: true
    storage:
      type: local
      local:
        # The loki-alert-rules ConfigMap is mounted at /var/loki/rules/fake;
        # `fake` is presumably the tenant ID used while auth_enabled is false.
        directory: /var/loki/rules
    alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
    ring:
      kvstore:
        store: inmemory
    rule_path: /var/loki/scratch
  storage:
    type: "filesystem"
  auth_enabled: false
# No bundled MinIO: storage is the local filesystem.
minio:
  enabled: false
deploymentMode: SingleBinary
singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 50Gi
    storageClass: "proxmox-lvm"
  extraVolumes:
    # NOTE(review): the WAL lives on a memory-backed emptyDir, so it
    # presumably does not survive a pod restart. Confirm that trade-off is
    # intended given chunk_idle_period is 12h.
    - name: wal
      emptyDir:
        medium: Memory
        sizeLimit: 2Gi
    # Alert rules come from the loki-alert-rules ConfigMap.
    - name: rules
      configMap:
        name: loki-alert-rules
  extraVolumeMounts:
    - name: wal
      mountPath: /loki-wal
    - name: rules
      mountPath: /var/loki/rules/fake
  resources:
    requests:
      cpu: 250m
      # Right-sized 2026-06-04 (3Gi->1Gi): VPA upperBound 364Mi, actual ~315Mi.
      # 1Gi request is ~3x the observed ceiling; the 4Gi limit (Burstable)
      # keeps headroom for query spikes. Frees 2Gi of monitoring-quota
      # requests.memory, taking it 89%->~79% (under the >80% WARN). NOTE: the
      # alloy DaemonSet (562Mi/node) grows with node count, so this can creep
      # back over 80% as the cluster expands — bump the quota then.
      memory: 1Gi
    limits:
      memory: 4Gi
# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
# Disable optional components for single binary mode
# (with the gateway off, clients talk to the `loki` Service on 3100 directly).
gateway:
  enabled: false
chunksCache:
  enabled: false
resultsCache:
  enabled: false

View file

@ -0,0 +1,23 @@
# Loki write/push endpoint for EXTERNAL hosts (currently rpi-sofia's promtail).
#
# Loki runs SingleBinary with the gateway disabled and auth_enabled=false, so it
# is ClusterIP-only (svc "loki":3100) and unreachable from off-cluster. An
# external log shipper like the Sofia Raspberry Pi cannot POST to
# /loki/api/v1/push without this ingress.
#
# auth = "none": promtail ships logs programmatically (no browser, no Authentik
# SSO cookie dance). The allow_local_access_only middleware (192.168.0.0/16 +
# 10.0.0.0/8) gates the endpoint to LAN/VPN only the correct model for a
# LAN-only Pi, mirroring the idrac-redfish-exporter ingress in this module.
module "loki-write-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "loki"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  root_domain     = "viktorbarzin.lan"
  port            = 3100
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false

  # Reachable from LAN/VPN addresses only; this is the sole gate, since SSO is off.
  allow_local_access_only = true
  # auth = "none": rpi-sofia's promtail pushes logs programmatically (no browser, no Authentik SSO cookie); gated to LAN/VPN by allow_local_access_only below.
  auth = "none"
}

View file

@ -0,0 +1,587 @@
# -----------------------------------------------------------------------------
# Module inputs.
# Every input is typed as string, and every credential is marked sensitive so
# it is redacted from plan/apply output. The calling stack feeds the secrets
# from Vault KV reads.
# -----------------------------------------------------------------------------
variable "tls_secret_name" {
  type = string
}
variable "alertmanager_account_password" {
  type      = string
  sensitive = true
}
variable "idrac_host" {
  type    = string
  default = "192.168.1.4"
}
variable "idrac_username" {
  type    = string
  default = "root"
}
# NOTE(review): the default is the same value used for the `default` host
# entry in the exporter config. The calling stack overrides it from Vault, so
# consider dropping the default once all callers are confirmed to pass a value.
variable "idrac_password" {
  type      = string
  default   = "calvin"
  sensitive = true
}
# Slack webhook URLs embed a credential, hence sensitive.
variable "alertmanager_slack_api_url" {
  type      = string
  sensitive = true
}
variable "tiny_tuya_service_secret" {
  type      = string
  sensitive = true
}
variable "haos_api_token" {
  type      = string
  sensitive = true
}
variable "pve_password" {
  type      = string
  sensitive = true
}
variable "grafana_admin_password" {
  type      = string
  sensitive = true
}
variable "kube_config_path" {
  type      = string
  sensitive = true
}
variable "tier" { type = string }
variable "mysql_host" { type = string }
variable "registry_user" {
  type      = string
  sensitive = true
}
variable "registry_password" {
  type      = string
  sensitive = true
}
variable "forgejo_pull_token" {
  type        = string
  sensitive   = true
  description = "PAT for the cluster-puller user, used by the Forgejo registry integrity probe."
}
# The `monitoring` namespace that the other resources in this module reference.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"

    labels = {
      tier                               = var.tier
      "istio-injection"                  = "disabled"
      "keel.sh/enrolled"                 = "true"
      "resource-governance/custom-quota" = "true"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# Sets up the TLS secret named by var.tls_secret_name in the monitoring namespace.
module "tls_secret" {
  source = "../../../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
}
# Terraform gets angry with the 30k-line values file :/ Use Ansible until solved.
# resource "helm_release" "ups_prometheus_snmp_exporter" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "ups_prometheus_exporter"
# repository = "https://prometheus-community.github.io/helm-charts"
# chart = "prometheus-snmp-exporter"
# values = [file("${path.module}/ups_snmp_values.yaml")]
# }
# Dead-man check for Prometheus. Every 30 minutes it curls the in-cluster
# prometheus-server Service and, if that fails, posts "Prometheus is down!" to
# the webhook.
#
# The probe uses `curl -f` so an HTTP 4xx/5xx answer counts as "down" (plain
# curl exits 0 on any HTTP response), and `--max-time` so a hung connection
# cannot stall the Job.
resource "kubernetes_cron_job_v1" "monitor_prom" {
  metadata {
    # NOTE(review): no namespace is set here, unlike the other resources in
    # this module, so this CronJob is presumably created in the provider's
    # default namespace. Confirm that is intentional before moving it.
    name = "monitor-prometheus"
  }
  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "*/30 * * * *"
    job_template {
      metadata {
      }
      spec {
        template {
          metadata {
          }
          spec {
            container {
              name  = "monitor-prometheus"
              image = "alpine"
              # `A && B || C`: the webhook (C) fires when either the install
              # (A) or the probe (B) fails. -f on the webhook call makes a
              # rejected notification show up as a failed Job.
              command = ["/bin/sh", "-c", "apk add --update curl && curl -sf -o /dev/null --connect-timeout 2 --max-time 10 prometheus-server.monitoring.svc.cluster.local || curl -f --max-time 10 https://webhook.viktorbarzin.me/fb/message-viktor -d 'Prometheus is down!'"]
            }
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
# -----------------------------------------------------------------------------
# DNS Anomaly Monitor: query the Technitium stats API, derive anomaly metrics,
# and push them to the Pushgateway. Runs every 15 min.
#
# Metrics pushed (job=dns-anomaly-monitor): total / serverFailure / nxDomain /
# blocked query counts for the last hour, a count of DGA-looking top domains,
# a rolling average of total queries, and the check timestamp.
#
# The rolling average is seeded from the value this job previously pushed to
# the Pushgateway. Each run is a fresh pod, so a file under /tmp can never
# carry state from one run to the next.
# -----------------------------------------------------------------------------
resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" {
  metadata {
    name      = "dns-anomaly-monitor"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Replace"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/15 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 2
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            container {
              name = "dns-anomaly-monitor"
              # Pinned to the same Alpine release as the Forgejo integrity probe.
              image = "docker.io/library/alpine:3.20"
              # Keep every script line at the same base indentation: the inner
              # METRICS heredoc terminator must land at column 0 once <<-EOT
              # strips the common indent.
              command = ["/bin/sh", "-c", <<-EOT
                set -euo pipefail
                apk add --no-cache curl jq
                TECHNITIUM_URL="http://technitium-web.technitium.svc.cluster.local:5380"
                PUSHGATEWAY_URL="http://prometheus-prometheus-pushgateway.monitoring:9091"
                # Get main stats
                # NOTE(review): the API token is empty. Confirm Technitium answers
                # unauthenticated stats queries, otherwise every value below falls
                # back to 0 through the jq "// 0" defaults.
                STATS=$(curl -sf "$TECHNITIUM_URL/api/stats/get?token=&type=LastHour" 2>&1) || {
                  echo "ERROR: Failed to query Technitium stats API"
                  exit 1
                }
                # Parse key metrics
                TOTAL_QUERIES=$(echo "$STATS" | jq -r '.response.stats.totalQueries // 0')
                SERVER_FAILURE=$(echo "$STATS" | jq -r '.response.stats.serverFailure // 0')
                NX_DOMAIN=$(echo "$STATS" | jq -r '.response.stats.nxDomain // 0')
                BLOCKED=$(echo "$STATS" | jq -r '.response.stats.blocked // 0')
                NO_ERROR=$(echo "$STATS" | jq -r '.response.stats.noError // 0')
                echo "DNS Stats (last hour): total=$TOTAL_QUERIES noError=$NO_ERROR nxDomain=$NX_DOMAIN serverFailure=$SERVER_FAILURE blocked=$BLOCKED"
                # Get top domains for DGA/tunneling detection (best effort)
                TOP_DOMAINS=$(curl -sf "$TECHNITIUM_URL/api/stats/getTopDomains?token=&type=LastHour&limit=20" 2>&1) || true
                # Check for high-entropy domains (potential DGA)
                DGA_SUSPECT=0
                if [ -n "$TOP_DOMAINS" ]; then
                  # Simple heuristic: domains with many consonant clusters or very long labels.
                  # A malformed response must not abort the run before the push.
                  DGA_SUSPECT=$(echo "$TOP_DOMAINS" | jq -r '[.response.topDomains[]?.name // empty | select(length > 30 or test("[bcdfghjklmnpqrstvwxyz]{5,}"))] | length' 2>/dev/null) || DGA_SUSPECT=0
                fi
                # Rolling average for spike detection: blend the current total with the
                # average this job pushed last time (read back from the Pushgateway).
                # The Prometheus alert rule compares current vs stored average.
                PREV_AVG=$(curl -sf --max-time 10 "$PUSHGATEWAY_URL/metrics" | awk 'index($1, "dns_anomaly_avg_queries{") == 1 { v = $2 } END { if (v != "") printf "%d", v }' || true)
                case "$PREV_AVG" in
                  ''|*[!0-9]*) NEW_AVG=$TOTAL_QUERIES ;;
                  *) NEW_AVG=$(( (PREV_AVG + TOTAL_QUERIES) / 2 )) ;;
                esac
                # Push metrics to Pushgateway
                cat <<METRICS | curl -sf --data-binary @- "$PUSHGATEWAY_URL/metrics/job/dns-anomaly-monitor"
                # HELP dns_anomaly_total_queries Total DNS queries in last hour
                # TYPE dns_anomaly_total_queries gauge
                dns_anomaly_total_queries $TOTAL_QUERIES
                # HELP dns_anomaly_server_failure DNS server failures in last hour
                # TYPE dns_anomaly_server_failure gauge
                dns_anomaly_server_failure $SERVER_FAILURE
                # HELP dns_anomaly_nx_domain NX domain responses in last hour
                # TYPE dns_anomaly_nx_domain gauge
                dns_anomaly_nx_domain $NX_DOMAIN
                # HELP dns_anomaly_blocked Blocked queries in last hour
                # TYPE dns_anomaly_blocked gauge
                dns_anomaly_blocked $BLOCKED
                # HELP dns_anomaly_dga_suspects Domains with DGA-like characteristics
                # TYPE dns_anomaly_dga_suspects gauge
                dns_anomaly_dga_suspects $DGA_SUSPECT
                # HELP dns_anomaly_avg_queries Rolling average DNS queries
                # TYPE dns_anomaly_avg_queries gauge
                dns_anomaly_avg_queries $NEW_AVG
                # HELP dns_anomaly_check_timestamp Last successful check timestamp
                # TYPE dns_anomaly_check_timestamp gauge
                dns_anomaly_check_timestamp $(date +%s)
                METRICS
                echo "DNS anomaly check complete (DGA suspects: $DGA_SUSPECT)"
              EOT
              ]
              resources {
                requests = {
                  memory = "32Mi"
                  cpu    = "10m"
                }
                limits = {
                  memory = "64Mi"
                }
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
# -----------------------------------------------------------------------------
# Phase 4 of forgejo-registry-consolidation, 2026-05-07: registry-private
# decommissioned. The integrity probe that used to be defined here caught the
# orphan-index failure mode in `registry:2.8.3` (post-mortem 2026-04-19). With
# that engine retired, the probe is replaced by `forgejo_integrity_probe` below.
#
# Resource definitions stripped wholesale: terragrunt apply destroys the
# in-cluster CronJob + Secret on the next run.
# See: docs/post-mortems/2026-04-19-registry-orphan-index.md
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# Forgejo registry integrity probe: same algorithm as the retired
# registry-integrity-probe, but targets the Forgejo OCI registry instead of
# registry-private. It ran in parallel with that probe during the dual-push
# bake; Phase 4 decommissioned registry-private and removed the old CronJob,
# so only this one remains.
#
# Auth: HTTP Basic with cluster-puller PAT (read:package scope is enough to
# walk catalog + manifests). Reaches Forgejo via the in-cluster service so we
# don't hairpin out through Traefik for every probe run.
# -----------------------------------------------------------------------------
# Basic-auth credentials for the Forgejo integrity probe, exposed to the
# CronJob as the REG_USER / REG_PASS environment variables.
resource "kubernetes_secret" "forgejo_probe_credentials" {
  type = "Opaque"

  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "forgejo-probe-credentials"
  }

  data = {
    REG_PASS = var.forgejo_pull_token
    REG_USER = "cluster-puller"
  }
}
# Forgejo registry integrity probe.
#
# Every 15 minutes (never concurrently) an Alpine pod:
#   1. lists up to 1000 repositories from /v2/_catalog,
#   2. takes the last TAGS_PER_REPO entries of each repository's tag list,
#   3. fetches every such manifest and, for image indexes / manifest lists,
#      HEADs every child digest,
#   4. pushes registry_manifest_integrity_* gauges to the Pushgateway.
# The job exits 1 when the catalog is unreachable/empty or any check failed.
resource "kubernetes_cron_job_v1" "forgejo_integrity_probe" {
  metadata {
    name      = "forgejo-integrity-probe"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/15 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 600
        template {
          metadata {}
          spec {
            container {
              name  = "forgejo-integrity-probe"
              image = "docker.io/library/alpine:3.20"
              # Registry credentials come from the probe's own Secret.
              env {
                name = "REG_USER"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.forgejo_probe_credentials.metadata[0].name
                    key  = "REG_USER"
                  }
                }
              }
              env {
                name = "REG_PASS"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.forgejo_probe_credentials.metadata[0].name
                    key  = "REG_PASS"
                  }
                }
              }
              # In-cluster service address the probe talks to.
              env {
                name  = "REGISTRY_HOST"
                value = "forgejo.forgejo.svc.cluster.local"
              }
              env {
                name  = "REGISTRY_SCHEME"
                value = "http"
              }
              # Value of the `instance` label on every pushed metric.
              env {
                name  = "REGISTRY_INSTANCE"
                value = "forgejo.viktorbarzin.me"
              }
              env {
                name  = "PUSHGATEWAY"
                value = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/forgejo-integrity-probe"
              }
              # How many entries from the end of each tag list are probed.
              env {
                name  = "TAGS_PER_REPO"
                value = "5"
              }
              # NOTE: bodies and terminators of the inner METRICS heredocs must
              # stay at the script's base indentation so that, after Terraform
              # strips the common indent, the shell sees them at column 0.
              command = ["/bin/sh", "-c", <<-EOT
                set -eu
                apk add --no-cache curl jq >/dev/null
                REG="$REGISTRY_HOST"
                SCHEME="$${REGISTRY_SCHEME:-https}"
                INSTANCE="$REGISTRY_INSTANCE"
                AUTH="$REG_USER:$REG_PASS"
                ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json'
                # Best-effort push of the metrics read from stdin.
                push() {
                  curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true
                }
                CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "$SCHEME://$REG/v2/_catalog?n=1000" || echo "")
                REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "")
                if [ -z "$REPOS" ]; then
                  echo "ERROR: empty catalog or auth failure — cannot probe"
                  NOW=$(date +%s)
                  push <<METRICS
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 0
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS
                  exit 1
                fi
                FAIL=0
                REPOS_N=0
                TAGS_N=0
                INDEXES_N=0
                printf '%s\n' $REPOS > /tmp/repos.txt
                while IFS= read -r repo; do
                  [ -z "$repo" ] && continue
                  REPOS_N=$((REPOS_N + 1))
                  TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "$SCHEME://$REG/v2/$repo/tags/list" || echo "")
                  echo "$TAGS_JSON" | jq -r '.tags[]?' 2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true
                  while IFS= read -r tag; do
                    [ -z "$tag" ] && continue
                    TAGS_N=$((TAGS_N + 1))
                    # "|| true": under "set -e" a curl transport error (timeout,
                    # reset) would otherwise abort the whole probe before any
                    # metric is pushed. curl still prints 000 as the status, so
                    # the request is counted as a failure below.
                    HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \
                      -H "Accept: $ACCEPT" --max-time 15 \
                      "$SCHEME://$REG/v2/$repo/manifests/$tag") || true
                    if [ "$HTTP" != "200" ]; then
                      echo "FAIL: $repo:$tag manifest HTTP $HTTP"
                      FAIL=$((FAIL + 1))
                      continue
                    fi
                    MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "")
                    if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then
                      INDEXES_N=$((INDEXES_N + 1))
                      jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true
                      while IFS= read -r d; do
                        [ -z "$d" ] && continue
                        # Same "|| true" guard as above for the child HEAD request.
                        CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \
                          -H "Accept: $ACCEPT" --max-time 10 -I \
                          "$SCHEME://$REG/v2/$repo/manifests/$d") || true
                        if [ "$CH" != "200" ]; then
                          echo "FAIL: $repo:$tag index child $d HTTP $CH"
                          FAIL=$((FAIL + 1))
                        fi
                      done < /tmp/children.txt
                    fi
                  done < /tmp/tags.txt
                done < /tmp/repos.txt
                NOW=$(date +%s)
                push <<METRICS
                # TYPE registry_manifest_integrity_failures gauge
                registry_manifest_integrity_failures{instance="$INSTANCE"} $FAIL
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 1
                # TYPE registry_manifest_integrity_repos_checked gauge
                registry_manifest_integrity_repos_checked{instance="$INSTANCE"} $REPOS_N
                # TYPE registry_manifest_integrity_tags_checked gauge
                registry_manifest_integrity_tags_checked{instance="$INSTANCE"} $TAGS_N
                # TYPE registry_manifest_integrity_indexes_checked gauge
                registry_manifest_integrity_indexes_checked{instance="$INSTANCE"} $INDEXES_N
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS
                echo "Probe complete: $FAIL failures across $REPOS_N repos / $TAGS_N tags / $INDEXES_N indexes"
                if [ "$FAIL" -gt 0 ]; then exit 1; fi
              EOT
              ]
              resources {
                requests = {
                  cpu    = "10m"
                  memory = "48Mi"
                }
                limits = {
                  memory = "96Mi"
                }
              }
            }
            restart_policy = "OnFailure"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
# NodePort Service in front of the Pushgateway so that the PVE host can push
# its LVM snapshot metrics to node port 30091.
resource "kubernetes_service" "pushgateway_nodeport" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "pushgateway-nodeport"
  }

  spec {
    type = "NodePort"

    # Pod labels this Service routes to.
    selector = {
      "app.kubernetes.io/instance" = "prometheus"
      "app.kubernetes.io/name"     = "prometheus-pushgateway"
    }

    port {
      protocol    = "TCP"
      node_port   = 30091
      port        = 9091
      target_port = 9091
    }
  }
}
# Traefik Middleware that permanently redirects every request (regex ".*")
# to the HetrixTools report page for the main status host.
resource "kubernetes_manifest" "status_redirect_middleware" {
  manifest = {
    kind       = "Middleware"
    apiVersion = "traefik.io/v1alpha1"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "status-redirect"
    }

    spec = {
      redirectRegex = {
        permanent   = true
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/38981b548b5d38b052aca8d01285a3f3/"
      }
    }
  }
}
# Traefik IngressRoute for status.viktorbarzin.me on the websecure entry
# point. The route applies the "status-redirect" middleware and uses
# Traefik's noop@internal service as its backend.
resource "kubernetes_manifest" "status_ingress_route" {
  manifest = {
    kind       = "IngressRoute"
    apiVersion = "traefik.io/v1alpha1"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "hetrix-redirect-ingress"
    }

    spec = {
      entryPoints = ["websecure"]

      tls = {
        secretName = var.tls_secret_name
      }

      routes = [
        {
          kind  = "Rule"
          match = "Host(`status.viktorbarzin.me`)"

          middlewares = [
            {
              namespace = kubernetes_namespace.monitoring.metadata[0].name
              name      = "status-redirect"
            },
          ]

          services = [
            {
              name = "noop@internal"
              kind = "TraefikService"
            },
          ]
        },
      ]
    }
  }
}
# Traefik Middleware that permanently redirects every request (regex ".*")
# to the HetrixTools report page for the yotovski status host.
resource "kubernetes_manifest" "yotovski_redirect_middleware" {
  manifest = {
    kind       = "Middleware"
    apiVersion = "traefik.io/v1alpha1"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "yotovski-redirect"
    }

    spec = {
      redirectRegex = {
        permanent   = true
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/2ba9d7a5e017794db0fd91f0115a8b3b/"
      }
    }
  }
}
# Traefik IngressRoute for yotovski-status.viktorbarzin.me on the websecure
# entry point. The route applies the "yotovski-redirect" middleware and uses
# Traefik's noop@internal service as its backend.
resource "kubernetes_manifest" "yotovski_ingress_route" {
  manifest = {
    kind       = "IngressRoute"
    apiVersion = "traefik.io/v1alpha1"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "hetrix-yotovski-redirect-ingress"
    }

    spec = {
      entryPoints = ["websecure"]

      tls = {
        secretName = var.tls_secret_name
      }

      routes = [
        {
          kind  = "Rule"
          match = "Host(`yotovski-status.viktorbarzin.me`)"

          middlewares = [
            {
              namespace = kubernetes_namespace.monitoring.metadata[0].name
              name      = "yotovski-redirect"
            },
          ]

          services = [
            {
              name = "noop@internal"
              kind = "TraefikService"
            },
          ]
        },
      ]
    }
  }
}
# Custom ResourceQuota for monitoring — larger than the default 1-cluster tier
# quota because monitoring runs 29+ pods (Prometheus, Grafana, Loki, Alloy,
# exporters, etc.).
# Headroom: cluster grew from 5 -> 7 workers (k8s-node5/6 added 2026-05-26);
# per-pod DaemonSets (alloy 562Mi, node-exporter 100Mi, loki-canary 128Mi,
# sysctl-inotify 4Mi) now consume ~+2Gi vs. pre-expansion. 20Gi gives ~3-4Gi
# safe headroom.
resource "kubernetes_resource_quota" "monitoring" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "monitoring-quota"
  }

  spec {
    hard = {
      pods              = "100"
      "limits.memory"   = "64Gi"
      "requests.memory" = "20Gi"
      "requests.cpu"    = "16"
    }
  }
}

View file

@ -0,0 +1,89 @@
# Data volume for the Prometheus server: 200Gi on the proxmox-lvm storage
# class, grown automatically via the resize.topolvm.io annotations.
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "prometheus-data-proxmox"

    annotations = {
      # threshold = free-space % below which the autoresizer expands.
      # 10% means "expand when 90% used" (the conventional knob).
      # WAS 90% — that's "expand when 10% used", which would
      # autoresize this volume from 200Gi -> 500Gi in 6 cycles.
      "resize.topolvm.io/threshold"     = "10%"
      "resize.topolvm.io/increase"      = "10%"
      "resize.topolvm.io/storage_limit" = "500Gi"
    }
  }

  spec {
    storage_class_name = "proxmox-lvm"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "200Gi"
      }
    }
  }

  lifecycle {
    # The autoresizer grows requests.storage up to storage_limit, and PVCs
    # cannot shrink. Without this ignore_changes every TF apply tries to
    # revert the live size back to 200Gi, hits the K8s shrink-forbidden rule,
    # and forces a destroy+recreate that leaves the PVC stuck in Terminating
    # until the pod releases it.
    # (Root cause of the prometheus-data-proxmox +
    # technitium-primary-config-encrypted Terminating-but-in-use incident on
    # 2026-05-10.)
    ignore_changes = [spec[0].resources[0].requests]
  }
}
# NFS volume (via the shared nfs_volume module) pointing at the
# /srv/nfs/prometheus-backup export on 192.168.1.127.
module "nfs_prometheus_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  namespace = kubernetes_namespace.monitoring.metadata[0].name
  name      = "monitoring-prometheus-backup-host"

  nfs_path   = "/srv/nfs/prometheus-backup"
  nfs_server = "192.168.1.127"
}
# prometheus-community/prometheus Helm release. Chart values are rendered
# from prometheus_chart_values.tpl with the secrets and targets passed below.
resource "helm_release" "prometheus" {
  name             = "prometheus"
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus"
  # version = "15.0.2"
  version = "25.8.2"

  timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow

  # force_update disabled 2026-04-23: caused Helm to try replacing the bound
  # pushgateway PVC (added in rev 188, see commit e51c104), which is immutable.
  # Re-enable temporarily only when a StatefulSet volumeClaimTemplate change
  # needs --force.
  force_update = false

  values = [
    templatefile("${path.module}/prometheus_chart_values.tpl", {
      alertmanager_mail_pass     = var.alertmanager_account_password
      alertmanager_slack_api_url = var.alertmanager_slack_api_url
      tuya_api_key               = var.tiny_tuya_service_secret
      haos_api_token             = var.haos_api_token
      authentik_walloff_targets  = local.authentik_walloff_targets
    })
  ]
}
# Local-only Prometheus query-API ingress for ha-sofia REST sensors (added
# 2026-06-05). ha-sofia (external HAOS) reads R730 iDRAC SNMP metrics
# (r730_idrac_coolingDeviceReading, etc.) by querying Prometheus directly via
# this host instead of hitting the slow on-demand Redfish exporter.
#
# It uses a distinct host (prometheus-query.viktorbarzin.lan) and resource
# name so it does not collide with the chart-created `prometheus-server`
# ingress (prometheus.viktorbarzin.me).
#
# Path-scoped to /api/v1/query so ONLY the read-only instant-query endpoint is
# reachable on the LAN — not the UI, admin, or federation endpoints.
module "prometheus-query-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  namespace    = kubernetes_namespace.monitoring.metadata[0].name
  name         = "prometheus-query"
  service_name = "prometheus-server"
  port         = 80
  ingress_path = ["/api/v1/query"]

  root_domain     = "viktorbarzin.lan"
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false

  # auth = "none": the ha-sofia REST sensor queries the Prometheus HTTP API
  # programmatically (no browser, no SSO cookie); Authentik OIDC would 302
  # every call. The allow_local_access_only IP allowlist (LAN subnets) is the
  # gate instead.
  auth                    = "none"
  allow_local_access_only = true
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,144 @@
# Configuration file for the Proxmox exporter, stored as the "pve.yml" key.
# The proxmox-exporter deployment mounts this Secret as its config volume.
resource "kubernetes_secret" "pve_exporter_config" {
  metadata {
    name      = "pve-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  data = {
    # Built with yamlencode instead of a hand-written heredoc so the password
    # is always emitted as a properly quoted YAML string. Interpolating it
    # unquoted breaks (or silently changes) the config when the password
    # contains characters such as "#", ": ", or a leading "*", "&", "!", "{",
    # or when it consists only of digits.
    "pve.yml" = yamlencode({
      default = {
        user       = "root@pam"
        password   = var.pve_password
        verify_ssl = false
        timeout    = 30
      }
    })
  }
}
# Single-replica Deployment of the Proxmox VE Prometheus exporter. It listens
# on 9221 and reads its config from the pve_exporter_config Secret, mounted
# read-only at /etc/prometheus/pve.yml.
resource "kubernetes_deployment" "pve_exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "proxmox-exporter"
    labels = {
      tier = var.tier
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "proxmox-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "proxmox-exporter"
        }
      }

      spec {
        # Secret-backed volume; only the pve.yml key is projected.
        volume {
          name = "config-volume"
          secret {
            secret_name = kubernetes_secret.pve_exporter_config.metadata[0].name
            items {
              key  = "pve.yml"
              path = "pve.yml" # This results in /etc/prometheus/pve.yml
            }
          }
        }

        container {
          name  = "proxmox-exporter"
          image = "prompve/prometheus-pve-exporter:latest"

          port {
            container_port = 9221
          }

          # Mount the config file into the container.
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/prometheus"
            read_only  = true
          }

          resources {
            limits = {
              memory = "256Mi"
            }
            requests = {
              memory = "256Mi"
              cpu    = "15m"
            }
          }
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: monitoring ns is keel-enrolled (policy=patch) — Keel owns the image
    # tag and injects keel.sh annotations. Ignore them so TF stops reverting
    # Keel on each plan (completes the cdb7d9a8 KEEL sweep that missed these
    # exporters and was tripping drift-detection exit 2 every run). 2026-05-31.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# Service in front of the Proxmox exporter pods (port 9221). Its annotations
# tell Prometheus how to scrape the /pve endpoint.
resource "kubernetes_service" "proxmox-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "proxmox-exporter"

    labels = {
      "app" = "proxmox-exporter"
    }

    annotations = {
      # scrape_slow (5m interval, 30s timeout in the prometheus values) is used
      # because the PVE API endpoint regularly takes ~11s with ~1000 k8s-csi
      # LVs on the host. That exceeds the default 10s scrape_timeout and makes
      # the ProxmoxMetricsMissing + ScrapeTargetDown alerts flap. The slow job
      # is gated by the `prometheus_io_scrape_slow=true` annotation in
      # prometheus_chart_values.tpl, which also excludes us from the fast job.
      "prometheus.io/scrape_slow"   = "true"
      "prometheus.io/path"          = "/pve"
      "prometheus.io/port"          = 9221
      "prometheus.io/param_target"  = "192.168.1.127"
      "prometheus.io/param_cluster" = "1"
      "prometheus.io/param_node"    = "1"
    }
  }

  spec {
    selector = {
      "app" = "proxmox-exporter"
    }

    port {
      name        = "http"
      target_port = 9221
      port        = 9221
    }
  }
}
# To monitor the pve node, use the node exporter and the playbook in this repo. from the root run:
# ansible-playbook -i ./playbooks/inventory.ini ./playbooks/deploy_node_exporter.yaml
# This installs the exporter binary

View file

@ -0,0 +1,51 @@
import asyncio
import logging
import os
import signal
import sys
import time
import aiohttp
# Host name of the iDRAC controller.
iDRAC_HOST = 'idrac'

# Names of the environment variables main() reads the credentials from.
iDRAC_USER_ENV_VAR, iDRAC_PASSWORD_ENV_VAR = 'idrac_user', 'idrac_password'

# Run flag: monitor() loops while it is True; signal_handler() clears it.
SHOULD_RUN = True
def signal_handler(sig, frame):
    """Request a graceful shutdown by clearing the module-level run flag.

    Args:
        sig: Number of the signal that was delivered.
        frame: Interrupted stack frame (unused).

    The handler returns immediately. It used to sleep for 60 seconds and then
    call ``sys.exit(0)``; because Python runs signal handlers on the main
    thread, that sleep froze the very loop that was supposed to observe the
    flag and wind down. Clearing the flag is enough: the loop sees it on its
    next check and the program ends normally.
    """
    global SHOULD_RUN
    logging.warning(f'signal {sig} received. shutting down gracefully...')
    SHOULD_RUN = False
async def main() -> None:
    """Install the SIGINT handler, load the credentials and start monitoring.

    The user name and password are read from the environment variables named
    by ``iDRAC_USER_ENV_VAR`` and ``iDRAC_PASSWORD_ENV_VAR``. If either one is
    unset, a critical message is logged and the coroutine returns without
    starting the monitor.
    """
    # Register the shutdown hook before doing anything else.
    signal.signal(signal.SIGINT, signal_handler)

    credentials = {}
    lookups = (
        ('user', iDRAC_USER_ENV_VAR),
        ('password', iDRAC_PASSWORD_ENV_VAR),
    )
    for label, env_name in lookups:
        value = os.environ.get(env_name)
        if value is None:
            logging.critical(f'missing environment variable for idrac {label}'
                             f' please set {env_name}')
            return
        credentials[label] = value

    logging.info('service initiated with credentials')
    return await monitor(credentials['user'], credentials['password'])
async def monitor(user: str, password: str, poll_interval: float = 1.0) -> None:
    """Idle until the module-level ``SHOULD_RUN`` flag is cleared.

    Args:
        user: iDRAC user name (currently unused; the loop body is a stub).
        password: iDRAC password (currently unused).
        poll_interval: Seconds to sleep between checks of the run flag.

    The loop awaits ``asyncio.sleep`` on every pass. The previous
    ``while SHOULD_RUN: pass`` never yielded, so it pinned a CPU core and
    starved every other task on the event loop.
    """
    while SHOULD_RUN:
        await asyncio.sleep(poll_interval)
if __name__ == '__main__':
    # Script entry point: run main() on a fresh asyncio event loop.
    # abandoned bc server cannot start itself when it's off :/
    entry_point = main()
    asyncio.run(entry_point)

View file

@ -0,0 +1,152 @@
/**
How to (re)generate the exporter configuration:
1. clone snmp exporter
2. update generator.yaml to include only interesting modules
3. make generate
4. cp snmp.yml to wherever it is used
5. scrape service with curl 'http://snmp-exporter.monitoring.svc.cluster.local:9116/snmp?auth=public_v2&module=huawei&target=192.168.1.5%3A161'
generate reference - https://github.com/prometheus/snmp_exporter/tree/main/generator
https://sbcode.net/prometheus/snmp-generate-huawei/
*/
# ConfigMap holding the SNMP exporter configuration: the contents of
# ups_snmp_values.yaml are published under the "snmp.yml" key.
resource "kubernetes_config_map" "snmp-exporter-yaml" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "snmp-exporter-yaml"

    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }

  data = {
    "snmp.yml" = file("${path.module}/ups_snmp_values.yaml")
  }
}
# Single-replica Deployment of the Prometheus SNMP exporter. It listens on
# 9116 and reads its configuration from the snmp-exporter-yaml ConfigMap,
# mounted at /etc/snmp_exporter/.
resource "kubernetes_deployment" "snmp-exporter" {
  metadata {
    name      = "snmp-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "snmp-exporter"
      tier = var.tier
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "snmp-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "snmp-exporter"
        }
      }
      spec {
        container {
          image = "prom/snmp-exporter"
          name  = "snmp-exporter"
          resources {
            requests = {
              cpu    = "10m"
              memory = "256Mi"
            }
            limits = {
              memory = "256Mi"
            }
          }
          port {
            container_port = 9116
          }
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/snmp_exporter/"
          }
        }
        volume {
          name = "config-volume"
          config_map {
            # Reference the resource instead of repeating its name as a string
            # so Terraform creates the ConfigMap before this Deployment and
            # the two cannot drift apart on a rename.
            name = kubernetes_config_map.snmp-exporter-yaml.metadata[0].name
          }
        }
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    # KEEL: monitoring ns is keel-enrolled (policy=patch) — Keel owns the image
    # tag and injects keel.sh annotations. Ignore them so TF stops reverting
    # Keel on each plan (completes the cdb7d9a8 KEEL sweep that missed these
    # exporters and was tripping drift-detection exit 2 every run). 2026-05-31.
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config,
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"],
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}
# Service in front of the SNMP exporter pods (port 9116). The Prometheus
# scrape annotations are intentionally left commented out.
resource "kubernetes_service" "snmp-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "snmp-exporter"

    labels = {
      "app" = "snmp-exporter"
    }

    # annotations = {
    #   "prometheus.io/scrape" = "true"
    #   "prometheus.io/path"   = "/snmp?auth=Public0&target=tcp%3A%2F%2F192.%3A161"
    #   "prometheus.io/port"   = "9116"
    # }
  }

  spec {
    selector = {
      "app" = "snmp-exporter"
    }

    port {
      name        = "http"
      target_port = "9116"
      port        = "9116"
    }
  }
}
# LAN-only ingress for the SNMP exporter on the viktorbarzin.lan domain.
module "snmp-exporter-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  namespace = kubernetes_namespace.monitoring.metadata[0].name
  name      = "snmp-exporter"
  port      = 9116

  root_domain     = "viktorbarzin.lan"
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false

  # auth = "none" — same rationale as idrac-redfish-exporter-ingress: HA Sofia
  # REST sensors scrape the /snmp endpoint programmatically and can't follow
  # the Authentik OIDC flow (it would 302 every request). The local-only IP
  # allowlist already gates external access.
  auth                    = "none"
  allow_local_access_only = true
}

File diff suppressed because it is too large Load diff

1
stacks/monitoring/secrets Symbolic link
View file

@ -0,0 +1 @@
../../secrets

View file

@ -0,0 +1,8 @@
# Pull in the root Terragrunt configuration located via find_in_parent_folders().
include "root" {
  path = find_in_parent_folders()
}

# Dependency on the sibling infra stack; its outputs are not read
# (skip_outputs = true).
dependency "infra" {
  skip_outputs = true
  config_path  = "../infra"
}