extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip]

Phase 2 of platform stack split. 5 more modules extracted into
independent stacks. All applied successfully with zero destroys.
Cloudflared now reads k8s_users from Vault directly to compute
user_domains. Woodpecker pipeline runs all 8 extracted stacks
in parallel. Memory bumped to 6Gi for 9 concurrent TF processes.
Platform reduced from 27 to 19 modules.
This commit is contained in:
Viktor Barzin 2026-03-17 21:34:11 +00:00
parent 3c804aedf8
commit ae36dc253b
73 changed files with 166093 additions and 96 deletions

View file

@ -0,0 +1,27 @@
# dockerhub: viktorbarzin/redfish-exporter
# repo: https://pkg.go.dev/github.com/jenningsloy318/redfish_exporter#section-readme
#
# Two-stage build: the first stage clones and compiles redfish_exporter with
# `make build`; the second stage copies only the resulting binary.
#
# NOTE(review): `golang:rc-bullseye` is a floating release-candidate tag, so
# rebuilds are not reproducible — consider pinning a concrete Go version.
FROM golang:rc-bullseye AS builder
# NOTE(review): ARCH is declared but not referenced by any instruction in this
# file — confirm whether the upstream Makefile reads it before removing it.
ARG ARCH=amd64
# `ENV key=value` is the supported form; the space-separated legacy form is
# deprecated and flagged by Docker build checks.
ENV GOROOT=/usr/local/go
ENV GOPATH=/go
ENV PATH="$GOROOT/bin:$GOPATH/bin:$PATH"
# NOTE(review): GO_VERSION does not select the toolchain — that comes from the
# base image tag above. Kept only in case the upstream build reads it.
ENV GO_VERSION=1.15.2
ENV GO111MODULE=on
# Build dependencies
RUN mkdir -p /go/src/github.com/ && \
    git clone https://github.com/jenningsloy318/redfish_exporter /go/src/github.com/jenningsloy318/redfish_exporter && \
    cd /go/src/github.com/jenningsloy318/redfish_exporter && \
    make build

FROM golang:rc-bullseye
# Labels are per-stage: it has to live here to end up on the shipped image.
LABEL maintainer="Viktor Barzin <me@viktorbarzin.me>"
COPY --from=builder /go/src/github.com/jenningsloy318/redfish_exporter/build/redfish_exporter /usr/local/bin/redfish_exporter
# -p keeps the step idempotent if a future base image already ships the dir.
RUN mkdir -p /etc/prometheus
# config file is mounted at runtime
CMD ["/usr/local/bin/redfish_exporter", "--config.file", "/etc/prometheus/redfish_exporter.yml"]

View file

@ -0,0 +1,207 @@
alloy:
configMap:
content: |-
// Alloy's own log output: "info" level, logfmt-formatted.
logging {
level = "info"
format = "logfmt"
}
// Shared sink: every log pipeline in this config forwards to this component's
// receiver, which pushes entries to the Loki HTTP push endpoint below.
loki.write "default" {
endpoint {
url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
}
}
// discovery.kubernetes finds scrape targets from Kubernetes resources.
// It watches cluster state and keeps the target list in sync with what is currently running.
// role = "pod" discovers pods; the exported targets feed discovery.relabel "pod_logs".
discovery.kubernetes "pod" {
role = "pod"
}
// discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
// If no rules are defined, then the input targets are exported as-is.
// The exported `output` is consumed by loki.source.kubernetes "pod_logs".
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pod.targets
// "namespace" label <- __meta_kubernetes_namespace
rule {
source_labels = ["__meta_kubernetes_namespace"]
action = "replace"
target_label = "namespace"
}
// "pod" label <- __meta_kubernetes_pod_name
rule {
source_labels = ["__meta_kubernetes_pod_name"]
action = "replace"
target_label = "pod"
}
// "container" label <- __meta_kubernetes_pod_container_name
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "container"
}
// "app" label <- __meta_kubernetes_pod_label_app_kubernetes_io_name
// (i.e. the pod's app.kubernetes.io/name label)
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
action = "replace"
target_label = "app"
}
// "job" label <- "<namespace>/<container>": the two source labels are joined
// with the "/" separator and the joined value ($1) becomes the label value.
rule {
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "job"
separator = "/"
replacement = "$1"
}
// "__path__" label <- "/var/log/pods/*<pod_uid>/<container>/*.log": the pod UID
// and container name are joined with "/" and substituted for $1.
// NOTE(review): __path__ is a file-glob style label, while the consumer of these
// targets reads logs through the Kubernetes API — presumably this label is unused
// by this pipeline; confirm before relying on it.
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
action = "replace"
target_label = "__path__"
separator = "/"
replacement = "/var/log/pods/*$1/*.log"
}
// "container_runtime" label <- the scheme part of __meta_kubernetes_pod_container_id,
// i.e. the non-whitespace text before "://" captured by the regex.
rule {
source_labels = ["__meta_kubernetes_pod_container_id"]
action = "replace"
target_label = "container_runtime"
regex = "^(\\S+):\\/\\/.+$"
replacement = "$1"
}
}
// loki.source.kubernetes tails logs from Kubernetes containers using the Kubernetes API.
// It reads the relabelled pod targets and hands every entry to loki.process "pod_logs".
loki.source.kubernetes "pod_logs" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.process.pod_logs.receiver]
}
// loki.process receives log entries from other Loki components, applies one or more processing stages,
// and forwards the results to the list of receivers in the component's arguments.
// Here the only stage stamps every pod log entry with the static label cluster="default"
// before forwarding to loki.write "default".
loki.process "pod_logs" {
stage.static_labels {
values = {
cluster = "default",
}
}
forward_to = [loki.write.default.receiver]
}
// Node-level journal log collection for kernel panics, OOMs, hung tasks, etc.
// Ships system logs off-node so they survive hard resets.
// - relabel_rules: borrows the rule set exported by loki.relabel "journal" to turn
//   __journal_* fields into labels.
// - labels: every entry gets job="node-journal" (the selector used by loki.process "journal").
// - max_age: per the Alloy docs, the oldest entries (relative to process start) that are read.
loki.source.journal "node_journal" {
forward_to = [loki.process.journal.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {
job = "node-journal",
}
max_age = "12h"
}
// Rule container for the journal source. forward_to is intentionally empty: no
// entries flow through this component; only its exported `rules` are used, via
// relabel_rules in loki.source.journal "node_journal".
loki.relabel "journal" {
forward_to = []
// node <- journal _HOSTNAME field
rule {
source_labels = ["__journal__hostname"]
target_label = "node"
}
// unit <- journal _SYSTEMD_UNIT field
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
// level <- priority keyword; loki.process "journal" filters on this label
rule {
source_labels = ["__journal_priority_keyword"]
target_label = "level"
}
// transport <- journal _TRANSPORT field; loki.process "journal" exempts "kernel" from dropping
rule {
source_labels = ["__journal__transport"]
target_label = "transport"
}
}
// Forward warning+ journal entries (priority 0-4: emerg, alert, crit, err, warning)
// Also forwards kernel transport entries regardless of priority for OOM/panic detection.
// Implemented as a drop rule rather than an allow-list: anything the selector below
// does not match is forwarded to loki.write "default".
loki.process "journal" {
// Stamp every journal entry with cluster="default", matching the pod log pipeline.
stage.static_labels {
values = {
cluster = "default",
}
}
// Drop info/debug/notice entries that aren't from the kernel transport
stage.match {
selector = "{job=\"node-journal\", level=~\"info|notice|debug\", transport!=\"kernel\"}"
action = "drop"
}
forward_to = [loki.write.default.receiver]
}
// Kubernetes audit log collection from /var/log/kubernetes/audit.log
// Requires alloy.mounts.varlog=true to mount /var/log from the host
// Produces a single file target labelled job="kubernetes-audit" (the selector the
// audit dashboard queries) plus a node label taken from the HOSTNAME env var.
// NOTE(review): inside a pod HOSTNAME is normally the pod name, not the node name,
// unless host networking is used — confirm `node` carries the intended value.
local.file_match "audit_logs" {
path_targets = [{
__path__ = "/var/log/kubernetes/audit.log",
job = "kubernetes-audit",
node = env("HOSTNAME"),
}]
}
// Tails the audit log file target and sends entries straight to loki.write "default".
// NOTE(review): unlike the pod and journal pipelines, this path has no loki.process
// stage, so audit entries do not get the cluster="default" label — confirm that is intended.
loki.source.file "audit_logs" {
targets = local.file_match.audit_logs.targets
forward_to = [loki.write.default.receiver]
}
# Mount /var/log from the host for file-based log collection (audit logs)
mounts:
varlog: true
# Mount journal directories for loki.source.journal
# Each name below must match a volume declared under controller.volumes.extra.
# All three are mounted read-only at the same path they have on the host.
extra:
- name: journal-run
mountPath: /run/log/journal
readOnly: true
- name: journal-var
mountPath: /var/log/journal
readOnly: true
- name: machine-id
mountPath: /etc/machine-id
readOnly: true
# hostPath volumes backing the extra mounts above (journal-run, journal-var, machine-id).
controller:
volumes:
extra:
# DirectoryOrCreate: the directory is created on the host if it does not exist.
- name: journal-run
hostPath:
path: /run/log/journal
type: DirectoryOrCreate
- name: journal-var
hostPath:
path: /var/log/journal
type: DirectoryOrCreate
# File: the path must already exist on the host as a regular file.
- name: machine-id
hostPath:
path: /etc/machine-id
type: File
# Resource limits for DaemonSet pods
# Alloy tails logs from all containers on the node via K8s API and batches
# them to Loki. Memory scales with number of active log streams (~30-50 per node).
# 128Mi was OOMKilled; steady-state usage is ~400-450Mi per pod.
# The request sits just above the quoted steady state; the limit leaves headroom.
# No CPU limit is set, only a CPU request.
# NOTE(review): confirm this block is nested under the key the chart actually reads
# for container resources (presumably alloy.resources) — TODO verify.
resources:
requests:
cpu: 50m
memory: 512Mi
limits:
memory: 1Gi

View file

@ -0,0 +1,62 @@
# Caretta Helm release (chart "caretta" 0.0.16 from the groundcover repository),
# installed into the namespace managed by kubernetes_namespace.monitoring.
resource "helm_release" "caretta" {
  name       = "caretta"
  chart      = "caretta"
  repository = "https://helm.groundcover.com/"
  version    = "0.0.16"

  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  # Chart values. Object keys may be listed in any order: yamlencode emits
  # mapping keys sorted, so only the order of list elements is significant.
  values = [yamlencode({
    # Turn off the chart's grafana and victoria-metrics-single components.
    grafana                   = { enabled = false }
    "victoria-metrics-single" = { enabled = false }

    # Memory request equals the memory limit; CPU has a request only.
    resources = {
      limits   = { memory = "600Mi" }
      requests = { cpu = "10m", memory = "600Mi" }
    }

    # Tolerate the control-plane and GPU NoSchedule taints.
    tolerations = [
      { key = "node-role.kubernetes.io/control-plane", operator = "Exists", effect = "NoSchedule" },
      { key = "nvidia.com/gpu", operator = "Exists", effect = "NoSchedule" },
    ]
  })]
}
# Service "caretta-metrics" in the monitoring namespace: selects pods labelled
# app=caretta and exposes TCP port 7117 under the port name "metrics".
resource "kubernetes_service" "caretta_metrics" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "caretta-metrics"
    labels    = { app = "caretta" }
  }

  spec {
    selector = { app = "caretta" }

    port {
      name        = "metrics"
      protocol    = "TCP"
      port        = 7117
      target_port = 7117
    }
  }
}
# Caretta dashboard is now loaded via the grafana_dashboards for_each in grafana.tf

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,861 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"gridPos": {
"h": 28,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"interval": "15s",
"options": {
"nodes": {
"mainStatUnit": ""
},
"edges": {
"mainStatUnit": ""
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "increase((sum by (id, title, subTitle, detail__kind, color) (label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace(label_replace((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"server_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"server_namespace\", \"(.*)\"), \"title\", \"$1\", \"server_name\", \"(.*)\"), \"id\", \"$1\", \"server_id\", \"(.*)\") or label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"client_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"client_namespace\", \"(.*)\"), \"title\", \"$1\", \"client_name\", \"(.*)\"), \"id\", \"$1\", \"client_id\", \"(.*)\")), \"color\", \"#8F8F8F\", \"subTitle\", \"(.*)\"), \"color\", \"#F2495C\", \"subTitle\", \"^external$\"), \"color\", \"#8AB8FF\", \"title\", \"^10\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^192\\\\.168\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^172\\\\.(1[6-9]|2[0-9]|3[01])\\\\..*\"), \"color\", \"#8AB8FF\", \"title\", \"^(0\\\\.0\\\\.0\\\\.0|localhost)$\"), \"color\", \"#8AB8FF\", \"subTitle\", \"^node$\"), \"color\", \"#FF9830\", \"subTitle\", \"^traefik$\"), \"color\", \"#5794F2\", \"subTitle\", \"^monitoring$\"), \"color\", \"#73BF69\", \"subTitle\", \"^dbaas$\"), \"color\", \"#B877D9\", \"subTitle\", 
\"^authentik$\"), \"color\", \"#FF7383\", \"subTitle\", \"^crowdsec$\"), \"color\", \"#FADE2A\", \"subTitle\", \"^uptime-kuma$\"), \"color\", \"#56A64B\", \"subTitle\", \"^immich$\"), \"color\", \"#C0D8FF\", \"subTitle\", \"^technitium$\"), \"color\", \"#FF6600\", \"subTitle\", \"^kyverno$\"), \"color\", \"#76B900\", \"subTitle\", \"^nvidia$\")))[$__range:$__interval]) > 0",
"format": "table",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "nodes"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "increase((sum by (id, source, target, mainStat) ((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"id\", \"$1\", \"link_id\", \"(.*)\"), \"source\", \"$1\", \"client_id\", \"(.*)\"), \"target\", \"$1\", \"server_id\", \"(.*)\"), \"mainStat\", \"$1\", \"server_port\", \"(.*)\"))))[$__range:$__interval]) > 0",
"format": "table",
"hide": false,
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "edges"
}
],
"title": "Service Map",
"type": "nodeGraph",
"fieldConfig": {
"defaults": {},
"overrides": []
}
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "blue",
"mode": "fixed"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
}
},
"links": [],
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 0,
"y": 28
},
"id": 4,
"options": {
"displayLabels": [
"name"
],
"legend": {
"displayMode": "list",
"placement": "right",
"showLegend": false
},
"pieType": "donut",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (server_port) (increase((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"})[$__range:$__interval])) > 0",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Active Ports",
"type": "piechart"
},
{
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"gridPos": {
"h": 4,
"w": 3,
"x": 21,
"y": 36
},
"id": 10,
"options": {
"code": {
"language": "plaintext",
"showLineNumbers": false,
"showMiniMap": false
},
"content": "<table style=\"width:100%; height:100%;border:0px solid black;\">\n <td style=\"text-align: center;vertical-align: middle;border:0px solid black; \">\n<div style=\"text-align: center\">\n<p align=\"center\">\n <img src=\"https://raw.githubusercontent.com/groundcover-com/caretta/main/images/logo.svg\" width=\"75%\" alt=\"caretta\" title=\"caretta\" />\n <h4>by <a href=\"https://www.groundcover.com\">groundcover</h4>\n\n \n [![slack](https://img.shields.io/badge/slack-groundcover-yellowgreen.svg?logo=slack)](http://www.groundcover.com/join-slack)\n \n</div>\n</p>\n</div>\n</td>\n</table>\n",
"mode": "markdown"
},
"pluginVersion": "10.1.2",
"type": "text"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "purple",
"mode": "continuous-blues"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 15,
"y": 28
},
"id": 8,
"options": {
"displayMode": "gradient",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"valueMode": "color"
},
"pluginVersion": "10.1.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "topk(8, sum by (client_name) ((rate(caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"}[$__range:$__interval]))))",
"format": "time_series",
"instant": true,
"legendFormat": "__auto",
"range": false,
"refId": "A"
}
],
"title": "Top Throughput Workloads",
"type": "bargauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "continuous-blues"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 9,
"x": 6,
"y": 28
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "10.1.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "topk(7, sum by (client_name, server_name) ( rate( (caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\", client_kind!~\"(node|external)\",} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\", server_kind!~\"(node|external)\"})[$__range:$__interval]) ) )",
"format": "time_series",
"instant": true,
"legendFormat": "{{client_name}} \u2b82 {{server_name}}",
"range": false,
"refId": "A"
}
],
"title": "Top Throughput Connections",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 40
},
"id": 11,
"title": "Network Flows (GoFlow2 / pfSense NetFlow)",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "flows/s",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 41
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_process_nf_flowset_records_total{type=\"DataFlowSet\"}[5m])",
"legendFormat": "Flows/s ({{router}})",
"range": true,
"refId": "A"
}
],
"title": "NetFlow Ingestion Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 41
},
"id": 13,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_traffic_bytes_total[5m])",
"legendFormat": "Bytes/s from {{remote_ip}}",
"range": true,
"refId": "A"
}
],
"title": "NetFlow Traffic Volume",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10000
},
{
"color": "red",
"value": 100000
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 49
},
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_process_nf_flowset_records_total{type=\"DataFlowSet\"}",
"legendFormat": "Total Flows",
"instant": true,
"refId": "A"
}
],
"title": "Total Flows Processed",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 4,
"y": 49
},
"id": 15,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_process_nf_total",
"legendFormat": "Messages",
"instant": true,
"refId": "A"
}
],
"title": "NetFlow Messages",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": null
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 8,
"y": 49
},
"id": 16,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "goflow2_flow_traffic_bytes_total",
"legendFormat": "Bytes",
"instant": true,
"refId": "A"
}
],
"title": "Total NetFlow Bytes",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 12,
"y": 49
},
"id": 17,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(goflow2_flow_process_nf_delay_seconds_sum[5m]) / rate(goflow2_flow_process_nf_delay_seconds_count[5m])",
"legendFormat": "Avg Delay",
"range": true,
"refId": "A"
}
],
"title": "Flow Processing Delay",
"type": "timeseries"
}
],
"refresh": "1h",
"schemaVersion": 38,
"style": "dark",
"tags": [
"network",
"caretta",
"goflow2"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "default",
"value": "default"
},
"hide": 0,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "/.*_namespace=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "kind",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "/.*_kind=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "query_result(caretta_links_observed)",
"hide": 0,
"includeAll": true,
"label": "workload",
"multi": true,
"name": "workload",
"options": [],
"query": {
"query": "query_result(caretta_links_observed)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "/.*_name=\"([^\"]*).*/",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": "(.*)",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(server_port)",
"hide": 0,
"includeAll": true,
"label": "server port",
"multi": true,
"name": "port",
"options": [],
"query": {
"query": "label_values(server_port)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Network Observability",
"uid": "network-observability",
"version": 2,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,204 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Kubernetes API server audit logs from Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": 0,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Recent Activity",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "Recent Kubernetes API actions from audit logs",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 1 },
"id": 1,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
  "datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
  "editorMode": "code",
  "expr": "{job=\"kubernetes-audit\"} | json | line_format \"{{.user_username}} {{.verb}} {{.objectRef_resource}} {{.objectRef_namespace}}\"",
  "legendFormat": "",
  "queryType": "range",
  "refId": "A"
}
],
"title": "Recent Actions",
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
"id": 101,
"panels": [],
"title": "Request Rates",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API request count by user over time",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
"id": 2,
"options": {
"legend": { "calcs": ["sum", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "sum by (user_username) (count_over_time({job=\"kubernetes-audit\"} | json [5m]))",
"legendFormat": "{{user_username}}",
"queryType": "range",
"refId": "A"
}
],
"title": "Request Count by User",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
"id": 102,
"panels": [],
"title": "Denied Requests",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"description": "API requests whose response status code is 403 or higher (the filter is >= 403, so 404, 409 and 5xx responses appear alongside 403 denials)",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 403 }
]
}
},
"overrides": []
},
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 },
"id": 3,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Time" }]
},
"pluginVersion": "12.3.0",
"targets": [
{
"datasource": { "type": "loki", "uid": "P8E80F9AEF21F6940" },
"editorMode": "code",
"expr": "{job=\"kubernetes-audit\"} | json | responseStatus_code >= 403",
"legendFormat": "",
"queryType": "range",
"refId": "A"
}
],
"title": "Denied Requests (403+)",
"type": "table"
}
],
"preload": false,
"refresh": "30s",
"schemaVersion": 42,
"tags": ["kubernetes", "audit", "security"],
"templating": {
"list": []
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Kubernetes Audit Logs",
"uid": "k8s-audit",
"version": 1
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,288 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Logs collected from Kubernetes, stored in Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 15141,
"graphTooltip": 0,
"id": 25,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 24,
"x": 0,
"y": 0
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "sum(count_over_time({namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\" [$__interval]))",
"instant": false,
"legendFormat": "Log count",
"queryType": "range",
"range": true,
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"description": "Logs from services running in Kubernetes",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 4
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "code",
"expr": "{namespace=~\"$namespace\", container =~\"$container\"} |= \"$query\"",
"queryType": "range",
"refId": "A"
}
],
"type": "logs"
}
],
"refresh": "5s",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "",
"value": ""
},
"description": "String to search for",
"hide": 0,
"label": "Search Query",
"name": "query",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"skipUrlSync": false,
"type": "textbox"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"dbaas"
],
"value": [
"dbaas"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(stream)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "stream",
"options": [],
"query": "label_values(stream)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "label_values(container)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "container",
"options": [],
"query": "label_values(container)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Loki Kubernetes Logs",
"uid": "o6-BGgnnk",
"version": 2,
"weekStart": ""
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,816 @@
{
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 0
},
"id": 12,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"instant": false,
"interval": "",
"legendFormat": "GPU 0",
"refId": "A"
}
],
"title": "GPU Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 14,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Current Temp",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Power Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 16,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE)",
"instant": true,
"interval": "",
"legendFormat": "",
"range": false,
"refId": "A"
}
],
"title": "GPU Power Total",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decmbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 18,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Framebuffer Mem Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 2,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_SM_CLOCK* 1000000",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU SM Clocks",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 19,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum by (namespace) (gpu_pod_memory_used_bytes)",
"instant": false,
"legendFormat": "{{namespace}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Memory per Application",
"type": "timeseries"
}
],
"preload": false,
"refresh": "auto",
"schemaVersion": 42,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-12h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "Oxed_c6Wz",
"version": 9
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,488 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "datasource", "uid": "grafana" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Technitium DNS query logs from MySQL",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"title": "Total Queries",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT COUNT(*) as total_queries FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Cached %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.3 },
{ "color": "green", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) / COUNT(*) as cached_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Blocked %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.1 },
{ "color": "red", "value": 0.3 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) / COUNT(*) as blocked_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "NxDomain %",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "percentunit",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.2 },
{ "color": "red", "value": 0.5 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END) / COUNT(*) as nxdomain_pct FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Avg Response Time",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"unit": "ms",
"thresholds": {
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "red", "value": 200 }
]
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"rawSql": "SELECT AVG(response_rtt) as avg_rtt_ms FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries by Protocol",
"type": "stat",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN protocol = 0 THEN 1 ELSE 0 END) as UDP, SUM(CASE WHEN protocol = 1 THEN 1 ELSE 0 END) as TCP, SUM(CASE WHEN protocol = 3 THEN 1 ELSE 0 END) as DoH, SUM(CASE WHEN protocol = 4 THEN 1 ELSE 0 END) as DoT FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Queries Over Time",
"type": "timeseries",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 4 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "bars",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["sum"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as Authoritative, SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as Recursive, SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as Cached, SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as Blocked, SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as Dropped FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Response Codes",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "NOERROR" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "NXDOMAIN" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "SERVFAIL" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "REFUSED" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN rcode = 0 THEN 1 ELSE 0 END) as NOERROR, SUM(CASE WHEN rcode = 2 THEN 1 ELSE 0 END) as SERVFAIL, SUM(CASE WHEN rcode = 3 THEN 1 ELSE 0 END) as NXDOMAIN, SUM(CASE WHEN rcode = 5 THEN 1 ELSE 0 END) as REFUSED, SUM(CASE WHEN rcode NOT IN (0,2,3,5) THEN 1 ELSE 0 END) as Other FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Response Types",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "Cached" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Blocked" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Recursive" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] },
{ "matcher": { "id": "byName", "options": "Authoritative" }, "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] }
]
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN response_type = 1 THEN 1 ELSE 0 END) as Authoritative, SUM(CASE WHEN response_type = 2 THEN 1 ELSE 0 END) as Recursive, SUM(CASE WHEN response_type = 3 THEN 1 ELSE 0 END) as Cached, SUM(CASE WHEN response_type = 4 THEN 1 ELSE 0 END) as Blocked, SUM(CASE WHEN response_type = 5 THEN 1 ELSE 0 END) as Dropped FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Query Types",
"type": "piechart",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" }
},
"overrides": []
},
"options": {
"legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] },
"pieType": "donut",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": true },
"tooltip": { "mode": "single" }
},
"targets": [
{
"rawSql": "SELECT SUM(CASE WHEN qtype = 1 THEN 1 ELSE 0 END) as A, SUM(CASE WHEN qtype = 28 THEN 1 ELSE 0 END) as AAAA, SUM(CASE WHEN qtype = 5 THEN 1 ELSE 0 END) as CNAME, SUM(CASE WHEN qtype = 15 THEN 1 ELSE 0 END) as MX, SUM(CASE WHEN qtype = 16 THEN 1 ELSE 0 END) as TXT, SUM(CASE WHEN qtype = 33 THEN 1 ELSE 0 END) as SRV, SUM(CASE WHEN qtype = 12 THEN 1 ELSE 0 END) as PTR, SUM(CASE WHEN qtype = 6 THEN 1 ELSE 0 END) as SOA, SUM(CASE WHEN qtype = 2 THEN 1 ELSE 0 END) as NS, SUM(CASE WHEN qtype = 65 THEN 1 ELSE 0 END) as HTTPS, SUM(CASE WHEN qtype NOT IN (1,2,5,6,12,15,16,28,33,65) THEN 1 ELSE 0 END) as Other FROM dns_logs WHERE $__timeFilter(timestamp)",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Queried Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Clients",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 20 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT client_ip, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) GROUP BY client_ip ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Average Response Time Over Time",
"type": "timeseries",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 30 },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "ms",
"custom": {
"axisBorderShow": false,
"axisLabel": "Response Time (ms)",
"axisPlacement": "auto",
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"lineWidth": 2,
"pointSize": 5,
"showPoints": "never",
"spanNulls": true
}
},
"overrides": []
},
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "list", "placement": "bottom" },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"rawSql": "SELECT $__timeGroup(timestamp, $__interval) as time, AVG(response_rtt) as avg_rtt, MAX(response_rtt) as max_rtt FROM dns_logs WHERE $__timeFilter(timestamp) AND response_rtt IS NOT NULL GROUP BY time ORDER BY time",
"format": "time_series",
"refId": "A"
}
]
},
{
"title": "Top 20 NxDomain Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 0, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND rcode = 3 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
},
{
"title": "Top 20 Blocked Domains",
"type": "table",
"datasource": { "type": "mysql", "uid": "technitium-mysql" },
"gridPos": { "h": 10, "w": 12, "x": 12, "y": 38 },
"fieldConfig": {
"defaults": {
"custom": { "filterable": true }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "count" }, "properties": [{ "id": "custom.width", "value": 100 }] }
]
},
"options": {
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "count" }]
},
"targets": [
{
"rawSql": "SELECT qname as domain, COUNT(*) as count FROM dns_logs WHERE $__timeFilter(timestamp) AND response_type = 4 GROUP BY qname ORDER BY count DESC LIMIT 20",
"format": "table",
"refId": "A"
}
]
}
],
"refresh": "5m",
"schemaVersion": 39,
"tags": ["dns", "technitium", "mysql"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "Technitium DNS",
"uid": "technitium-dns",
"version": 1
}

View file

@ -0,0 +1,303 @@
# HELP snmpEnableAuthenTraps Indicates whether the SNMP entity is permitted to generate authenticationFailure traps - 1.3.6.1.2.1.11.30
# TYPE snmpEnableAuthenTraps gauge
snmpEnableAuthenTraps 2
# HELP snmpInASNParseErrs The total number of ASN.1 or BER errors encountered by the SNMP entity when decoding received SNMP messages. - 1.3.6.1.2.1.11.6
# TYPE snmpInASNParseErrs counter
snmpInASNParseErrs 0
# HELP snmpInBadCommunityNames The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which used an SNMP community name not known to said entity - 1.3.6.1.2.1.11.4
# TYPE snmpInBadCommunityNames counter
snmpInBadCommunityNames 184
# HELP snmpInBadCommunityUses The total number of community-based SNMP messages (for example, SNMPv1) delivered to the SNMP entity which represented an SNMP operation that was not allowed for the SNMP community named in the message - 1.3.6.1.2.1.11.5
# TYPE snmpInBadCommunityUses counter
snmpInBadCommunityUses 0
# HELP snmpInBadValues The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.10
# TYPE snmpInBadValues counter
snmpInBadValues 0
# HELP snmpInBadVersions The total number of SNMP messages which were delivered to the SNMP entity and were for an unsupported SNMP version. - 1.3.6.1.2.1.11.3
# TYPE snmpInBadVersions counter
snmpInBadVersions 0
# HELP snmpInGenErrs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.12
# TYPE snmpInGenErrs counter
snmpInGenErrs 0
# HELP snmpInGetNexts The total number of SNMP Get-Next PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.16
# TYPE snmpInGetNexts counter
snmpInGetNexts 2940
# HELP snmpInGetRequests The total number of SNMP Get-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.15
# TYPE snmpInGetRequests counter
snmpInGetRequests 9
# HELP snmpInGetResponses The total number of SNMP Get-Response PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.18
# TYPE snmpInGetResponses counter
snmpInGetResponses 0
# HELP snmpInNoSuchNames The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `noSuchName'. - 1.3.6.1.2.1.11.9
# TYPE snmpInNoSuchNames counter
snmpInNoSuchNames 0
# HELP snmpInPkts The total number of messages delivered to the SNMP entity from the transport service. - 1.3.6.1.2.1.11.1
# TYPE snmpInPkts counter
snmpInPkts 5928
# HELP snmpInReadOnlys The total number valid SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `readOnly' - 1.3.6.1.2.1.11.11
# TYPE snmpInReadOnlys counter
snmpInReadOnlys 0
# HELP snmpInSetRequests The total number of SNMP Set-Request PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.17
# TYPE snmpInSetRequests counter
snmpInSetRequests 0
# HELP snmpInTooBigs The total number of SNMP PDUs which were delivered to the SNMP protocol entity and for which the value of the error-status field was `tooBig'. - 1.3.6.1.2.1.11.8
# TYPE snmpInTooBigs counter
snmpInTooBigs 0
# HELP snmpInTotalReqVars The total number of MIB objects which have been retrieved successfully by the SNMP protocol entity as the result of receiving valid SNMP Get-Request and Get-Next PDUs. - 1.3.6.1.2.1.11.13
# TYPE snmpInTotalReqVars counter
snmpInTotalReqVars 72699
# HELP snmpInTotalSetVars The total number of MIB objects which have been altered successfully by the SNMP protocol entity as the result of receiving valid SNMP Set-Request PDUs. - 1.3.6.1.2.1.11.14
# TYPE snmpInTotalSetVars counter
snmpInTotalSetVars 0
# HELP snmpInTraps The total number of SNMP Trap PDUs which have been accepted and processed by the SNMP protocol entity. - 1.3.6.1.2.1.11.19
# TYPE snmpInTraps counter
snmpInTraps 0
# HELP snmpOutBadValues The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `badValue'. - 1.3.6.1.2.1.11.22
# TYPE snmpOutBadValues counter
snmpOutBadValues 0
# HELP snmpOutGenErrs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `genErr'. - 1.3.6.1.2.1.11.24
# TYPE snmpOutGenErrs counter
snmpOutGenErrs 0
# HELP snmpOutGetNexts The total number of SNMP Get-Next PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.26
# TYPE snmpOutGetNexts counter
snmpOutGetNexts 0
# HELP snmpOutGetRequests The total number of SNMP Get-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.25
# TYPE snmpOutGetRequests counter
snmpOutGetRequests 0
# HELP snmpOutGetResponses The total number of SNMP Get-Response PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.28
# TYPE snmpOutGetResponses counter
snmpOutGetResponses 5740
# HELP snmpOutNoSuchNames The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status was `noSuchName'. - 1.3.6.1.2.1.11.21
# TYPE snmpOutNoSuchNames counter
snmpOutNoSuchNames 0
# HELP snmpOutPkts The total number of SNMP Messages which were passed from the SNMP protocol entity to the transport service. - 1.3.6.1.2.1.11.2
# TYPE snmpOutPkts counter
snmpOutPkts 5739
# HELP snmpOutSetRequests The total number of SNMP Set-Request PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.27
# TYPE snmpOutSetRequests counter
snmpOutSetRequests 0
# HELP snmpOutTooBigs The total number of SNMP PDUs which were generated by the SNMP protocol entity and for which the value of the error-status field was `tooBig.' - 1.3.6.1.2.1.11.20
# TYPE snmpOutTooBigs counter
snmpOutTooBigs 0
# HELP snmpOutTraps The total number of SNMP Trap PDUs which have been generated by the SNMP protocol entity. - 1.3.6.1.2.1.11.29
# TYPE snmpOutTraps counter
snmpOutTraps 0
# HELP snmpProxyDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the transmission of the (possibly translated) message to a proxy target failed in a manner (other than a time-out) such that no Response Class PDU (such as a Response-PDU) could be returned. - 1.3.6.1.2.1.11.32
# TYPE snmpProxyDrops counter
snmpProxyDrops 0
# HELP snmpSilentDrops The total number of Confirmed Class PDUs (such as GetRequest-PDUs, GetNextRequest-PDUs, GetBulkRequest-PDUs, SetRequest-PDUs, and InformRequest-PDUs) delivered to the SNMP entity which were silently dropped because the size of a reply containing an alternate Response Class PDU (such as a Response-PDU) with an empty variable-bindings field was greater than either a local constraint or the maximum message size associated with the originator of the request. - 1.3.6.1.2.1.11.31
# TYPE snmpSilentDrops counter
snmpSilentDrops 0
# HELP snmp_scrape_duration_seconds Total SNMP time scrape took (walk and processing).
# TYPE snmp_scrape_duration_seconds gauge
snmp_scrape_duration_seconds{module="huawei"} 0.39253882
# HELP snmp_scrape_packets_retried Packets retried for get, bulkget, and walk.
# TYPE snmp_scrape_packets_retried gauge
snmp_scrape_packets_retried{module="huawei"} 0
# HELP snmp_scrape_packets_sent Packets sent for get, bulkget, and walk; including retries.
# TYPE snmp_scrape_packets_sent gauge
snmp_scrape_packets_sent{module="huawei"} 6
# HELP snmp_scrape_pdus_returned PDUs returned from get, bulkget, and walk.
# TYPE snmp_scrape_pdus_returned gauge
snmp_scrape_pdus_returned{module="huawei"} 104
# HELP snmp_scrape_walk_duration_seconds Time SNMP walk/bulkwalk took.
# TYPE snmp_scrape_walk_duration_seconds gauge
snmp_scrape_walk_duration_seconds{module="huawei"} 0.391760524
# HELP sysContact The textual identification of the contact person for this managed node, together with information on how to contact this person - 1.3.6.1.2.1.1.4
# TYPE sysContact gauge
sysContact{sysContact="Not Configure System Contact"} 1
# HELP sysDescr A textual description of the entity - 1.3.6.1.2.1.1.1
# TYPE sysDescr gauge
sysDescr{sysDescr="Linux GSE200M 2.6.27-SPEAr310 #80 Fri Jan 13 11:22:09 CST 2017 armv5tejl"} 1
# HELP sysLocation The physical location of this node (e.g., 'telephone closet, 3rd floor') - 1.3.6.1.2.1.1.6
# TYPE sysLocation gauge
sysLocation{sysLocation="Garage G03"} 1
# HELP sysName An administratively-assigned name for this managed node - 1.3.6.1.2.1.1.5
# TYPE sysName gauge
sysName{sysName="ups2000"} 1
# HELP sysORDescr A textual description of the capabilities identified by the corresponding instance of sysORID. - 1.3.6.1.2.1.1.9.1.3
# TYPE sysORDescr gauge
sysORDescr{sysORDescr="The MIB for Message Processing and Dispatching.",sysORIndex="3"} 1
sysORDescr{sysORDescr="The MIB module for SNMPv2 entities",sysORIndex="1"} 1
sysORDescr{sysORDescr="The SNMP Management Architecture MIB.",sysORIndex="5"} 1
sysORDescr{sysORDescr="The management information definitions for the SNMP User-based Security Model.",sysORIndex="4"} 1
sysORDescr{sysORDescr="View-based Access Control Model for SNMP.",sysORIndex="2"} 1
# HELP sysORID An authoritative identification of a capabilities statement with respect to various MIB modules supported by the local SNMP application acting as a command responder. - 1.3.6.1.2.1.1.9.1.2
# TYPE sysORID gauge
sysORID{sysORID="1.3.6.1.6.3.1",sysORIndex="1"} 1
sysORID{sysORID="1.3.6.1.6.3.10.3.1.1",sysORIndex="5"} 1
sysORID{sysORID="1.3.6.1.6.3.11.3.1.1",sysORIndex="3"} 1
sysORID{sysORID="1.3.6.1.6.3.15.2.1.1",sysORIndex="4"} 1
sysORID{sysORID="1.3.6.1.6.3.16.2.2.1",sysORIndex="2"} 1
# HELP sysORLastChange The value of sysUpTime at the time of the most recent change in state or value of any instance of sysORID. - 1.3.6.1.2.1.1.8
# TYPE sysORLastChange gauge
sysORLastChange 8
# HELP sysORUpTime The value of sysUpTime at the time this conceptual row was last instantiated. - 1.3.6.1.2.1.1.9.1.4
# TYPE sysORUpTime gauge
sysORUpTime{sysORIndex="1"} 7
sysORUpTime{sysORIndex="2"} 8
sysORUpTime{sysORIndex="3"} 8
sysORUpTime{sysORIndex="4"} 8
sysORUpTime{sysORIndex="5"} 8
# HELP sysObjectID The vendor's authoritative identification of the network management subsystem contained in the entity - 1.3.6.1.2.1.1.2
# TYPE sysObjectID gauge
sysObjectID{sysObjectID="1.3.6.1.4.1.8072.3.2.10"} 1
# HELP sysUpTime The time (in hundredths of a second) since the network management portion of the system was last re-initialized. - 1.3.6.1.2.1.1.3
# TYPE sysUpTime gauge
sysUpTime 5.3264032e+07
# HELP upsAlarmsPresent The present number of active alarm conditions. - 1.3.6.1.2.1.33.1.6.1
# TYPE upsAlarmsPresent gauge
upsAlarmsPresent 0
# HELP upsAutoRestart Setting this object to 'on' will cause the UPS system to restart after a shutdown if the shutdown occurred during a power loss as a result of either a upsShutdownAfterDelay or an internal battery depleted condition - 1.3.6.1.2.1.33.1.8.5
# TYPE upsAutoRestart gauge
upsAutoRestart 0
# HELP upsBatteryCurrent The present battery current. - 1.3.6.1.2.1.33.1.2.6
# TYPE upsBatteryCurrent gauge
upsBatteryCurrent 2.147483647e+09
# HELP upsBatteryStatus The indication of the capacity remaining in the UPS system's batteries - 1.3.6.1.2.1.33.1.2.1
# TYPE upsBatteryStatus gauge
upsBatteryStatus 2
# HELP upsBatteryTemperature The ambient temperature at or near the UPS Battery casing. - 1.3.6.1.2.1.33.1.2.7
# TYPE upsBatteryTemperature gauge
upsBatteryTemperature 2.147483647e+09
# HELP upsBatteryVoltage The magnitude of the present battery voltage. - 1.3.6.1.2.1.33.1.2.5
# TYPE upsBatteryVoltage gauge
upsBatteryVoltage 821
# HELP upsBypassFrequency The present bypass frequency. - 1.3.6.1.2.1.33.1.5.1
# TYPE upsBypassFrequency gauge
upsBypassFrequency 500
# HELP upsBypassLineIndex The bypass line identifier. - 1.3.6.1.2.1.33.1.5.3.1.1
# TYPE upsBypassLineIndex gauge
upsBypassLineIndex{upsBypassLineIndex="1"} 1
# HELP upsBypassNumLines The number of bypass lines utilized in this device - 1.3.6.1.2.1.33.1.5.2
# TYPE upsBypassNumLines gauge
upsBypassNumLines 1
# HELP upsBypassVoltage The present bypass voltage. - 1.3.6.1.2.1.33.1.5.3.1.2
# TYPE upsBypassVoltage gauge
upsBypassVoltage{upsBypassLineIndex="1"} 220
# HELP upsConfigAudibleStatus The requested state of the audible alarm - 1.3.6.1.2.1.33.1.9.8
# TYPE upsConfigAudibleStatus gauge
upsConfigAudibleStatus 0
# HELP upsConfigHighVoltageTransferPoint The maximum line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.10
# TYPE upsConfigHighVoltageTransferPoint gauge
upsConfigHighVoltageTransferPoint 0
# HELP upsConfigInputFreq The nominal input frequency - 1.3.6.1.2.1.33.1.9.2
# TYPE upsConfigInputFreq gauge
upsConfigInputFreq 0
# HELP upsConfigInputVoltage The magnitude of the nominal input voltage - 1.3.6.1.2.1.33.1.9.1
# TYPE upsConfigInputVoltage gauge
upsConfigInputVoltage 0
# HELP upsConfigLowBattTime The value of upsEstimatedMinutesRemaining at which a lowBattery condition is declared - 1.3.6.1.2.1.33.1.9.7
# TYPE upsConfigLowBattTime gauge
upsConfigLowBattTime 0
# HELP upsConfigLowVoltageTransferPoint The minimum input line voltage allowed before the UPS system transfers to battery backup. - 1.3.6.1.2.1.33.1.9.9
# TYPE upsConfigLowVoltageTransferPoint gauge
upsConfigLowVoltageTransferPoint 0
# HELP upsConfigOutputFreq The nominal output frequency - 1.3.6.1.2.1.33.1.9.4
# TYPE upsConfigOutputFreq gauge
upsConfigOutputFreq 0
# HELP upsConfigOutputPower The magnitude of the nominal true power rating. - 1.3.6.1.2.1.33.1.9.6
# TYPE upsConfigOutputPower gauge
upsConfigOutputPower 0
# HELP upsConfigOutputVA The magnitude of the nominal Volt-Amp rating. - 1.3.6.1.2.1.33.1.9.5
# TYPE upsConfigOutputVA gauge
upsConfigOutputVA 0
# HELP upsConfigOutputVoltage The magnitude of the nominal output voltage - 1.3.6.1.2.1.33.1.9.3
# TYPE upsConfigOutputVoltage gauge
upsConfigOutputVoltage 0
# HELP upsEstimatedChargeRemaining An estimate of the battery charge remaining expressed as a percent of full charge. - 1.3.6.1.2.1.33.1.2.4
# TYPE upsEstimatedChargeRemaining gauge
upsEstimatedChargeRemaining 91
# HELP upsEstimatedMinutesRemaining An estimate of the time to battery charge depletion under the present load conditions if the utility power is off and remains off, or if it were to be lost and remain off. - 1.3.6.1.2.1.33.1.2.3
# TYPE upsEstimatedMinutesRemaining gauge
upsEstimatedMinutesRemaining 34
# HELP upsIdentAgentSoftwareVersion The UPS agent software version - 1.3.6.1.2.1.33.1.1.4
# TYPE upsIdentAgentSoftwareVersion gauge
upsIdentAgentSoftwareVersion{upsIdentAgentSoftwareVersion="V200R001C31B016"} 1
# HELP upsIdentAttachedDevices A string identifying the devices attached to the output(s) of the UPS - 1.3.6.1.2.1.33.1.1.6
# TYPE upsIdentAttachedDevices gauge
upsIdentAttachedDevices{upsIdentAttachedDevices="None"} 1
# HELP upsIdentManufacturer The name of the UPS manufacturer. - 1.3.6.1.2.1.33.1.1.1
# TYPE upsIdentManufacturer gauge
upsIdentManufacturer{upsIdentManufacturer="HUAWEI"} 1
# HELP upsIdentModel The UPS Model designation. - 1.3.6.1.2.1.33.1.1.2
# TYPE upsIdentModel gauge
upsIdentModel{upsIdentModel="UPS2000 2kVA"} 1
# HELP upsIdentName A string identifying the UPS - 1.3.6.1.2.1.33.1.1.5
# TYPE upsIdentName gauge
upsIdentName{upsIdentName="ups2000"} 1
# HELP upsIdentUPSSoftwareVersion The UPS firmware/software version(s) - 1.3.6.1.2.1.33.1.1.3
# TYPE upsIdentUPSSoftwareVersion gauge
upsIdentUPSSoftwareVersion{upsIdentUPSSoftwareVersion="V2R1C1SPC40"} 1
# HELP upsInputFrequency The present input frequency. - 1.3.6.1.2.1.33.1.3.3.1.2
# TYPE upsInputFrequency gauge
upsInputFrequency{upsInputLineIndex="1"} 500
# HELP upsInputLineBads A count of the number of times the input entered an out-of-tolerance condition as defined by the manufacturer - 1.3.6.1.2.1.33.1.3.1
# TYPE upsInputLineBads counter
upsInputLineBads 0
# HELP upsInputLineIndex The input line identifier. - 1.3.6.1.2.1.33.1.3.3.1.1
# TYPE upsInputLineIndex gauge
upsInputLineIndex{upsInputLineIndex="1"} 1
# HELP upsInputNumLines The number of input lines utilized in this device - 1.3.6.1.2.1.33.1.3.2
# TYPE upsInputNumLines gauge
upsInputNumLines 1
# HELP upsInputVoltage The magnitude of the present input voltage. - 1.3.6.1.2.1.33.1.3.3.1.3
# TYPE upsInputVoltage gauge
upsInputVoltage{upsInputLineIndex="1"} 218
# HELP upsOutputCurrent The present output current. - 1.3.6.1.2.1.33.1.4.4.1.3
# TYPE upsOutputCurrent gauge
upsOutputCurrent{upsOutputLineIndex="1"} 56
# HELP upsOutputFrequency The present output frequency. - 1.3.6.1.2.1.33.1.4.2
# TYPE upsOutputFrequency gauge
upsOutputFrequency 500
# HELP upsOutputLineIndex The output line identifier. - 1.3.6.1.2.1.33.1.4.4.1.1
# TYPE upsOutputLineIndex gauge
upsOutputLineIndex{upsOutputLineIndex="1"} 1
# HELP upsOutputNumLines The number of output lines utilized in this device - 1.3.6.1.2.1.33.1.4.3
# TYPE upsOutputNumLines gauge
upsOutputNumLines 1
# HELP upsOutputPercentLoad The percentage of the UPS power capacity presently being used on this output line, i.e., the greater of the percent load of true power capacity and the percent load of VA. - 1.3.6.1.2.1.33.1.4.4.1.5
# TYPE upsOutputPercentLoad gauge
upsOutputPercentLoad{upsOutputLineIndex="1"} 66
# HELP upsOutputPower The present output true power. - 1.3.6.1.2.1.33.1.4.4.1.4
# TYPE upsOutputPower gauge
upsOutputPower{upsOutputLineIndex="1"} 1
# HELP upsOutputSource The present source of output power - 1.3.6.1.2.1.33.1.4.1
# TYPE upsOutputSource gauge
upsOutputSource 3
# HELP upsOutputVoltage The present output voltage. - 1.3.6.1.2.1.33.1.4.4.1.2
# TYPE upsOutputVoltage gauge
upsOutputVoltage{upsOutputLineIndex="1"} 230
# HELP upsRebootWithDuration Setting this object will immediately shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) for a period equal to the indicated number of seconds, after which time the output will be started, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.4
# TYPE upsRebootWithDuration gauge
upsRebootWithDuration 0
# HELP upsSecondsOnBattery If the unit is on battery power, the elapsed time since the UPS last switched to battery power, or the time since the network management subsystem was last restarted, whichever is less - 1.3.6.1.2.1.33.1.2.2
# TYPE upsSecondsOnBattery gauge
upsSecondsOnBattery 0
# HELP upsShutdownAfterDelay Setting this object will shutdown (i.e., turn off) either the UPS output or the UPS system (as determined by the value of upsShutdownType at the time of shutdown) after the indicated number of seconds, or less if the UPS batteries become depleted - 1.3.6.1.2.1.33.1.8.2
# TYPE upsShutdownAfterDelay gauge
upsShutdownAfterDelay 0
# HELP upsShutdownType This object determines the nature of the action to be taken at the time when the countdown of the upsShutdownAfterDelay and upsRebootWithDuration objects reaches zero - 1.3.6.1.2.1.33.1.8.1
# TYPE upsShutdownType gauge
upsShutdownType 0
# HELP upsStartupAfterDelay Setting this object will start the output after the indicated number of seconds, including starting the UPS, if necessary - 1.3.6.1.2.1.33.1.8.3
# TYPE upsStartupAfterDelay gauge
upsStartupAfterDelay 0
# HELP upsTestElapsedTime The amount of time, in TimeTicks, since the test in progress was initiated, or, if no test is in progress, the previous test took to complete - 1.3.6.1.2.1.33.1.7.6
# TYPE upsTestElapsedTime gauge
upsTestElapsedTime 0
# HELP upsTestId The test is named by an OBJECT IDENTIFIER which allows a standard mechanism for the initiation of tests, including the well known tests identified in this document as well as those introduced by a particular implementation, i.e., as documented in the private enterprise MIB definition for the device - 1.3.6.1.2.1.33.1.7.1
# TYPE upsTestId gauge
upsTestId{upsTestId="0"} 1
# HELP upsTestResultsDetail Additional information about upsTestResultsSummary - 1.3.6.1.2.1.33.1.7.4
# TYPE upsTestResultsDetail gauge
upsTestResultsDetail{upsTestResultsDetail="0"} 1
# HELP upsTestResultsSummary The results of the current or last UPS diagnostics test performed - 1.3.6.1.2.1.33.1.7.3
# TYPE upsTestResultsSummary gauge
upsTestResultsSummary 0
# HELP upsTestSpinLock A spin lock on the test subsystem - 1.3.6.1.2.1.33.1.7.2
# TYPE upsTestSpinLock gauge
upsTestSpinLock 0
# HELP upsTestStartTime The value of sysUpTime at the time the test in progress was initiated, or, if no test is in progress, the time the previous test was initiated - 1.3.6.1.2.1.33.1.7.5
# TYPE upsTestStartTime gauge
upsTestStartTime 0

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,97 @@
# goflow2 flow collector: one replica that listens for NetFlow on UDP 2055
# and declares a TCP 8080 container port named "metrics".
resource "kubernetes_deployment" "goflow2" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "goflow2"
    labels = {
      tier = var.tier
      app  = "goflow2"
    }
  }

  spec {
    replicas = 1

    # Selector and pod template labels must stay in sync.
    selector {
      match_labels = { app = "goflow2" }
    }

    template {
      metadata {
        labels = { app = "goflow2" }
      }

      spec {
        container {
          image = "netsampler/goflow2:v2.2.1"
          name  = "goflow2"
          args  = ["-listen", "netflow://:2055"]

          # Memory request equals the memory limit; no CPU limit is set.
          resources {
            limits = {
              memory = "128Mi"
            }
            requests = {
              memory = "128Mi"
              cpu    = "50m"
            }
          }

          # Flow ingestion port (matches the -listen argument above).
          port {
            name           = "netflow"
            protocol       = "UDP"
            container_port = 2055
          }

          port {
            name           = "metrics"
            protocol       = "TCP"
            container_port = 8080
          }
        }
      }
    }
  }
}
# Service for the goflow2 "metrics" port (TCP 8080 -> 8080).
# No `type` is set, so the default Service type applies.
resource "kubernetes_service" "goflow2" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "goflow2"
    labels    = { app = "goflow2" }
  }

  spec {
    # Targets the pods created by the goflow2 deployment.
    selector = { app = "goflow2" }

    port {
      name        = "metrics"
      protocol    = "TCP"
      port        = 8080
      target_port = 8080
    }
  }
}
# NodePort Service that exposes the goflow2 NetFlow listener (UDP 2055)
# on fixed node port 32055.
resource "kubernetes_service" "goflow2-netflow" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "goflow2-netflow"
    labels    = { app = "goflow2" }
  }

  spec {
    type     = "NodePort"
    selector = { app = "goflow2" }

    port {
      name        = "netflow"
      protocol    = "UDP"
      port        = 2055
      target_port = 2055
      node_port   = 32055 # fixed so flow exporters can be pointed at a stable port
    }
  }
}

View file

@ -0,0 +1,132 @@
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
# metadata {
# name = "grafana-pv"
# }
# spec {
# capacity = {
# "storage" = "2Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# nfs {
# path = "/mnt/main/grafana"
# server = var.nfs_server
# }
# # iscsi {
# # target_portal = "iscsi.viktorbarzin.lan:3260"
# # iqn = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
# # lun = 0
# # fs_type = "ext4"
# # }
# }
# }
# }
# Statically provisioned 2Gi PersistentVolume for Alertmanager, backed by the
# NFS CSI driver and pointing at the /mnt/main/alertmanager share.
resource "kubernetes_persistent_volume" "alertmanager_pv" {
  metadata {
    name = "alertmanager-pv"
  }

  spec {
    storage_class_name = "nfs-truenas"
    access_modes       = ["ReadWriteOnce"]

    capacity = {
      "storage" = "2Gi"
    }

    # NFS client options passed through to the mount.
    mount_options = [
      "soft",
      "timeo=30",
      "retrans=3",
      "actimeo=5",
    ]

    persistent_volume_source {
      csi {
        driver        = "nfs.csi.k8s.io"
        volume_handle = "alertmanager-pv"

        volume_attributes = {
          share  = "/mnt/main/alertmanager"
          server = var.nfs_server
        }
      }
    }
  }
}
# resource "kubernetes_persistent_volume_claim" "grafana_pvc" {
# metadata {
# name = "grafana-pvc"
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# }
# spec {
# access_modes = ["ReadWriteOnce"]
# resources {
# requests = {
# "storage" = "2Gi"
# }
# }
# }
# }
# Grafana database password, sourced from the Vault database engine
# (rotated automatically). The ExternalSecret below renders a Secret named
# "grafana-db-creds" holding GF_DATABASE_PASSWORD and re-reads the source
# every 15 minutes, so the Secret follows password rotations.
resource "kubernetes_manifest" "grafana_db_creds" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "grafana-db-creds"
    }

    spec = {
      refreshInterval = "15m"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-database"
      }

      # Remote value is fetched under the local key "password" ...
      data = [
        {
          secretKey = "password"
          remoteRef = {
            key      = "static-creds/mysql-grafana"
            property = "password"
          }
        },
      ]

      # ... and templated into the env-var-shaped key of the target Secret.
      target = {
        name = "grafana-db-creds"
        template = {
          data = {
            GF_DATABASE_PASSWORD = "{{ .password }}"
          }
        }
      }
    }
  }
}
# One ConfigMap per dashboard JSON file found in ./dashboards.
# Each ConfigMap carries the label grafana_dashboard = "1" and stores the file
# under its original file name.
resource "kubernetes_config_map" "grafana_dashboards" {
  # fileset() returns a set of strings, so each.key and each.value are the same
  # file name.
  for_each = fileset("${path.module}/dashboards", "*.json")

  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name

    # File name without ".json", underscores turned into dashes.
    name = "grafana-dashboard-${replace(trimsuffix(each.key, ".json"), "_", "-")}"

    labels = {
      grafana_dashboard = "1"
    }
  }

  data = {
    (each.key) = file("${path.module}/dashboards/${each.key}")
  }
}
# Grafana Helm release. Values come from grafana_chart_values.yaml, rendered
# with the admin password and MySQL host as template variables.
resource "helm_release" "grafana" {
  name       = "grafana"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"

  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  atomic  = true
  timeout = 600

  values = [
    templatefile("${path.module}/grafana_chart_values.yaml", {
      grafana_admin_password = var.grafana_admin_password
      mysql_host             = var.mysql_host
    }),
  ]

  # The ExternalSecret manifest must be applied before the release.
  depends_on = [kubernetes_manifest.grafana_db_creds]
}

View file

@ -0,0 +1,103 @@
# Helm values for the Grafana chart.
# This file is rendered through Terraform templatefile(); the placeholders
# ${grafana_admin_password} and ${mysql_host} are substituted before Helm sees it.

deploymentStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 0
    maxUnavailable: 1
replicas: 2
adminPassword: "${grafana_admin_password}"
resources:
  requests:
    cpu: 50m
    memory: 512Mi
  limits:
    memory: 512Mi
topologySpreadConstraints:
  - maxSkew: 1
    topologyKey: kubernetes.io/hostname
    whenUnsatisfiable: ScheduleAnyway
    labelSelector:
      matchLabels:
        app.kubernetes.io/name: grafana
podAnnotations:
  dependency.kyverno.io/wait-for: "mysql.dbaas:3306"
podDisruptionBudget:
  maxUnavailable: 1
persistence:
  enabled: false # using external mysql
  # NOTE(review): existingClaim has no effect while persistence is disabled
  # (presumably a leftover) - confirm and remove.
  existingClaim: "grafana-pvc"
ingress:
  # Boolean rather than the string "true": any non-empty string (even "false")
  # is truthy in Helm templates.
  enabled: true
  ingressClassName: "traefik"
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: "traefik-rate-limit@kubernetescrd,traefik-csp-headers@kubernetescrd,traefik-crowdsec@kubernetescrd"
    traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
    gethomepage.dev/enabled: "true"
    gethomepage.dev/name: "Grafana"
    gethomepage.dev/description: "Dashboards & observability"
    gethomepage.dev/icon: "grafana.png"
    gethomepage.dev/group: "Core Platform"
    gethomepage.dev/pod-selector: ""
    gethomepage.dev/widget.type: "grafana"
    gethomepage.dev/widget.url: "http://grafana.monitoring.svc.cluster.local"
    gethomepage.dev/widget.username: "admin"
    # NOTE(review): this stores the Grafana admin password in plain text in an
    # Ingress annotation, readable by anyone who can read Ingress objects.
    gethomepage.dev/widget.password: "${grafana_admin_password}"
  tls:
    - secretName: "tls-secret"
      hosts:
        - "grafana.viktorbarzin.me"
  hosts:
    - "grafana.viktorbarzin.me"
sidecar:
  datasources:
    enabled: true
  dashboards:
    enabled: true
    label: "grafana_dashboard"
dashboardProviders:
  # NOTE(review): Grafana's provisioning format expects a `providers:` list of
  # entries with `orgId`; this entry has no list and spells the key `ordId`,
  # so it probably defines no provider at all. Left unchanged on purpose -
  # confirm whether it should be fixed or removed.
  dashboardproviders.yaml:
    apiVersion: 1
    name: default
    ordId: 1
    # folder: ""
    type: "file"
    # disableDeletion: "false"
    # editable: "true"
    options:
      path: "/var/lib/grafana/dashboards/default"
# Injects GF_DATABASE_PASSWORD from the grafana-db-creds Secret.
envFromSecrets:
  - name: grafana-db-creds
    optional: false
env:
  GF_SERVER_ROOT_URL: https://grafana.viktorbarzin.me
grafana.ini:
  database:
    type: mysql
    host: ${mysql_host}:3306
    name: grafana
    user: grafana
    password: $__env{GF_DATABASE_PASSWORD}
    ssl_mode: disable
  auth.anonymous:
    enabled: true
    org_role: Viewer
  # auth.google:
  #   enabled: true
  analytics:
    check_for_updates: "true"
  grafana_net:
    url: "https://grafana.net"
  log:
    mode: "console"
  paths:
    data: "/var/lib/grafana/data"
    logs: "/var/log/grafana"
    plugins: "/var/lib/grafana/plugins"
    provisioning: "/etc/grafana/provisioning"
  security:
    allow_embedding: true # Allow to be iframed
# url: https://grafana.com/api/dashboards/11074/revisions/2/download
# datasources:
#   - name: Prometheus
#     url: http://prometheus-server

View file

@ -0,0 +1,130 @@
# Exporter configuration ("config.yml") for the iDRAC/Redfish exporter.
# The reloader "match" annotation pairs with the "search" annotation on the
# deployment that mounts this ConfigMap.
#
# NOTE(review): the iDRAC credentials (and the hard-coded root/calvin fallback
# under "default") are stored in a plain ConfigMap. Moving them to a Secret
# would need a matching change in the consuming deployment, so it is only
# flagged here.
resource "kubernetes_config_map" "redfish-config" {
  metadata {
    name      = "redfish-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }

  data = {
    # <<- strips the common leading indentation, so only the relative YAML
    # nesting below ends up in the rendered file. Credentials are passed through
    # jsonencode() so values containing YAML-special characters (":", "#",
    # quotes, leading "*", ...) are emitted as valid double-quoted scalars.
    "config.yml" = <<-EOF
      address: 0.0.0.0
      port: 9610
      hosts:
        ${var.idrac_host}:
          username: ${jsonencode(var.idrac_username)}
          password: ${jsonencode(var.idrac_password)}
        default:
          username: root
          password: calvin
      metrics:
        all: true
        # system: true
        # sensors: true
        # power: true
        # sel: false # Disable SEL - often slow
        # storage: true # Disable storage - slowest endpoint
        # memory: true
        # network: false # Disable network adapters
        # firmware: false # Don't need this frequently
    EOF
  }
}
# Single-replica iDRAC exporter. Its configuration comes from the
# "redfish-exporter-config" ConfigMap; the reloader "search" annotation opts
# this deployment into reloads driven by annotated ConfigMaps.
resource "kubernetes_deployment" "idrac-redfish" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "idrac-redfish-exporter"

    labels = {
      tier = var.tier
      app  = "idrac-redfish-exporter"
    }

    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = { app = "idrac-redfish-exporter" }
    }

    template {
      metadata {
        labels = { app = "idrac-redfish-exporter" }
      }

      spec {
        priority_class_name = "tier-1-cluster"

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }

        volume {
          name = "redfish-exporter-config"
          config_map {
            name = "redfish-exporter-config"
          }
        }

        container {
          # Upstream: https://github.com/mrlhansen/idrac_exporter?tab=readme-ov-file
          # The tag is pinned; per the original note, a Kyverno policy sets
          # imagePullPolicy: IfNotPresent for pinned tags.
          name  = "redfish-exporter"
          image = "ghcr.io/mrlhansen/idrac_exporter:2.4.1"

          port {
            container_port = 9610
          }

          # Mount only the "config.yml" key, as a single file at the given path.
          volume_mount {
            name       = "redfish-exporter-config"
            sub_path   = "config.yml"
            mount_path = "/etc/prometheus/idrac.yml"
          }
        }
      }
    }
  }
}
# Service in front of the iDRAC exporter: service port 9090 forwards to the
# container's 9610.
resource "kubernetes_service" "idrac-redfish-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "idrac-redfish-exporter"

    labels = {
      "app" = "idrac-redfish-exporter"
    }

    # Annotation-based scrape discovery, currently disabled:
    # annotations = {
    #   "prometheus.io/scrape" = "true"
    #   "prometheus.io/path"   = "/metrics"
    #   "prometheus.io/port"   = "9090"
    # }
  }

  spec {
    selector = {
      "app" = "idrac-redfish-exporter"
    }

    port {
      name        = "http"
      target_port = "9610"
      port        = "9090"
    }
  }
}
# Ingress for the iDRAC exporter Service (port 9090), built with the shared
# ingress_factory module under the viktorbarzin.lan domain.
# NOTE(review): the meaning of the flags below is defined by ingress_factory,
# which is not visible here; verify against that module.
module "idrac-redfish-exporter-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name      = "idrac-redfish-exporter"
  namespace = kubernetes_namespace.monitoring.metadata[0].name
  port      = 9090

  root_domain     = "viktorbarzin.lan"
  tls_secret_name = var.tls_secret_name

  ssl_redirect            = false
  allow_local_access_only = true
}

View file

@ -0,0 +1,78 @@
---
# Helm values for the k8s-monitoring chart: pod logs only, shipped to Loki.
# NOTE(review): the helm_release that appears to consume this file is commented
# out in this commit - confirm whether these values are currently in use.
cluster:
  name: default
destinations:
  - name: loki
    type: loki
    url: http://loki-gateway.monitoring.svc.cluster.local/loki/api/v1/push
clusterEvents:
  enabled: false
  collector: alloy-logs
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - calibre
    - monitoring
    - ingress-nginx
    - vaultwarden
nodeLogs:
  enabled: false
podLogs:
  enabled: true
  gatherMethod: kubernetesApi
  collector: alloy-logs
  labelsToKeep:
    [
      "app_kubernetes_io_name",
      "container",
      "instance",
      "job",
      "level",
      "namespace",
      "service_name",
      "service_namespace",
      "deployment_environment",
      "deployment_environment_name",
    ]
  structuredMetadata:
    pod: pod # Set structured metadata "pod" from label "pod"
  namespaces:
    - dbaas
    - immich
    - authentik
    - mailserver
    - crowdsec
    - descheduler
    - calibre
    - monitoring
    - ingress-nginx
    - vaultwarden
# Collectors
alloy-singleton:
  enabled: false
alloy-metrics:
  enabled: false
alloy-logs:
  enabled: true
  # Required when using the Kubernetes API to gather pod logs
  alloy:
    mounts:
      varlog: false
    clustering:
      enabled: true
alloy-profiles:
  enabled: false
alloy-receiver:
  enabled: false

View file

@ -0,0 +1,220 @@
# Required string input (no default); must be supplied by the caller.
variable "nfs_server" {
  type = string
}
# LOKI DISABLED - Uncomment to re-enable centralized logging
# Disabled due to operational overhead vs benefit analysis after node2 incident
# All configuration preserved in loki.yaml for future re-enabling
/*
resource "helm_release" "loki" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "loki"
repository = "https://grafana.github.io/helm-charts"
chart = "loki"
values = [templatefile("${path.module}/loki.yaml", {})]
timeout = 600
depends_on = [kubernetes_config_map.loki_alert_rules]
}
*/
# ALLOY DISABLED - Log collection agents (depends on Loki)
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# Configuration preserved in alloy.yaml for future re-enabling
/*
resource "helm_release" "alloy" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "alloy"
repository = "https://grafana.github.io/helm-charts"
chart = "alloy"
values = [file("${path.module}/alloy.yaml")]
atomic = true
depends_on = [helm_release.loki]
}
*/
# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
metadata {
name = "sysctl-inotify"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
app = "sysctl-inotify"
}
}
spec {
selector {
match_labels = {
app = "sysctl-inotify"
}
}
template {
metadata {
labels = {
app = "sysctl-inotify"
}
}
spec {
init_container {
name = "sysctl"
image = "busybox:1.37"
command = [
"sh", "-c",
"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
]
security_context {
privileged = true
}
}
container {
name = "pause"
image = "registry.k8s.io/pause:3.10"
resources {
requests = {
cpu = "1m"
memory = "4Mi"
}
limits = {
cpu = "1m"
memory = "4Mi"
}
}
}
host_pid = true
toleration {
operator = "Exists"
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
*/
# resource "helm_release" "k8s-monitoring" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "k8s-monitoring"
# repository = "https://grafana.github.io/helm-charts"
# chart = "k8s-monitoring"
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
# atomic = true
# }
# LOKI ALERT RULES DISABLED - Depend on Loki log queries
# These alert on kernel events from systemd journal logs via Loki
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "loki_alert_rules" {
metadata {
name = "loki-alert-rules"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"rules.yaml" = yamlencode({
groups = [
{
name = "Node Health"
rules = [
{
alert = "KernelOOMKiller"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOM killer active on {{ $labels.node }}"
}
},
{
alert = "KernelPanic"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Kernel panic on {{ $labels.node }}"
}
},
{
alert = "KernelHungTask"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
for = "0m"
labels = {
severity = "warning"
}
annotations = {
summary = "Hung task detected on {{ $labels.node }}"
}
},
{
alert = "KernelSoftLockup"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Soft lockup on {{ $labels.node }}"
}
},
{
alert = "ContainerdDown"
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "containerd service unhealthy on {{ $labels.node }}"
}
},
]
}
]
})
}
}
*/
# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "grafana_loki_datasource" {
metadata {
name = "grafana-loki-datasource"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_datasource = "1"
}
}
data = {
"loki-datasource.yaml" = yamlencode({
apiVersion = 1
datasources = [{
name = "Loki"
type = "loki"
access = "proxy"
url = "http://loki.monitoring.svc.cluster.local:3100"
isDefault = false
}]
})
}
}
*/

View file

@ -0,0 +1,109 @@
# Helm values for Loki: SingleBinary deployment mode, one replica, TSDB index
# with filesystem object storage on a persistent volume.
loki:
commonConfig:
# Single instance, so every stream is stored exactly once.
replication_factor: 1
schemaConfig:
configs:
- from: "2025-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
ingester:
chunk_idle_period: 12h
max_chunk_age: 24h
chunk_retain_period: 1m
# 1572864 bytes = 1.5 MiB
chunk_target_size: 1572864
wal:
# Same path as the mountPath of the "wal" volume declared under extraVolumeMounts.
dir: /loki-wal
pattern_ingester:
enabled: true
limits_config:
allow_structured_metadata: true
volume_enabled: true
# 720h = 30 days
retention_period: 720h
compactor:
retention_enabled: true
working_directory: /var/loki/compactor
compaction_interval: 1h
delete_request_store: filesystem
ruler:
enable_api: true
storage:
type: local
local:
# The "rules" volume (ConfigMap loki-alert-rules) is mounted one level below, at /var/loki/rules/fake.
directory: /var/loki/rules
alertmanager_url: http://prometheus-alertmanager.monitoring.svc.cluster.local:9093
ring:
kvstore:
store: inmemory
rule_path: /var/loki/scratch
storage:
type: "filesystem"
auth_enabled: false
minio:
enabled: false
deploymentMode: SingleBinary
singleBinary:
replicas: 1
persistence:
enabled: true
size: 50Gi
storageClass: "iscsi-truenas"
extraVolumes:
# Memory-backed emptyDir used for the ingester WAL, capped at 2Gi.
- name: wal
emptyDir:
medium: Memory
sizeLimit: 2Gi
# Alert rules supplied through the loki-alert-rules ConfigMap.
- name: rules
configMap:
name: loki-alert-rules
extraVolumeMounts:
- name: wal
mountPath: /loki-wal
- name: rules
# NOTE(review): "fake" is presumably the tenant directory Loki uses when auth_enabled is false — confirm.
mountPath: /var/loki/rules/fake
resources:
requests:
cpu: 250m
memory: 2Gi
limits:
memory: 4Gi
# Zero out replica counts of other deployment modes
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
# Disable optional components for single binary mode
gateway:
enabled: false
chunksCache:
enabled: false
resultsCache:
enabled: false

View file

@ -0,0 +1,214 @@
# ---------------------------------------------------------------------------
# Module inputs.
# Every variable is explicitly typed, and every credential is marked sensitive
# so Terraform redacts it from plan/apply output.
# ---------------------------------------------------------------------------

# Name of the TLS secret used by the ingresses in this module.
variable "tls_secret_name" {
  type = string
}

# Passed to the Prometheus chart values template as `alertmanager_mail_pass`.
variable "alertmanager_account_password" {
  type      = string
  sensitive = true
}

variable "idrac_host" {
  type    = string
  default = "192.168.1.4"
}

variable "idrac_username" {
  type    = string
  default = "root"
}

# NOTE(review): the default is a hard-coded credential kept only for backward
# compatibility; callers should override it with the real secret.
variable "idrac_password" {
  type      = string
  default   = "calvin"
  sensitive = true
}

# Passed to the Prometheus chart values template as `alertmanager_slack_api_url`.
variable "alertmanager_slack_api_url" {
  type      = string
  sensitive = true
}

variable "tiny_tuya_service_secret" {
  type      = string
  sensitive = true
}

variable "haos_api_token" {
  type      = string
  sensitive = true
}

# Password written into the pve-exporter config secret.
variable "pve_password" {
  type      = string
  sensitive = true
}

variable "grafana_admin_password" {
  type      = string
  sensitive = true
}

# Value of the `tier` label applied to the namespace and workloads.
variable "tier" {
  type = string
}

variable "mysql_host" {
  type = string
}
# Namespace shared by the resources in this module.
resource "kubernetes_namespace" "monitoring" {
  metadata {
    name = "monitoring"

    labels = {
      tier              = var.tier
      "istio-injection" = "disabled"
      # NOTE(review): presumably signals that this namespace brings its own
      # ResourceQuota instead of the per-tier default — confirm against the
      # quota tooling.
      "resource-governance/custom-quota" = "true"
    }
  }
}
# Instantiates the shared setup_tls_secret module for the monitoring namespace.
module "tls_secret" {
  source = "../../../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
}
# Terraform gets angry with the 30k values file :/ Use Ansible until this is solved.
# resource "helm_release" "ups_prometheus_snmp_exporter" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "ups_prometheus_exporter"
# repository = "https://prometheus-community.github.io/helm-charts"
# chart = "prometheus-snmp-exporter"
# values = [file("${path.module}/ups_snmp_values.yaml")]
# }
# Watchdog for Prometheus itself: every 30 minutes, probe the prometheus-server
# service and post a message to the webhook if the probe fails.
#
# The probe uses `--fail` so an HTTP error status (>= 400) counts as "down",
# not only a refused or timed-out connection, and `--max-time` so a response
# that stalls after connecting cannot hang the job.
#
# NOTE(review): metadata has no `namespace`, so this CronJob is not created in
# the monitoring namespace managed by this module. Left unchanged because
# setting it would force the resource to be replaced — confirm whether that is
# intended.
resource "kubernetes_cron_job_v1" "monitor_prom" {
  metadata {
    name = "monitor-prometheus"
  }
  spec {
    concurrency_policy        = "Replace"
    failed_jobs_history_limit = 5
    schedule                  = "*/30 * * * *"
    job_template {
      metadata {
      }
      spec {
        template {
          metadata {
          }
          spec {
            container {
              name  = "monitor-prometheus"
              image = "alpine"
              command = [
                "/bin/sh", "-c",
                "apk add --update curl && curl --fail --connect-timeout 2 --max-time 10 prometheus-server.monitoring.svc.cluster.local || curl https://webhook.viktorbarzin.me/fb/message-viktor -d 'Prometheus is down!'"
              ]
            }
          }
        }
      }
    }
  }
}
# Traefik Middleware that permanently redirects every request (regex ".*") to
# the HetrixTools report page below. Attached to the status ingress via its
# router.middlewares annotation.
resource "kubernetes_manifest" "status_redirect_middleware" {
  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "status-redirect"
    }

    spec = {
      redirectRegex = {
        permanent   = true
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/38981b548b5d38b052aca8d01285a3f3/"
      }
    }
  }
}
# Ingress for status.viktorbarzin.me. It exists only to attach the
# status-redirect middleware; the backend service is a placeholder because the
# middleware redirects the request before it would be routed.
resource "kubernetes_ingress_v1" "status" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "hetrix-redirect-ingress"

    annotations = {
      "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
      "traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-status-redirect@kubernetescrd"
    }
  }

  spec {
    ingress_class_name = "traefik"

    tls {
      secret_name = var.tls_secret_name
      hosts       = ["status.viktorbarzin.me"]
    }

    rule {
      host = "status.viktorbarzin.me"
      http {
        path {
          path = "/"
          backend {
            service {
              # Placeholder: redirected by middleware
              name = "not-used"
              port {
                number = 80
              }
            }
          }
        }
      }
    }
  }
}
# Traefik Middleware that permanently redirects every request (regex ".*") to
# the HetrixTools report page below. Attached to the yotovski status ingress
# via its router.middlewares annotation.
resource "kubernetes_manifest" "yotovski_redirect_middleware" {
  manifest = {
    apiVersion = "traefik.io/v1alpha1"
    kind       = "Middleware"

    metadata = {
      namespace = kubernetes_namespace.monitoring.metadata[0].name
      name      = "yotovski-redirect"
    }

    spec = {
      redirectRegex = {
        permanent   = true
        regex       = ".*"
        replacement = "https://hetrixtools.com/r/2ba9d7a5e017794db0fd91f0115a8b3b/"
      }
    }
  }
}
# Ingress for yotovski-status.viktorbarzin.me. It exists only to attach the
# yotovski-redirect middleware; the backend service is a placeholder because
# the middleware redirects the request before it would be routed.
resource "kubernetes_ingress_v1" "status_yotovski" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "hetrix-yotovski-redirect-ingress"

    annotations = {
      "traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
      "traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-yotovski-redirect@kubernetescrd"
    }
  }

  spec {
    ingress_class_name = "traefik"

    tls {
      secret_name = var.tls_secret_name
      hosts       = ["yotovski-status.viktorbarzin.me"]
    }

    rule {
      host = "yotovski-status.viktorbarzin.me"
      http {
        path {
          path = "/"
          backend {
            service {
              # Placeholder: redirected by middleware
              name = "not-used"
              port {
                number = 80
              }
            }
          }
        }
      }
    }
  }
}
# Dedicated ResourceQuota for the monitoring namespace. It is larger than the
# default 1-cluster tier quota because monitoring runs 29+ pods (Prometheus,
# Grafana, exporters, etc.; Loki and Alloy as well when they are re-enabled).
resource "kubernetes_resource_quota" "monitoring" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "monitoring-quota"
  }

  spec {
    hard = {
      pods              = "100"
      "requests.cpu"    = "16"
      "requests.memory" = "16Gi"
      "limits.memory"   = "64Gi"
    }
  }
}

View file

@ -0,0 +1,31 @@
# 200Gi ReadWriteOnce claim named "prometheus-data" on the iscsi-truenas
# storage class.
# NOTE(review): presumably referenced by name from the Prometheus chart values
# template — confirm there.
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "prometheus-data"
  }

  spec {
    storage_class_name = "iscsi-truenas"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "200Gi"
      }
    }
  }
}
# Prometheus from the prometheus-community chart, pinned to chart 25.8.2.
# Chart values are rendered from prometheus_chart_values.tpl with the secrets
# listed below.
resource "helm_release" "prometheus" {
  name             = "prometheus"
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus"
  # version = "15.0.2"
  version = "25.8.2"

  values = [
    templatefile("${path.module}/prometheus_chart_values.tpl", {
      alertmanager_mail_pass     = var.alertmanager_account_password
      alertmanager_slack_api_url = var.alertmanager_slack_api_url
      tuya_api_key               = var.tiny_tuya_service_secret
      haos_api_token             = var.haos_api_token
    })
  ]
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,122 @@
# Config file for the Proxmox VE exporter, stored as a Secret because it
# contains the PVE password. The Deployment mounts the "pve.yml" key under
# /etc/prometheus.
#
# The document is produced with yamlencode() rather than a hand-written
# heredoc, so a password containing YAML-significant characters (`#`, `:`,
# quotes, a leading `*` or `&`, ...) is always quoted correctly and cannot
# corrupt the file.
resource "kubernetes_secret" "pve_exporter_config" {
  metadata {
    name      = "pve-exporter-config"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  data = {
    "pve.yml" = yamlencode({
      default = {
        user       = "root@pam"
        password   = var.pve_password
        verify_ssl = false
        timeout    = 30
      }
    })
  }
}
# Single-replica Deployment of the Proxmox VE exporter (container port 9221).
# The exporter config comes from the pve-exporter-config Secret, mounted
# read-only at /etc/prometheus/pve.yml.
resource "kubernetes_deployment" "pve_exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "proxmox-exporter"
    labels = {
      tier = var.tier
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "proxmox-exporter"
      }
    }

    template {
      metadata {
        labels = {
          app = "proxmox-exporter"
        }
      }

      spec {
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }

        # Projects only the "pve.yml" key of the Secret into the volume.
        volume {
          name = "config-volume"
          secret {
            secret_name = kubernetes_secret.pve_exporter_config.metadata[0].name
            items {
              key  = "pve.yml"
              path = "pve.yml" # This results in /etc/prometheus/pve.yml
            }
          }
        }

        container {
          name  = "proxmox-exporter"
          image = "prompve/prometheus-pve-exporter:latest"

          port {
            container_port = 9221
          }

          resources {
            limits = {
              memory = "256Mi"
            }
            requests = {
              cpu    = "15m"
              memory = "256Mi"
            }
          }

          # Mount the file into the container
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/prometheus"
            read_only  = true
          }
        }
      }
    }
  }
}
# Service in front of the Proxmox exporter pods (port 9221).
# The prometheus.io/* annotations describe the scrape: path /pve on port 9221,
# plus param_* entries.
# NOTE(review): param_* are presumably turned into URL query parameters
# (target, node, cluster) by the Prometheus relabel config — confirm in the
# chart values.
resource "kubernetes_service" "proxmox-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "proxmox-exporter"

    labels = {
      app = "proxmox-exporter"
    }

    annotations = {
      "prometheus.io/scrape"        = "true"
      "prometheus.io/path"          = "/pve"
      "prometheus.io/port"          = "9221"
      "prometheus.io/param_target"  = "192.168.1.127"
      "prometheus.io/param_node"    = "1"
      "prometheus.io/param_cluster" = "1"
    }
  }

  spec {
    selector = {
      app = "proxmox-exporter"
    }

    port {
      name        = "http"
      port        = 9221
      target_port = 9221
    }
  }
}
# To monitor the pve node, use the node exporter and the playbook in this repo. from the root run:
# ansible-playbook -i ./playbooks/inventory.ini ./playbooks/deploy_node_exporter.yaml
# This installs the exporter binary

View file

@ -0,0 +1,51 @@
import asyncio
import logging
import os
import signal
import sys
import time
import aiohttp
# Host name of the iDRAC controller.
iDRAC_HOST = "idrac"
# Names of the environment variables main() reads the credentials from.
iDRAC_USER_ENV_VAR = "idrac_user"
iDRAC_PASSWORD_ENV_VAR = "idrac_password"
# Run flag: cleared by signal_handler(), polled by monitor().
SHOULD_RUN = True
def signal_handler(sig, frame):
    """Request a graceful shutdown by clearing the module-level run flag.

    The handler only logs and flips ``SHOULD_RUN``; the polling loop that
    checks the flag then finishes on its own and the program exits normally.

    It deliberately does not sleep or call ``sys.exit``. A signal handler runs
    on the main thread, so sleeping here would freeze the very loop that is
    supposed to observe the flag, and exiting afterwards would make the flag
    pointless.

    Args:
        sig: Number of the received signal.
        frame: Current stack frame supplied by the ``signal`` module (unused).
    """
    global SHOULD_RUN
    logging.warning(f'signal {sig} received. shutting down gracefully...')
    SHOULD_RUN = False
async def main() -> None:
    """Install the shutdown handlers, read the credentials, and start monitoring.

    The iDRAC user and password are read from the environment variables named
    by ``iDRAC_USER_ENV_VAR`` and ``iDRAC_PASSWORD_ENV_VAR``. If either one is
    missing, a critical message is logged and the function returns without
    starting the monitor.
    """
    # Handle SIGTERM the same way as SIGINT: service managers and container
    # runtimes stop processes with SIGTERM, which would otherwise bypass the
    # graceful-shutdown handler.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, signal_handler)

    user = os.environ.get(iDRAC_USER_ENV_VAR)
    if user is None:
        logging.critical('missing environment variable for idrac user'
                         f' please set {iDRAC_USER_ENV_VAR}')
        return

    password = os.environ.get(iDRAC_PASSWORD_ENV_VAR)
    if password is None:
        logging.critical('missing environment variable for idrac password'
                         f' please set {iDRAC_PASSWORD_ENV_VAR}')
        return

    logging.info('service initiated with credentials')
    return await monitor(user, password)
async def monitor(user: str, password: str, poll_interval: float = 1.0) -> None:
    """Run until the module-level ``SHOULD_RUN`` flag is cleared.

    The loop body is still a placeholder: no iDRAC request is made yet. Each
    iteration awaits ``asyncio.sleep`` instead of spinning, so the coroutine
    yields to the event loop and does not burn CPU in a busy loop.

    Args:
        user: iDRAC user name (currently unused).
        password: iDRAC password (currently unused).
        poll_interval: Seconds to wait between checks of the run flag.
    """
    while SHOULD_RUN:
        await asyncio.sleep(poll_interval)
if __name__ == "__main__":
    # Abandoned because the server cannot start itself when it's off :/
    asyncio.run(main())

View file

@ -0,0 +1,66 @@
#!/bin/sh
# Power-loss watchdog that drives the server through the iDRAC Redfish API.
#
# One invocation does the following (a lock file keeps it single-instance):
#   1. If the state file exists (a previous run shut the server down), poll
#      iDRAC once a minute. As soon as it answers, send a power-on command and
#      remove the state file.
#   2. Otherwise read the line input voltage of PSU slot 2. If it is above
#      zero, mains power is present: exit.
#   3. If not, re-check once a minute for $to_wait minutes. If the voltage
#      never comes back, send a graceful shutdown and create the state file.
#
# Requires: curl, jq, logger.

tag=server-power-cycle-script
lock_file=/tmp/server-power-cycle-lock
state_file=/root/server-power-cycle/state.off

# The defaults are the values that used to be hard-coded; each can be
# overridden through the environment.
idrac_host=${IDRAC_HOST:-192.168.1.4}
idrac_auth=${IDRAC_AUTH:-root:calvin}
to_wait=${TO_WAIT:-30} # minutes to wait for power to return before shutting down

# Guard against a non-numeric override: the countdown loop would be skipped
# and the server shut down immediately.
case $to_wait in
    '' | *[!0-9]*) to_wait=30 ;;
esac

psu_url="https://$idrac_host/redfish/v1/Chassis/System.Embedded.1/Power/PowerSupplies/PSU.Slot.2"
reset_url="https://$idrac_host/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset"

log() {
    logger -t "$tag" "$@"
}

# GET the PSU resource. Body goes to stdout; non-zero exit status when iDRAC
# cannot be reached.
psu_query() {
    curl --connect-timeout 5 -s -k -u "$idrac_auth" \
        -H "Content-type: application/json" -X GET "$psu_url"
}

# Succeeds only when iDRAC reports a line input voltage greater than zero.
# A failed request, an empty body, or a null/missing field all count as
# "no power".
psu_has_power() {
    psu_query | jq -e '(.LineInputVoltage // 0) > 0' >/dev/null 2>&1
}

# $1: Redfish ResetType to request (On, GracefulShutdown).
idrac_reset() {
    curl --connect-timeout 5 -s -k -u "$idrac_auth" -X POST \
        -d "{\"Action\": \"Reset\", \"ResetType\": \"$1\"}" \
        -H "Content-type: application/json" "$reset_url"
}

log start "$(date '+%F-%R')"

# With noclobber (set -C) the redirect fails if the file already exists, so
# "check" and "create" are a single atomic step.
if ! (set -C; : >"$lock_file") 2>/dev/null; then
    log 'Script already running. exiting'
    exit 0
fi
# From here on this process owns the lock: release it on every exit path,
# including termination by a signal.
trap 'rm -f "$lock_file"' EXIT
trap 'exit 1' HUP INT TERM

if [ -f "$state_file" ]; then
    log 'Server state set to off'
    while true; do
        sleep 60 # sleep 1 minute
        log 'Trying to connect to idrac system...'
        if psu_query >/dev/null; then
            log "Connected to idrac, assuming power is back on"
            log "Power supply restored, sending power on command"
            idrac_reset On
            rm -f "$state_file"
            log end "$(date '+%F-%R')"
            exit 0
        fi
    done
fi

# Check the input voltage on the power supply connected to the outer system.
if psu_has_power; then
    log "power supply is on. exiting"
    log end "$(date '+%F-%R')"
    exit 0
fi

echo "Continuously checking power supply for the next $to_wait minutes"
i=1
while [ "$i" -le "$to_wait" ]; do
    log "Sleeping a minute..Minute $i"
    sleep 60
    # Check the input voltage on the power supply connected to the outer system.
    if psu_has_power; then
        log "power supply is on. exiting"
        log end "$(date '+%F-%R')"
        exit 0
    fi
    i=$((i + 1))
done

log "Power supply did not come back, sending graceful shutdown signal"
idrac_reset GracefulShutdown
touch "$state_file"
log end "$(date '+%F-%R')"

View file

@ -0,0 +1,130 @@
/**
1. clone snmp exporter
2. update generator.yaml to include only interesting modules
3. make generate
4. cp snmp.yml to wherever it is used
5. scrape service with curl 'http://snmp-exporter.monitoring.svc.cluster.local:9116/snmp?auth=public_v2&module=huawei&target=192.168.1.5%3A161'
generate reference - https://github.com/prometheus/snmp_exporter/tree/main/generator
https://sbcode.net/prometheus/snmp-generate-huawei/
*/
# ConfigMap holding the generated snmp_exporter configuration: the contents of
# ups_snmp_values.yaml are published under the key "snmp.yml". The reloader
# "match" annotation pairs with the "search" annotation on the Deployment.
resource "kubernetes_config_map" "snmp-exporter-yaml" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "snmp-exporter-yaml"

    annotations = {
      "reloader.stakater.com/match" = "true"
    }
  }

  data = {
    "snmp.yml" = file("${path.module}/ups_snmp_values.yaml")
  }
}
# Single-replica Deployment of prom/snmp-exporter (container port 9116).
# The exporter configuration is mounted at /etc/snmp_exporter/ from the
# snmp-exporter-yaml ConfigMap.
resource "kubernetes_deployment" "snmp-exporter" {
  metadata {
    name      = "snmp-exporter"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      app  = "snmp-exporter"
      tier = var.tier
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
    }
  }
  spec {
    replicas = 1
    selector {
      match_labels = {
        app = "snmp-exporter"
      }
    }
    template {
      metadata {
        labels = {
          app = "snmp-exporter"
        }
      }
      spec {
        container {
          image = "prom/snmp-exporter"
          name  = "snmp-exporter"
          resources {
            requests = {
              cpu    = "10m"
              memory = "256Mi"
            }
            limits = {
              memory = "256Mi"
            }
          }
          port {
            container_port = 9116
          }
          volume_mount {
            name       = "config-volume"
            mount_path = "/etc/snmp_exporter/"
          }
        }
        volume {
          name = "config-volume"
          config_map {
            # Reference the resource instead of repeating its name, so
            # Terraform creates the ConfigMap before this Deployment and the
            # two names cannot drift apart.
            name = kubernetes_config_map.snmp-exporter-yaml.metadata[0].name
          }
        }
        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }
}
# Service in front of the snmp-exporter pods (port 9116). The scrape
# annotations are intentionally left commented out.
resource "kubernetes_service" "snmp-exporter" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "snmp-exporter"

    labels = {
      app = "snmp-exporter"
    }
    # annotations = {
    # "prometheus.io/scrape" = "true"
    # "prometheus.io/path" = "/snmp?auth=Public0&target=tcp%3A%2F%2F192.%3A161"
    # "prometheus.io/port" = "9116"
    # }
  }

  spec {
    selector = {
      app = "snmp-exporter"
    }

    port {
      name        = "http"
      port        = 9116
      target_port = 9116
    }
  }
}
# Ingress for the snmp-exporter service on port 9116 under the
# viktorbarzin.lan domain, built with the shared ingress_factory module,
# restricted to local access and without an SSL redirect.
module "snmp-exporter-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "snmp-exporter"
  namespace       = kubernetes_namespace.monitoring.metadata[0].name
  port            = 9116
  root_domain     = "viktorbarzin.lan"
  tls_secret_name = var.tls_secret_name

  allow_local_access_only = true
  ssl_redirect            = false
}

File diff suppressed because it is too large Load diff