2021-02-07 23:45:55 +00:00
|
|
|
# Helm values
|
|
|
|
|
# all values - https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
|
|
|
|
|
# Alertmanager settings for the prometheus chart.
alertmanager:
  # Persistent storage is disabled; the commented keys are alternative
  # storage options that are currently not in use.
  persistentVolume:
    enabled: false
    # existingClaim: alertmanager-iscsi-pvc
    # storageClass: rook-cephfs
  strategy:
    type: Recreate
  ingress:
    # A real boolean, not the string "true": any non-empty string (including
    # "false") is truthy in a Helm `if`, so a string here cannot be used to
    # switch the ingress off.
    enabled: true
    annotations:
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted ca certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "alertmanager.viktorbarzin.me"
    hosts:
      - "alertmanager.viktorbarzin.me"
|
# Contents of the Alertmanager configuration file.
alertmanagerFiles:
  alertmanager.yml:
    global:
      # Outgoing mail settings; the password is substituted from the
      # alertmanager_mail_pass template variable.
      smtp_from: 'alertmanager@viktorbarzin.me'
      # smtp_smarthost: "smtp.viktorbarzin.me:587"
      smtp_smarthost: 'mailserver.mailserver.svc.cluster.local:587'
      smtp_auth_username: 'alertmanager@viktorbarzin.me'
      smtp_auth_password: "${alertmanager_mail_pass}"
      smtp_require_tls: true
      # Slack webhook, substituted from the alertmanager_slack_api_url
      # template variable.
      slack_api_url: "${alertmanager_slack_api_url}"
    templates:
      - '/etc/alertmanager/template/*.tmpl'
    # Single route: every alert is grouped by alert name and delivered to the
    # one receiver defined below.
    route:
      group_by:
        - 'alertname'
      group_wait: 3s
      group_interval: 5s
      repeat_interval: 1h
      receiver: ALL
    receivers:
      # Notifies by both e-mail and Slack, including resolved notifications.
      - name: ALL
        email_configs:
          - to: 'me@viktorbarzin.me'
            send_resolved: true
            tls_config:
              # NOTE(review): certificate verification is disabled for the
              # SMTP connection - confirm this is intentional.
              insecure_skip_verify: true
        slack_configs:
          - send_resolved: true
            channel: '#general'
|
2021-02-07 23:45:55 +00:00
|
|
|
|
|
|
|
|
# Prometheus server settings for the prometheus chart.
server:
  extraFlags:
    # Enable me to delete metrics
    # - "web.enable-admin-api"
    - "storage.tsdb.allow-overlapping-blocks"
    # - "storage.tsdb.retention.size=1GB"
  # Data is stored on a pre-existing claim rather than a chart-created one.
  persistentVolume:
    # enabled: false
    existingClaim: prometheus-iscsi-pvc
    # storageClass: rook-cephfs
  retention: "12w"  # ~100GB storage
  strategy:
    type: Recreate
  ingress:
    # A real boolean, not the string "true": any non-empty string (including
    # "false") is truthy in a Helm `if`, so a string here cannot be used to
    # switch the ingress off.
    enabled: true
    annotations:
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted ca certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "prometheus.viktorbarzin.me"
    hosts:
      - "prometheus.viktorbarzin.me"
  # Alertmanager instances that Prometheus sends alerts to.
  alertmanagers:
    - static_configs:
        - targets:
            - "prometheus-alertmanager.monitoring.svc.cluster.local"
            # - "alertmanager.viktorbarzin.me"
      tls_config:
        # NOTE(review): certificate verification is disabled for the
        # Alertmanager connection - confirm this is intentional.
        insecure_skip_verify: true
|
|
|
|
|
|
|
|
|
|
# Files rendered into the Prometheus server configuration. Only the alerting
# rules are active; the prometheus.yml overrides below are commented out.
serverFiles:
  # prometheus.yml:
  #   storage:
  #     tsdb:
  #       # no_lockfile: true
  #       # max_blocks_in_cache: 100000
  #       # max_lookback_duration: 0s
  #       # min_block_duration: 2h
  #       # retention: 15d
  #       # chunk_encoding: 1
  #       # chunk_range: 1h
  #       # max_chunks_to_persist: 4800
  #       # chunks_to_persist: 4800
  #       cache:
  #         entries: 5000
  #       head:
  #         chunk_bytes: 1048576
  #         # wal:
  #         #   compressions: 1
  #         #   flush_after_seconds: 30
  #         #   segment_size: 1073741824
  #       series_file:
  #         # no_sync: true
  #         # max_concurrent_writes: 256
  #         # block_size: 262144
  #         cache:
  #           max_size: 1073741824
  #   alertingaaa:
  #     alertmanagers:
  #       - static_configs:
  #           targets: "alertmanager.viktorbarzin.lan"
  alerting_rules.yml:
    # Every rule carries `severity: page`. Several expressions use
    # `or on() vector(0)` so that a missing series evaluates to 0 and the
    # comparison still fires instead of returning no data.
    groups:
      - name: NodeDown
        rules:
          - alert: NodeDown
            expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
            for: 1m
            labels:
              severity: page
            annotations:
              summary: "Node {{$labels.instance}} down."
      - name: NodeHighCPUUsage
        rules:
          - alert: NodeHighCPUUsage
            expr: node_load1 > 2
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "High CPU usage on node. Node load: {{ $value }}"
      - name: NodeLowFreeMemory
        rules:
          - alert: NodeLowFreeMemory
            # Fires when available memory stays below 500000000 bytes.
            expr: node_memory_MemAvailable_bytes < 500000000
            for: 10m
            labels:
              severity: page
            annotations:
              # $value is the available-memory reading in bytes, not a load
              # figure, so the message says so.
              summary: "Low free memory on node. Available memory: {{ $value }} bytes"
      # - name: PodStuckNotReady
      #   rules:
      #     - alert: PodStuckNotReady
      #       expr: kube_pod_status_ready{condition="true"} == 0
      #       for: 5m
      #       labels:
      #         severity: page
      #       annotations:
      #         summary: Pod stuck not ready.
      - name: ReadyPodsInDeploymentLessThanSpec
        rules:
          - alert: ReadyPodsInDeploymentLessThanSpec
            expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Number of ready pods in deployment is less than what is defined in spec.
      - name: PowerOutage
        rules:
          # No `for:` clause, so this fires on the first evaluation that
          # matches.
          - alert: PowerOutage
            expr: r730_idrac_powerSupplyCurrentInputVoltage < 200
            labels:
              severity: page
            annotations:
              summary: Power voltage on a power supply is critically low indicating power outage.
      - name: HighPowerUsage
        rules:
          - alert: HighPowerUsage
            expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
            for: 60m
            labels:
              severity: page
            annotations:
              summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}"
      - name: NoNodeLoadData
        rules:
          - alert: NoNodeLoadData
            # `or` written in lower case to match the other rules; PromQL
            # keywords are case-insensitive.
            expr: (node_load1 or on() vector(0)) == 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: No node load data. Can signal that prometheus is not scraping
      - name: NoiDRACData
        rules:
          - alert: NoiDRACData
            expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
      - name: OpenWRT High Memory Usage
        rules:
          - alert: OpenWRT High Memory Usage
            # Used-memory percentage derived from available and total bytes.
            expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90
            for: 10m
            labels:
              severity: page
            annotations:
              summary: OpenWRT high memory usage. Can cause services getting stuck.
      - name: Mailserver Down
        rules:
          - alert: Mail server has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Mail server has no available replicas. This means mail may not be received.
      - name: Hackmd Down
        rules:
          - alert: Hackmd has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
            for: 1m
            labels:
              severity: page
            annotations:
              summary: Hackmd has no available replicas.
      - name: Privatebin Down
        rules:
          - alert: Privatebin has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Privatebin has no available replicas.
      - name: London OpenWRT Down
        rules:
          - alert: OpenWRT client unreachable
            expr: (openwrt_node_openwrt_info or on() vector(0)) == 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: London OpenWRT router unreachable through VPN
      - name: London OpenWRT High System Load
        rules:
          - alert: OpenWRT high system load
            expr: openwrt_node_load1 > 0.9
            for: 15m
            labels:
              severity: page
            annotations:
              summary: High system load on OpenWRT
      - name: Finance app webhook exceptions
        rules:
          - alert: Finance app webhook exceptions
            expr: changes(webhook_failure_total[5m]) >= 1
            for: 1m
            labels:
              severity: page
            annotations:
              summary: Finance app webhook exceptions
      - name: Finance app unhandled exceptions
        rules:
          - alert: Finance app unhandled exceptions
            expr: changes(flask_http_request_exceptions_total[5m]) >= 1
            for: 1m
            labels:
              severity: page
            annotations:
              summary: Finance app unhandled exceptions
|
2021-02-07 23:45:55 +00:00
|
|
|
|
|
|
|
|
# Additional scrape jobs, passed to the chart as a literal string.
#   snmp-idrac    - scraped via prometheus-snmp-exporter using the dell_idrac
#                   module; metric names are prefixed with r730_idrac_.
#   redfish-idrac - scraped via idrac-redfish-exporter every 5m; metric names
#                   are prefixed with r730_idrac_.
#   openwrt       - scraped at 10.0.20.100:9100; metric names are prefixed
#                   with openwrt_.
# NOTE(review): `$$` in the replacement patterns looks like template escaping
# for a literal `$` - confirm against whatever renders this file.
extraScrapeConfigs: |
  - job_name: 'snmp-idrac'
    static_configs:
      - targets:
          - "idrac.viktorbarzin.me:161"
    metrics_path: '/snmp'
    params:
      module: [dell_idrac]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'prometheus-snmp-exporter.monitoring.svc.cluster.local:9116'
    metric_relabel_configs:
      - source_labels: [__name__]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'redfish-idrac'
    scrape_interval: 5m
    scrape_timeout: 2m
    metrics_path: /redfish
    static_configs:
      - targets:
          - idrac.viktorbarzin.lan
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: idrac-redfish-exporter.monitoring.svc.cluster.local:9090
    metric_relabel_configs:
      - source_labels: [__name__]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'openwrt'
    static_configs:
      - targets:
          # - "home.viktorbarzin.lan:9100"
          - "10.0.20.100:9100"
    metrics_path: '/metrics'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        # replacement: 'home.viktorbarzin.lan:9100'
        replacement: '10.0.20.100:9100'
    metric_relabel_configs:
      - source_labels: [__name__]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'openwrt_$${1}'
|