# File: infra/modules/kubernetes/monitoring/prometheus_chart_values.tpl
# Helm values
# all values - https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
# Alertmanager component: no persistent volume, Recreate rollout strategy, and
# an nginx ingress that requires a client certificate.
alertmanager:
  persistentVolume:
    enabled: false
    # existingClaim: alertmanager-iscsi-pvc
    # storageClass: rook-cephfs
  strategy:
    type: Recreate
  ingress:
    # Real boolean instead of the string "true": in a Helm `if`, any non-empty
    # string (even "false") counts as true, so a string here is a trap.
    enabled: true
    annotations:
      # Annotation values must stay strings, hence the quotes.
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted CA certificates
      # (referenced as <namespace>/<name>).
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "alertmanager.viktorbarzin.me"
    hosts:
      - "alertmanager.viktorbarzin.me"
# Alertmanager configuration file. The ${...} placeholders are substituted by
# the template renderer before Helm reads these values (presumably Terraform
# templatefile() - confirm against the caller).
alertmanagerFiles:
  alertmanager.yml:
    global:
      smtp_from: "alertmanager@viktorbarzin.me"
      # smtp_smarthost: "smtp.viktorbarzin.me:587"
      smtp_smarthost: "mailserver.mailserver.svc.cluster.local:587"
      smtp_auth_username: "alertmanager@viktorbarzin.me"
      smtp_auth_password: "${alertmanager_mail_pass}"
      smtp_require_tls: true
      slack_api_url: "${alertmanager_slack_api_url}"
    templates:
      - "/etc/alertmanager/template/*.tmpl"
    # Single route: everything is grouped by alert name and sent to the one
    # receiver below, repeating hourly while an alert keeps firing.
    route:
      group_by: ["alertname"]
      group_wait: 3s
      group_interval: 5s
      repeat_interval: 1h
      receiver: ALL
    receivers:
      # Fans every alert out to both e-mail and Slack, including resolutions.
      - name: ALL
        email_configs:
          - to: "me@viktorbarzin.me"
            send_resolved: true
            # NOTE(review): certificate verification is disabled for SMTP,
            # presumably because the in-cluster service name does not match
            # the mail server's certificate - confirm.
            tls_config:
              insecure_skip_verify: true
        slack_configs:
          - send_resolved: true
            channel: "#general"
# Prometheus server: storage on an existing PVC, 12-week retention, Recreate
# rollout strategy, client-certificate-protected ingress, and a static
# Alertmanager target.
server:
  # Uncomment web.enable-admin-api to be able to delete metrics.
  # NOTE(review): this list replaces the chart's default extraFlags; if that
  # default carries web.enable-lifecycle (used for config reloads), confirm
  # that dropping it here is intended.
  extraFlags:
    # - "web.enable-admin-api"
    - "storage.tsdb.allow-overlapping-blocks"
  persistentVolume:
    # enabled: false
    existingClaim: prometheus-iscsi-pvc
    # storageClass: rook-cephfs
  retention: "12w"  # ~100GB storage
  strategy:
    type: Recreate
  ingress:
    # Real boolean instead of the string "true": in a Helm `if`, any non-empty
    # string (even "false") counts as true, so a string here is a trap.
    enabled: true
    annotations:
      # Annotation values must stay strings, hence the quotes.
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted CA certificates
      # (referenced as <namespace>/<name>).
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "prometheus.viktorbarzin.me"
    hosts:
      - "prometheus.viktorbarzin.me"
  # Where Prometheus sends fired alerts: the in-cluster Alertmanager service.
  alertmanagers:
    - static_configs:
        - targets:
            - "prometheus-alertmanager.monitoring.svc.cluster.local"
            # - "alertmanager.viktorbarzin.me"
      tls_config:
        insecure_skip_verify: true
# Prometheus alerting rules (alerting_rules.yml). Every rule is labelled
# severity: page. Several expressions append `or on() vector(0)` so that a
# missing series evaluates as 0 instead of producing no result at all.
serverFiles:
  # prometheus.yml:
  #   alertingaaa:
  #     alertmanagers:
  #       - static_configs:
  #         targets: "alertmanager.viktorbarzin.lan"
  alerting_rules.yml:
    groups:
      - name: NodeDown
        rules:
          - alert: NodeDown
            expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
            for: 1m
            labels:
              severity: page
            annotations:
              # Quoted like the other templated summaries in this file.
              summary: "Node {{$labels.instance}} down."
      - name: NodeHighCPUUsage
        rules:
          - alert: NodeHighCPUUsage
            # Uses the 1-minute load average as the CPU-pressure signal.
            expr: node_load1 > 2
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "High CPU usage on node. Node load: {{ $value }}"
      # - name: PodStuckNotReady
      #   rules:
      #     - alert: PodStuckNotReady
      #       expr: kube_pod_status_ready{condition="true"} == 0
      #       for: 5m
      #       labels:
      #         severity: page
      #       annotations:
      #         summary: Pod stuck not ready.
      - name: ReadyPodsInDeploymentLessThanSpec
        rules:
          - alert: ReadyPodsInDeploymentLessThanSpec
            # Available minus desired replicas, matched per namespace and
            # deployment; negative means fewer pods are available than spec'd.
            expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Number of ready pods in deployment is less than what is defined in spec.
      - name: PowerOutage
        rules:
          # No `for:` clause, so this fires on the first evaluation below 200.
          - alert: PowerOutage
            expr: r730_idrac_powerSupplyCurrentInputVoltage < 200
            labels:
              severity: page
            annotations:
              summary: Power voltage on a power supply is critically low indicating power outage.
      - name: HighPowerUsage
        rules:
          - alert: HighPowerUsage
            expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
            for: 60m
            labels:
              severity: page
            annotations:
              summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}"
      - name: NoNodeLoadData
        rules:
          - alert: NoNodeLoadData
            # `or` lowercased to match the other rules (PromQL keywords are
            # case-insensitive, so behaviour is unchanged).
            # NOTE(review): also matches a node whose 1-minute load is
            # exactly 0, not only a missing series.
            expr: (node_load1 or on() vector(0)) == 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: No node load data. Can signal that prometheus is not scraping
      - name: NoiDRACData
        rules:
          - alert: NoiDRACData
            expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0
            for: 10m
            labels:
              severity: page
            annotations:
              summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
      - name: OpenWRT High Memory Usage
        rules:
          - alert: OpenWRT High Memory Usage
            # Percentage of memory in use: 100 - available/total * 100.
            expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90
            for: 10m
            labels:
              severity: page
            annotations:
              summary: OpenWRT high memory usage. Can cause services getting stuck.
      - name: Mailserver Down
        rules:
          - alert: Mail server has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Mail server has no available replicas. This means mail may not be received.
      - name: Hackmd Down
        rules:
          - alert: Hackmd has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
            for: 1m
            labels:
              severity: page
            annotations:
              summary: Hackmd has no available replicas.
      - name: Privatebin Down
        rules:
          - alert: Privatebin has no replicas available
            expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Privatebin has no available replicas.
# Additional scrape jobs, passed as one literal string.
# Each job's relabel_configs copy the listed target into the `target` URL
# parameter and the `instance` label, then set the address actually scraped.
# metric_relabel_configs prefix every metric name; `$${1}` is the
# template-escaped form of `${1}` (the regex capture group) and must stay
# doubled so the template renderer does not try to interpolate it.
extraScrapeConfigs: |
  - job_name: 'snmp-idrac'
    static_configs:
      - targets:
          - "idrac.viktorbarzin.lan:161"
    metrics_path: '/snmp'
    params:
      module: [dell_idrac]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'prometheus-snmp-exporter.monitoring.svc.cluster.local:9116'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'redfish-idrac'
    scrape_interval: 5m
    scrape_timeout: 2m
    metrics_path: /redfish
    static_configs:
      - targets:
          - idrac.viktorbarzin.lan
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: idrac-redfish-exporter.monitoring.svc.cluster.local:9090
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'openwrt'
    static_configs:
      - targets:
          #- "home.viktorbarzin.lan:9100"
          - "10.0.20.100:9100"
    metrics_path: '/metrics'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        #replacement: 'home.viktorbarzin.lan:9100'
        replacement: '10.0.20.100:9100'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'openwrt_$${1}'