disable email notifications as they are spammy and using sendgrid quota [ci skip]
This commit is contained in:
parent
17612c50a4
commit
3047ff1d9b
2 changed files with 149 additions and 149 deletions
|
|
@ -151,11 +151,11 @@ resource "kubernetes_deployment" "finance_app" {
|
||||||
value = var.graphql_api_secret
|
value = var.graphql_api_secret
|
||||||
}
|
}
|
||||||
env {
|
env {
|
||||||
name = "ENABLE_SCHEDULER"
|
name = "ENABLE_SCHEDULER"
|
||||||
value = 1
|
value = 1
|
||||||
}
|
}
|
||||||
env {
|
env {
|
||||||
name = "DEBUG_METRICS"
|
name = "DEBUG_METRICS"
|
||||||
value = 1
|
value = 1
|
||||||
}
|
}
|
||||||
volume_mount {
|
volume_mount {
|
||||||
|
|
|
||||||
|
|
@ -42,11 +42,11 @@ alertmanagerFiles:
|
||||||
receiver: ALL
|
receiver: ALL
|
||||||
receivers:
|
receivers:
|
||||||
- name: ALL
|
- name: ALL
|
||||||
email_configs:
|
# email_configs:
|
||||||
- to: "me@viktorbarzin.me"
|
# - to: "me@viktorbarzin.me"
|
||||||
send_resolved: true
|
# send_resolved: true
|
||||||
tls_config:
|
# tls_config:
|
||||||
insecure_skip_verify: true
|
# insecure_skip_verify: true
|
||||||
slack_configs:
|
slack_configs:
|
||||||
- send_resolved: true
|
- send_resolved: true
|
||||||
channel: "#general"
|
channel: "#general"
|
||||||
|
|
@ -54,7 +54,7 @@ alertmanagerFiles:
|
||||||
server:
|
server:
|
||||||
# Enable me to delete metrics
|
# Enable me to delete metrics
|
||||||
extraFlags:
|
extraFlags:
|
||||||
# - "web.enable-admin-api"
|
# - "web.enable-admin-api"
|
||||||
- "storage.tsdb.allow-overlapping-blocks"
|
- "storage.tsdb.allow-overlapping-blocks"
|
||||||
# - "storage.tsdb.retention.size=1GB"
|
# - "storage.tsdb.retention.size=1GB"
|
||||||
persistentVolume:
|
persistentVolume:
|
||||||
|
|
@ -82,38 +82,38 @@ server:
|
||||||
alertmanagers:
|
alertmanagers:
|
||||||
- static_configs:
|
- static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
- "prometheus-alertmanager.monitoring.svc.cluster.local"
|
- "prometheus-alertmanager.monitoring.svc.cluster.local"
|
||||||
# - "alertmanager.viktorbarzin.me"
|
# - "alertmanager.viktorbarzin.me"
|
||||||
tls_config:
|
tls_config:
|
||||||
insecure_skip_verify: true
|
insecure_skip_verify: true
|
||||||
|
|
||||||
serverFiles:
|
serverFiles:
|
||||||
# prometheus.yml:
|
# prometheus.yml:
|
||||||
# storage:
|
# storage:
|
||||||
# tsdb:
|
# tsdb:
|
||||||
# # no_lockfile: true
|
# # no_lockfile: true
|
||||||
# # max_blocks_in_cache: 100000
|
# # max_blocks_in_cache: 100000
|
||||||
# # max_lookback_duration: 0s
|
# # max_lookback_duration: 0s
|
||||||
# # min_block_duration: 2h
|
# # min_block_duration: 2h
|
||||||
# # retention: 15d
|
# # retention: 15d
|
||||||
# # chunk_encoding: 1
|
# # chunk_encoding: 1
|
||||||
# # chunk_range: 1h
|
# # chunk_range: 1h
|
||||||
# # max_chunks_to_persist: 4800
|
# # max_chunks_to_persist: 4800
|
||||||
# # chunks_to_persist: 4800
|
# # chunks_to_persist: 4800
|
||||||
# cache:
|
# cache:
|
||||||
# entries: 5000
|
# entries: 5000
|
||||||
# head:
|
# head:
|
||||||
# chunk_bytes: 1048576
|
# chunk_bytes: 1048576
|
||||||
# # wal:
|
# # wal:
|
||||||
# # compressions: 1
|
# # compressions: 1
|
||||||
# # flush_after_seconds: 30
|
# # flush_after_seconds: 30
|
||||||
# # segment_size: 1073741824
|
# # segment_size: 1073741824
|
||||||
# series_file:
|
# series_file:
|
||||||
# # no_sync: true
|
# # no_sync: true
|
||||||
# # max_concurrent_writes: 256
|
# # max_concurrent_writes: 256
|
||||||
# # block_size: 262144
|
# # block_size: 262144
|
||||||
# cache:
|
# cache:
|
||||||
# max_size: 1073741824
|
# max_size: 1073741824
|
||||||
|
|
||||||
# alertingaaa:
|
# alertingaaa:
|
||||||
# alertmanagers:
|
# alertmanagers:
|
||||||
|
|
@ -123,31 +123,31 @@ serverFiles:
|
||||||
groups:
|
groups:
|
||||||
- name: NodeDown
|
- name: NodeDown
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeDown
|
- alert: NodeDown
|
||||||
expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
|
expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Node {{$labels.instance}} down.
|
summary: Node {{$labels.instance}} down.
|
||||||
- name: NodeHighCPUUsage
|
- name: NodeHighCPUUsage
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeHighCPUUsage
|
- alert: NodeHighCPUUsage
|
||||||
expr: node_load1 > 2
|
expr: node_load1 > 2
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: "High CPU usage on node. Node load: {{ $value }}"
|
summary: "High CPU usage on node. Node load: {{ $value }}"
|
||||||
- name: NodeLowFreeMemory
|
- name: NodeLowFreeMemory
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeLowFreeMemory
|
- alert: NodeLowFreeMemory
|
||||||
expr: node_memory_MemAvailable_bytes < 500000000
|
expr: node_memory_MemAvailable_bytes < 500000000
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Low free memory on node. Node load: {{ $value }}"
|
summary: "Low free memory on node. Node load: {{ $value }}"
|
||||||
# - name: PodStuckNotReady
|
# - name: PodStuckNotReady
|
||||||
# rules:
|
# rules:
|
||||||
# - alert: PodStuckNotReady
|
# - alert: PodStuckNotReady
|
||||||
|
|
@ -159,120 +159,120 @@ serverFiles:
|
||||||
# summary: Pod stuck not ready.
|
# summary: Pod stuck not ready.
|
||||||
- name: ReadyPodsInDeploymentLessThanSpec
|
- name: ReadyPodsInDeploymentLessThanSpec
|
||||||
rules:
|
rules:
|
||||||
- alert: ReadyPodsInDeploymentLessThanSpec
|
- alert: ReadyPodsInDeploymentLessThanSpec
|
||||||
expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
|
expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Number of ready pods in deployment is less than what is defined in spec.
|
summary: Number of ready pods in deployment is less than what is defined in spec.
|
||||||
- name: PowerOutage
|
- name: PowerOutage
|
||||||
rules:
|
rules:
|
||||||
- alert: PowerOutage
|
- alert: PowerOutage
|
||||||
expr: r730_idrac_powerSupplyCurrentInputVoltage < 200
|
expr: r730_idrac_powerSupplyCurrentInputVoltage < 200
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Power voltage on a power supply is critically low indicating power outage.
|
summary: Power voltage on a power supply is critically low indicating power outage.
|
||||||
- name: HighPowerUsage
|
- name: HighPowerUsage
|
||||||
rules:
|
rules:
|
||||||
- alert: HighPowerUsage
|
- alert: HighPowerUsage
|
||||||
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
|
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
|
||||||
for: 60m
|
for: 60m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}"
|
summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}"
|
||||||
- name: NoNodeLoadData
|
- name: NoNodeLoadData
|
||||||
rules:
|
rules:
|
||||||
- alert: NoNodeLoadData
|
- alert: NoNodeLoadData
|
||||||
expr: (node_load1 OR on() vector(0)) == 0
|
expr: (node_load1 OR on() vector(0)) == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: No node load data. Can signal that prometheus is not scraping
|
summary: No node load data. Can signal that prometheus is not scraping
|
||||||
- name: NoiDRACData
|
- name: NoiDRACData
|
||||||
rules:
|
rules:
|
||||||
- alert: NoiDRACData
|
- alert: NoiDRACData
|
||||||
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0
|
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
|
summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
|
||||||
- name: OpenWRT High Memory Usage
|
- name: OpenWRT High Memory Usage
|
||||||
rules:
|
rules:
|
||||||
- alert: OpenWRT High Memory Usage
|
- alert: OpenWRT High Memory Usage
|
||||||
expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90
|
expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: OpenWRT high memory usage. Can cause services getting stuck.
|
summary: OpenWRT high memory usage. Can cause services getting stuck.
|
||||||
- name: Mailserver Down
|
- name: Mailserver Down
|
||||||
rules:
|
rules:
|
||||||
- alert: Mail server has no replicas available
|
- alert: Mail server has no replicas available
|
||||||
expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Mail server has no available replicas. This means mail may not be received.
|
summary: Mail server has no available replicas. This means mail may not be received.
|
||||||
- name: Hackmd Down
|
- name: Hackmd Down
|
||||||
rules:
|
rules:
|
||||||
- alert: Hackmd has no replicas available
|
- alert: Hackmd has no replicas available
|
||||||
expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Hackmd has no available replicas.
|
summary: Hackmd has no available replicas.
|
||||||
- name: Privatebin Down
|
- name: Privatebin Down
|
||||||
rules:
|
rules:
|
||||||
- alert: Privatebin has no replicas available
|
- alert: Privatebin has no replicas available
|
||||||
expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Privatebin has no available replicas.
|
summary: Privatebin has no available replicas.
|
||||||
- name: London OpenWRT Down
|
- name: London OpenWRT Down
|
||||||
rules:
|
rules:
|
||||||
- alert: OpenWRT client unreachable
|
- alert: OpenWRT client unreachable
|
||||||
expr: (openwrt_node_openwrt_info or on() vector(0)) == 0
|
expr: (openwrt_node_openwrt_info or on() vector(0)) == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: London OpenWRT router unreachable through VPN
|
summary: London OpenWRT router unreachable through VPN
|
||||||
- name: London OpenWRT High System Load
|
- name: London OpenWRT High System Load
|
||||||
rules:
|
rules:
|
||||||
- alert: OpenWRT high system load
|
- alert: OpenWRT high system load
|
||||||
expr: openwrt_node_load1 > 0.9
|
expr: openwrt_node_load1 > 0.9
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: High system load on OpenWRT
|
summary: High system load on OpenWRT
|
||||||
- name: Finance app webhook exceptions
|
- name: Finance app webhook exceptions
|
||||||
rules:
|
rules:
|
||||||
- alert: Finance app webhook exceptions
|
- alert: Finance app webhook exceptions
|
||||||
expr: changes(webhook_failure_total[5m]) >= 1
|
expr: changes(webhook_failure_total[5m]) >= 1
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Finance app webhook exceptions
|
summary: Finance app webhook exceptions
|
||||||
- name: Finance app unhandled exceptions
|
- name: Finance app unhandled exceptions
|
||||||
rules:
|
rules:
|
||||||
- alert: Finance app unhandled exceptions
|
- alert: Finance app unhandled exceptions
|
||||||
expr: changes(flask_http_request_exceptions_total[5m]) >= 1
|
expr: changes(flask_http_request_exceptions_total[5m]) >= 1
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: Finance app unhandled exceptions
|
summary: Finance app unhandled exceptions
|
||||||
|
|
||||||
extraScrapeConfigs: |
|
extraScrapeConfigs: |
|
||||||
- job_name: 'snmp-idrac'
|
- job_name: 'snmp-idrac'
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue