Disable email notifications as they are spammy and use up SendGrid quota [ci skip]

This commit is contained in:
Viktor Barzin 2023-06-20 14:04:02 +00:00
parent 17612c50a4
commit 3047ff1d9b
2 changed files with 149 additions and 149 deletions

View file

@ -151,11 +151,11 @@ resource "kubernetes_deployment" "finance_app" {
value = var.graphql_api_secret value = var.graphql_api_secret
} }
env { env {
name = "ENABLE_SCHEDULER" name = "ENABLE_SCHEDULER"
value = 1 value = 1
} }
env { env {
name = "DEBUG_METRICS" name = "DEBUG_METRICS"
value = 1 value = 1
} }
volume_mount { volume_mount {

View file

@ -42,11 +42,11 @@ alertmanagerFiles:
receiver: ALL receiver: ALL
receivers: receivers:
- name: ALL - name: ALL
email_configs: # email_configs:
- to: "me@viktorbarzin.me" # - to: "me@viktorbarzin.me"
send_resolved: true # send_resolved: true
tls_config: # tls_config:
insecure_skip_verify: true # insecure_skip_verify: true
slack_configs: slack_configs:
- send_resolved: true - send_resolved: true
channel: "#general" channel: "#general"
@ -54,7 +54,7 @@ alertmanagerFiles:
server: server:
# Enable me to delete metrics # Enable me to delete metrics
extraFlags: extraFlags:
# - "web.enable-admin-api" # - "web.enable-admin-api"
- "storage.tsdb.allow-overlapping-blocks" - "storage.tsdb.allow-overlapping-blocks"
# - "storage.tsdb.retention.size=1GB" # - "storage.tsdb.retention.size=1GB"
persistentVolume: persistentVolume:
@ -82,38 +82,38 @@ server:
alertmanagers: alertmanagers:
- static_configs: - static_configs:
- targets: - targets:
- "prometheus-alertmanager.monitoring.svc.cluster.local" - "prometheus-alertmanager.monitoring.svc.cluster.local"
# - "alertmanager.viktorbarzin.me" # - "alertmanager.viktorbarzin.me"
tls_config: tls_config:
insecure_skip_verify: true insecure_skip_verify: true
serverFiles: serverFiles:
# prometheus.yml: # prometheus.yml:
# storage: # storage:
# tsdb: # tsdb:
# # no_lockfile: true # # no_lockfile: true
# # max_blocks_in_cache: 100000 # # max_blocks_in_cache: 100000
# # max_lookback_duration: 0s # # max_lookback_duration: 0s
# # min_block_duration: 2h # # min_block_duration: 2h
# # retention: 15d # # retention: 15d
# # chunk_encoding: 1 # # chunk_encoding: 1
# # chunk_range: 1h # # chunk_range: 1h
# # max_chunks_to_persist: 4800 # # max_chunks_to_persist: 4800
# # chunks_to_persist: 4800 # # chunks_to_persist: 4800
# cache: # cache:
# entries: 5000 # entries: 5000
# head: # head:
# chunk_bytes: 1048576 # chunk_bytes: 1048576
# # wal: # # wal:
# # compressions: 1 # # compressions: 1
# # flush_after_seconds: 30 # # flush_after_seconds: 30
# # segment_size: 1073741824 # # segment_size: 1073741824
# series_file: # series_file:
# # no_sync: true # # no_sync: true
# # max_concurrent_writes: 256 # # max_concurrent_writes: 256
# # block_size: 262144 # # block_size: 262144
# cache: # cache:
# max_size: 1073741824 # max_size: 1073741824
# alertingaaa: # alertingaaa:
# alertmanagers: # alertmanagers:
@ -123,31 +123,31 @@ serverFiles:
groups: groups:
- name: NodeDown - name: NodeDown
rules: rules:
- alert: NodeDown - alert: NodeDown
expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0 expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
for: 1m for: 1m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Node {{$labels.instance}} down. summary: Node {{$labels.instance}} down.
- name: NodeHighCPUUsage - name: NodeHighCPUUsage
rules: rules:
- alert: NodeHighCPUUsage - alert: NodeHighCPUUsage
expr: node_load1 > 2 expr: node_load1 > 2
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: "High CPU usage on node. Node load: {{ $value }}" summary: "High CPU usage on node. Node load: {{ $value }}"
- name: NodeLowFreeMemory - name: NodeLowFreeMemory
rules: rules:
- alert: NodeLowFreeMemory - alert: NodeLowFreeMemory
expr: node_memory_MemAvailable_bytes < 500000000 expr: node_memory_MemAvailable_bytes < 500000000
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: "Low free memory on node. Node load: {{ $value }}" summary: "Low free memory on node. Node load: {{ $value }}"
# - name: PodStuckNotReady # - name: PodStuckNotReady
# rules: # rules:
# - alert: PodStuckNotReady # - alert: PodStuckNotReady
@ -159,120 +159,120 @@ serverFiles:
# summary: Pod stuck not ready. # summary: Pod stuck not ready.
- name: ReadyPodsInDeploymentLessThanSpec - name: ReadyPodsInDeploymentLessThanSpec
rules: rules:
- alert: ReadyPodsInDeploymentLessThanSpec - alert: ReadyPodsInDeploymentLessThanSpec
expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0 expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Number of ready pods in deployment is less than what is defined in spec. summary: Number of ready pods in deployment is less than what is defined in spec.
- name: PowerOutage - name: PowerOutage
rules: rules:
- alert: PowerOutage - alert: PowerOutage
expr: r730_idrac_powerSupplyCurrentInputVoltage < 200 expr: r730_idrac_powerSupplyCurrentInputVoltage < 200
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Power voltage on a power supply is critically low indicating power outage. summary: Power voltage on a power supply is critically low indicating power outage.
- name: HighPowerUsage - name: HighPowerUsage
rules: rules:
- alert: HighPowerUsage - alert: HighPowerUsage
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112 expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
for: 60m for: 60m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}" summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}"
- name: NoNodeLoadData - name: NoNodeLoadData
rules: rules:
- alert: NoNodeLoadData - alert: NoNodeLoadData
expr: (node_load1 OR on() vector(0)) == 0 expr: (node_load1 OR on() vector(0)) == 0
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: No node load data. Can signal that prometheus is not scraping summary: No node load data. Can signal that prometheus is not scraping
- name: NoiDRACData - name: NoiDRACData
rules: rules:
- alert: NoiDRACData - alert: NoiDRACData
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0 expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: No iDRAC amperage reading. Can signal that prometheus is not scraping summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
- name: OpenWRT High Memory Usage - name: OpenWRT High Memory Usage
rules: rules:
- alert: OpenWRT High Memory Usage - alert: OpenWRT High Memory Usage
expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90 expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: OpenWRT high memory usage. Can cause services getting stuck. summary: OpenWRT high memory usage. Can cause services getting stuck.
- name: Mailserver Down - name: Mailserver Down
rules: rules:
- alert: Mail server has no replicas available - alert: Mail server has no replicas available
expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1 expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Mail server has no available replicas. This means mail may not be received. summary: Mail server has no available replicas. This means mail may not be received.
- name: Hackmd Down - name: Hackmd Down
rules: rules:
- alert: Hackmd has no replicas available - alert: Hackmd has no replicas available
expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1 expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
for: 1m for: 1m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Hackmd has no available replicas. summary: Hackmd has no available replicas.
- name: Privatebin Down - name: Privatebin Down
rules: rules:
- alert: Privatebin has no replicas available - alert: Privatebin has no replicas available
expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1 expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Privatebin has no available replicas. summary: Privatebin has no available replicas.
- name: London OpenWRT Down - name: London OpenWRT Down
rules: rules:
- alert: OpenWRT client unreachable - alert: OpenWRT client unreachable
expr: (openwrt_node_openwrt_info or on() vector(0)) == 0 expr: (openwrt_node_openwrt_info or on() vector(0)) == 0
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: London OpenWRT router unreachable through VPN summary: London OpenWRT router unreachable through VPN
- name: London OpenWRT High System Load - name: London OpenWRT High System Load
rules: rules:
- alert: OpenWRT high system load - alert: OpenWRT high system load
expr: openwrt_node_load1 > 0.9 expr: openwrt_node_load1 > 0.9
for: 15m for: 15m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: High system load on OpenWRT summary: High system load on OpenWRT
- name: Finance app webhook exceptions - name: Finance app webhook exceptions
rules: rules:
- alert: Finance app webhook exceptions - alert: Finance app webhook exceptions
expr: changes(webhook_failure_total[5m]) >= 1 expr: changes(webhook_failure_total[5m]) >= 1
for: 1m for: 1m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Finance app webhook exceptions summary: Finance app webhook exceptions
- name: Finance app unhandled exceptions - name: Finance app unhandled exceptions
rules: rules:
- alert: Finance app unhandled exceptions - alert: Finance app unhandled exceptions
expr: changes(flask_http_request_exceptions_total[5m]) >= 1 expr: changes(flask_http_request_exceptions_total[5m]) >= 1
for: 1m for: 1m
labels: labels:
severity: page severity: page
annotations: annotations:
summary: Finance app unhandled exceptions summary: Finance app unhandled exceptions
extraScrapeConfigs: | extraScrapeConfigs: |
- job_name: 'snmp-idrac' - job_name: 'snmp-idrac'