diff --git a/modules/kubernetes/finance_app/main.tf b/modules/kubernetes/finance_app/main.tf index c7f1f9ef..5f726440 100644 --- a/modules/kubernetes/finance_app/main.tf +++ b/modules/kubernetes/finance_app/main.tf @@ -151,11 +151,11 @@ resource "kubernetes_deployment" "finance_app" { value = var.graphql_api_secret } env { - name = "ENABLE_SCHEDULER" + name = "ENABLE_SCHEDULER" value = 1 } env { - name = "DEBUG_METRICS" + name = "DEBUG_METRICS" value = 1 } volume_mount { diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl index 68a591e9..aacc859b 100644 --- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl +++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl @@ -25,7 +25,7 @@ alertmanager: alertmanagerFiles: alertmanager.yml: global: - smtp_from: "alertmanager@viktorbarzin.me" + smtp_from: "alertmanager@viktorbarzin.me" # smtp_smarthost: "smtp.viktorbarzin.me:587" smtp_smarthost: "mailserver.mailserver.svc.cluster.local:587" smtp_auth_username: "alertmanager@viktorbarzin.me" @@ -42,11 +42,11 @@ alertmanagerFiles: receiver: ALL receivers: - name: ALL - email_configs: - - to: "me@viktorbarzin.me" - send_resolved: true - tls_config: - insecure_skip_verify: true + # email_configs: + # - to: "me@viktorbarzin.me" + # send_resolved: true + # tls_config: + # insecure_skip_verify: true slack_configs: - send_resolved: true channel: "#general" @@ -54,7 +54,7 @@ alertmanagerFiles: server: # Enable me to delete metrics extraFlags: - # - "web.enable-admin-api" + # - "web.enable-admin-api" - "storage.tsdb.allow-overlapping-blocks" # - "storage.tsdb.retention.size=1GB" persistentVolume: @@ -81,39 +81,39 @@ server: - "prometheus.viktorbarzin.me" alertmanagers: - static_configs: - - targets: - - "prometheus-alertmanager.monitoring.svc.cluster.local" + - targets: + - "prometheus-alertmanager.monitoring.svc.cluster.local" # - "alertmanager.viktorbarzin.me" tls_config: insecure_skip_verify: true serverFiles: - # prometheus.yml: - # storage: - # tsdb: - # # no_lockfile: true - # # max_blocks_in_cache: 100000 - # # max_lookback_duration: 0s - # # min_block_duration: 2h - # # retention: 15d - # # chunk_encoding: 1 - # # chunk_range: 1h - # # max_chunks_to_persist: 4800 - # # chunks_to_persist: 4800 - # cache: - # entries: 5000 - # head: - # chunk_bytes: 1048576 - # # wal: - # # compressions: 1 - # # flush_after_seconds: 30 - # # segment_size: 1073741824 - # series_file: - # # no_sync: true - # # max_concurrent_writes: 256 - # # block_size: 262144 - # cache: - # max_size: 1073741824 + # prometheus.yml: + # storage: + # tsdb: + # # no_lockfile: true + # # max_blocks_in_cache: 100000 + # # max_lookback_duration: 0s + # # min_block_duration: 2h + # # retention: 15d + # # chunk_encoding: 1 + # # chunk_range: 1h + # # max_chunks_to_persist: 4800 + # # chunks_to_persist: 4800 + # cache: + # entries: 5000 + # head: + # chunk_bytes: 1048576 + # # wal: + # # compressions: 1 + # # flush_after_seconds: 30 + # # segment_size: 1073741824 + # series_file: + # # no_sync: true + # # max_concurrent_writes: 256 + # # block_size: 262144 + # cache: + # max_size: 1073741824 # alertingaaa: # alertmanagers: @@ -123,31 +123,31 @@ serverFiles: groups: - name: NodeDown rules: - - alert: NodeDown - expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0 - for: 1m - labels: - severity: page - annotations: - summary: Node {{$labels.instance}} down. + - alert: NodeDown + expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0 + for: 1m + labels: + severity: page + annotations: + summary: Node {{$labels.instance}} down. - name: NodeHighCPUUsage rules: - - alert: NodeHighCPUUsage - expr: node_load1 > 2 - for: 10m - labels: - severity: page - annotations: - summary: "High CPU usage on node. Node load: {{ $value }}" + - alert: NodeHighCPUUsage + expr: node_load1 > 2 + for: 10m + labels: + severity: page + annotations: + summary: "High CPU usage on node. Node load: {{ $value }}" - name: NodeLowFreeMemory rules: - - alert: NodeLowFreeMemory - expr: node_memory_MemAvailable_bytes < 500000000 - for: 10m - labels: - severity: page - annotations: - summary: "Low free memory on node. Node load: {{ $value }}" + - alert: NodeLowFreeMemory + expr: node_memory_MemAvailable_bytes < 500000000 + for: 10m + labels: + severity: page + annotations: + summary: "Low free memory on node. Node load: {{ $value }}" # - name: PodStuckNotReady # rules: # - alert: PodStuckNotReady @@ -159,121 +159,121 @@ serverFiles: # summary: Pod stuck not ready. - name: ReadyPodsInDeploymentLessThanSpec rules: - - alert: ReadyPodsInDeploymentLessThanSpec - expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0 - for: 10m - labels: - severity: page - annotations: - summary: Number of ready pods in deployment is less than what is defined in spec. + - alert: ReadyPodsInDeploymentLessThanSpec + expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0 + for: 10m + labels: + severity: page + annotations: + summary: Number of ready pods in deployment is less than what is defined in spec. - name: PowerOutage rules: - - alert: PowerOutage - expr: r730_idrac_powerSupplyCurrentInputVoltage < 200 - labels: - severity: page - annotations: - summary: Power voltage on a power supply is critically low indicating power outage. + - alert: PowerOutage + expr: r730_idrac_powerSupplyCurrentInputVoltage < 200 + labels: + severity: page + annotations: + summary: Power voltage on a power supply is critically low indicating power outage. - name: HighPowerUsage rules: - - alert: HighPowerUsage - expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112 - for: 60m - labels: - severity: page - annotations: - summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}" + - alert: HighPowerUsage + expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112 + for: 60m + labels: + severity: page + annotations: + summary: "High Power usage. Baseline is 112W. Current reading: {{$value}}" - name: NoNodeLoadData rules: - - alert: NoNodeLoadData - expr: (node_load1 OR on() vector(0)) == 0 - for: 10m - labels: - severity: page - annotations: - summary: No node load data. Can signal that prometheus is not scraping + - alert: NoNodeLoadData + expr: (node_load1 OR on() vector(0)) == 0 + for: 10m + labels: + severity: page + annotations: + summary: No node load data. Can signal that prometheus is not scraping - name: NoiDRACData rules: - - alert: NoiDRACData - expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0 - for: 10m - labels: - severity: page - annotations: - summary: No iDRAC amperage reading. Can signal that prometheus is not scraping + - alert: NoiDRACData + expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) == 0 + for: 10m + labels: + severity: page + annotations: + summary: No iDRAC amperage reading. Can signal that prometheus is not scraping - name: OpenWRT High Memory Usage rules: - - alert: OpenWRT High Memory Usage - expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90 - for: 10m - labels: - severity: page - annotations: - summary: OpenWRT high memory usage. Can cause services getting stuck. + - alert: OpenWRT High Memory Usage + expr: 100 - ((openwrt_node_memory_MemAvailable_bytes * 100) / openwrt_node_memory_MemTotal_bytes) > 90 + for: 10m + labels: + severity: page + annotations: + summary: OpenWRT high memory usage. Can cause services getting stuck. - name: Mailserver Down rules: - - alert: Mail server has no replicas available - expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1 - for: 10m - labels: - severity: page - annotations: - summary: Mail server has no available replicas. This means mail may not be received. + - alert: Mail server has no replicas available + expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1 + for: 10m + labels: + severity: page + annotations: + summary: Mail server has no available replicas. This means mail may not be received. - name: Hackmd Down rules: - - alert: Hackmd has no replicas available - expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1 - for: 1m - labels: - severity: page - annotations: - summary: Hackmd has no available replicas. + - alert: Hackmd has no replicas available + expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1 + for: 1m + labels: + severity: page + annotations: + summary: Hackmd has no available replicas. - name: Privatebin Down rules: - - alert: Privatebin has no replicas available - expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1 - for: 10m - labels: - severity: page - annotations: - summary: Privatebin has no available replicas. + - alert: Privatebin has no replicas available + expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1 + for: 10m + labels: + severity: page + annotations: + summary: Privatebin has no available replicas. - name: London OpenWRT Down rules: - - alert: OpenWRT client unreachable - expr: (openwrt_node_openwrt_info or on() vector(0)) == 0 - for: 10m - labels: - severity: page - annotations: - summary: London OpenWRT router unreachable through VPN + - alert: OpenWRT client unreachable + expr: (openwrt_node_openwrt_info or on() vector(0)) == 0 + for: 10m + labels: + severity: page + annotations: + summary: London OpenWRT router unreachable through VPN - name: London OpenWRT High System Load rules: - - alert: OpenWRT high system load - expr: openwrt_node_load1 > 0.9 - for: 15m - labels: - severity: page - annotations: - summary: High system load on OpenWRT + - alert: OpenWRT high system load + expr: openwrt_node_load1 > 0.9 + for: 15m + labels: + severity: page + annotations: + summary: High system load on OpenWRT - name: Finance app webhook exceptions rules: - - alert: Finance app webhook exceptions - expr: changes(webhook_failure_total[5m]) >= 1 - for: 1m - labels: - severity: page - annotations: - summary: Finance app webhook exceptions + - alert: Finance app webhook exceptions + expr: changes(webhook_failure_total[5m]) >= 1 + for: 1m + labels: + severity: page + annotations: + summary: Finance app webhook exceptions - name: Finance app unhandled exceptions rules: - - alert: Finance app unhandled exceptions - expr: changes(flask_http_request_exceptions_total[5m]) >= 1 - for: 1m - labels: - severity: page - annotations: - summary: Finance app unhandled exceptions - + - alert: Finance app unhandled exceptions + expr: changes(flask_http_request_exceptions_total[5m]) >= 1 + for: 1m + labels: + severity: page + annotations: + summary: Finance app unhandled exceptions + extraScrapeConfigs: | - job_name: 'snmp-idrac' static_configs: diff --git a/terraform.tfstate b/terraform.tfstate index 0df33301..7ae6491f 100644 Binary files a/terraform.tfstate and b/terraform.tfstate differ