add slack to notifications and update alert definitions after upgrade [ci skip]

This commit is contained in:
viktorbarzin 2022-01-06 20:09:20 +00:00
parent 6870cee492
commit 0b20fc1e73
No known key found for this signature in database
GPG key ID: 0EB088298288D958
6 changed files with 18 additions and 8 deletions

View file

@ -44,6 +44,7 @@ variable "webhook_handler_git_token" {}
variable "webhook_handler_ssh_key" {}
variable "monitoring_idrac_username" {}
variable "monitoring_idrac_password" {}
variable "alertmanager_slack_api_url" {}
variable "ansible_prefix" {
default = "ANSIBLE_VAULT_PASSWORD_FILE=~/.ansible/vault_pass.txt ansible-playbook -i playbook/hosts.yaml playbook/linux.yml -t linux/initial_setup"
@ -211,6 +212,7 @@ module "kubernetes_cluster" {
bind_named_conf_options = var.bind_named_conf_options
alertmanager_account_password = var.alertmanager_account_password
alertmanager_slack_api_url = var.alertmanager_slack_api_url
# Drone
drone_github_client_id = var.drone_github_client_id

View file

@ -32,6 +32,7 @@ variable "webhook_handler_git_token" {}
variable "webhook_handler_ssh_key" {}
variable "idrac_username" {}
variable "idrac_password" {}
variable "alertmanager_slack_api_url" {}
resource "null_resource" "core_services" {
# List all the core modules that must be provisioned first
@ -143,6 +144,7 @@ module "monitoring" {
alertmanager_account_password = var.alertmanager_account_password
idrac_username = var.idrac_username
idrac_password = var.idrac_password
alertmanager_slack_api_url = var.alertmanager_slack_api_url
depends_on = [null_resource.core_services]
}

View file

@ -9,6 +9,7 @@ variable "idrac_username" {
variable "idrac_password" {
default = "calvin"
}
variable "alertmanager_slack_api_url" {}
module "tls_secret" {
source = "../setup_tls_secret"
@ -23,8 +24,9 @@ resource "helm_release" "prometheus" {
repository = "https://prometheus-community.github.io/helm-charts"
chart = "prometheus"
version = "15.0.2"
values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password })]
values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url })]
}
# Terraform gets angry with the 30k values file :/ use Ansible until solved

View file

@ -31,6 +31,7 @@ alertmanagerFiles:
smtp_auth_username: "alertmanager@viktorbarzin.me"
smtp_auth_password: "${alertmanager_mail_pass}"
smtp_require_tls: true
slack_api_url: "${alertmanager_slack_api_url}"
templates:
- "/etc/alertmanager/template/*.tmpl"
route:
@ -38,14 +39,17 @@ alertmanagerFiles:
group_wait: 3s
group_interval: 5s
repeat_interval: 1h
receiver: SMTP_STARTTLS
receiver: ALL
receivers:
- name: 'SMTP_STARTTLS'
- name: ALL
email_configs:
- to: "me@viktorbarzin.me"
send_resolved: true
tls_config:
insecure_skip_verify: true
slack_configs:
- send_resolved: true
channel: "#general"
server:
# Enable me to delete metrics
@ -93,7 +97,7 @@ serverFiles:
- name: NodeDown
rules:
- alert: NodeDown
expr: up{job="kubernetes-nodes"} == 0
expr: (up{job="kubernetes-nodes"} or on() vector(0)) == 0
for: 1m
labels:
severity: page
@ -120,7 +124,7 @@ serverFiles:
- name: ReadyPodsInDeploymentLessThanSpec
rules:
- alert: ReadyPodsInDeploymentLessThanSpec
expr: kube_deployment_status_replicas_available - on(namespace, deployment) kube_deployment_spec_replicas < 0
expr: kube_deployment_status_replicas_available - on(exported_namespace, deployment) kube_deployment_spec_replicas < 0
for: 10m
labels:
severity: page
@ -174,7 +178,7 @@ serverFiles:
- name: Mailserver Down
rules:
- alert: Mail server has no replicas available
expr: (kube_deployment_status_replicas_available{namespace="mailserver"} or on() vector(0)) < 1
expr: (kube_deployment_status_replicas_available{exported_namespace="mailserver"} or on() vector(0)) < 1
for: 10m
labels:
severity: page
@ -183,7 +187,7 @@ serverFiles:
- name: Hackmd Down
rules:
- alert: Hackmd has no replicas available
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
expr: (kube_deployment_status_replicas_available{exported_namespace="hackmd"} or on() vector(0)) < 1
for: 1m
labels:
severity: page
@ -192,7 +196,7 @@ serverFiles:
- name: Privatebin Down
rules:
- alert: Privatebin has no replicas available
expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
expr: (kube_deployment_status_replicas_available{exported_namespace="privatebin"} or on() vector(0)) < 1
for: 10m
labels:
severity: page

Binary file not shown.

Binary file not shown.