This commit is contained in:
viktorbarzin 2021-02-07 23:45:55 +00:00
commit 7a7bc34ae3
32 changed files with 4857 additions and 0 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,148 @@
# Inputs of the monitoring module. None of them has a default, so the caller
# must supply every value.

variable "tls_secret_name" {
  description = "Forwarded as tls_secret_name to the tls_secret module."
}

variable "tls_crt" {
  description = "Forwarded as tls_crt to the tls_secret module."
}

variable "tls_key" {
  description = "Forwarded as tls_key to the tls_secret module."
}

variable "alertmanager_account_password" {
  description = "Rendered into prometheus_chart_values.tpl as alertmanager_mail_pass."
}
# Instantiates the sibling setup_tls_secret module for the "monitoring"
# namespace, passing this module's TLS inputs through unchanged.
# NOTE(review): presumably this creates the TLS secret referenced by the
# ingresses in the chart values - confirm against ../setup_tls_secret.
module "tls_secret" {
  source = "../setup_tls_secret"

  namespace       = "monitoring"
  tls_secret_name = var.tls_secret_name
  tls_crt         = var.tls_crt
  tls_key         = var.tls_key
}
# Installs the "prometheus" chart from the prometheus-community repository as
# release "prometheus" in the "monitoring" namespace.
resource "helm_release" "prometheus" {
  name       = "prometheus"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus"

  namespace        = "monitoring"
  create_namespace = true

  # The chart values live in a template next to this file; the only template
  # variable it receives is the Alertmanager mail password.
  values = [
    templatefile(
      "${path.module}/prometheus_chart_values.tpl",
      { alertmanager_mail_pass = var.alertmanager_account_password }
    ),
  ]
}
# Terraform gets angry with the 30k values file :/ - use Ansible until solved
# resource "helm_release" "prometheus_snmp_exporter" {
# namespace = "monitoring"
# create_namespace = true
# name = "prometheus_exporter"
# repository = "https://prometheus-community.github.io/helm-charts"
# chart = "prometheus-snmp-exporter"
# values = [file("${path.module}/prometheus_snmp_chart_values.yaml")]
# }
# Secret in the "monitoring" namespace holding a single key, "datasource.yaml",
# whose value is a Grafana datasource provisioning file. The file declares one
# datasource named "Prometheus" (type prometheus, proxy access) pointing at
# http://prometheus-server and marks it as not editable from the UI.
resource "kubernetes_secret" "prometheus_grafana_datasource" {
metadata {
name = "prometheus-grafana-datasource"
namespace = "monitoring"
labels = {
# NOTE(review): presumably the marker label Grafana's datasource sidecar
# watches for - confirm against grafana_chart_values.yaml.
grafana_datasource = "1"
}
}
data = {
# Everything between the heredoc markers is stored as the value of this key,
# including the "#" lines, which are comments only to whoever parses the YAML.
# Several keys (password, user, database, basicAuth*, withCredentials,
# isDefault) are intentionally left without a value.
"datasource.yaml" = <<EOT
# config file version
apiVersion: 1
# list of datasources that should be deleted from the database
#deleteDatasources:
# - name: Prometheus
# orgId: 1
# list of datasources to insert/update depending
# whats available in the database
datasources:
# <string, required> name of the datasource. Required
- name: Prometheus
# <string, required> datasource type. Required
type: prometheus
# <string, required> access mode. proxy or direct (Server or Browser in the UI). Required
access: proxy
# <int> org id. will default to orgId 1 if not specified
orgId: 1
# <string> url
url: http://prometheus-server
# <string> database password, if used
password:
# <string> database user, if used
user:
# <string> database name, if used
database:
# <bool> enable/disable basic auth
basicAuth:
# <string> basic auth username
basicAuthUser:
# <string> basic auth password
basicAuthPassword:
# <bool> enable/disable with credentials headers
withCredentials:
# <bool> mark as default datasource. Max one per org
isDefault:
# <map> fields that will be converted to json and stored in json_data
#jsonData:
# graphiteVersion: \"1.1\"
# tlsAuth: true
# tlsAuthWithCACert: true
# <string> json object of data that will be encrypted.
#secureJsonData:
# tlsCACert: \"...\"
# tlsClientCert: \"...\"
# tlsClientKey: \"...\"
version: 1
# <bool> allow users to edit datasources from the UI.
editable: false
EOT
}
# Generic secret type; the content is a config file, not a TLS or registry credential.
type = "Opaque"
}
# Cluster-wide 2Gi ReadWriteOnce persistent volume named "grafana-iscsi-pv",
# backed by LUN 0 of an iSCSI target and formatted as ext4.
resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
  metadata {
    name = "grafana-iscsi-pv"
  }

  spec {
    access_modes = ["ReadWriteOnce"]

    capacity = {
      storage = "2Gi"
    }

    persistent_volume_source {
      iscsi {
        # Portal (host:port) and IQN identifying the Grafana LUN.
        target_portal = "iscsi.viktorbarzin.lan:3260"
        iqn           = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
        lun           = 0
        fs_type       = "ext4"
      }
    }
  }
}
# Claim "grafana-iscsi-pvc" in the "monitoring" namespace requesting 2Gi of
# ReadWriteOnce storage.
# NOTE(review): the claim names neither a volume nor a storage class, so which
# volume it binds to is decided by the cluster - presumably grafana-iscsi-pv;
# confirm.
resource "kubernetes_persistent_volume_claim" "prometheus_grafana_pvc" {
  metadata {
    namespace = "monitoring"
    name      = "grafana-iscsi-pvc"
  }

  spec {
    access_modes = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "2Gi"
      }
    }
  }
}
# Installs the "grafana" chart from the Grafana chart repository as release
# "grafana" in the "monitoring" namespace.
resource "helm_release" "grafana" {
  name       = "grafana"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"

  namespace        = "monitoring"
  create_namespace = true

  # Static values file: read verbatim, no template variables are substituted.
  values = [
    file("${path.module}/grafana_chart_values.yaml"),
  ]
}

View file

@ -0,0 +1,176 @@
# Helm values
# all values - https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
# Alertmanager component: storage, rollout strategy and ingress.
alertmanager:
  persistentVolume:
    # enabled: false
    # Use a pre-existing claim rather than having the chart create one.
    existingClaim: alertmanager-iscsi-pvc
    # storageClass: rook-cephfs
  # NOTE(review): presumably Recreate so the old pod releases the volume
  # before the replacement starts - confirm.
  strategy:
    type: Recreate
  ingress:
    # A real boolean. The previous quoted "true" only worked because any
    # non-empty string is truthy in the chart templates; "false" would have
    # enabled the ingress as well.
    enabled: true
    annotations:
      # Annotation values must be strings, so these stay quoted.
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Secret (namespace/name) containing the trusted CA certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "alertmanager.viktorbarzin.me"
    hosts:
      - "alertmanager.viktorbarzin.me"
# Contents of alertmanager.yml: every alert is grouped by alert name and
# e-mailed through the in-cluster mail server.
alertmanagerFiles:
  alertmanager.yml:
    global:
      smtp_from: "alertmanager@viktorbarzin.me"
      # smtp_smarthost: "smtp.viktorbarzin.me:587"
      smtp_smarthost: "mailserver.mailserver.svc.cluster.local:587"
      smtp_auth_username: "alertmanager@viktorbarzin.me"
      # Rendered by Terraform's templatefile(). jsonencode() emits a quoted and
      # escaped string (also a valid YAML double-quoted scalar), so a password
      # containing a double quote or a backslash cannot break this file.
      smtp_auth_password: ${jsonencode(alertmanager_mail_pass)}
      smtp_require_tls: true
    templates:
      - "/etc/alertmanager/template/*.tmpl"
    route:
      group_by: ["alertname"]
      group_wait: 3s
      group_interval: 5s
      repeat_interval: 1h
      receiver: SMTP_STARTTLS
    receivers:
      - name: 'SMTP_STARTTLS'
        email_configs:
          - to: "me@viktorbarzin.me"
            send_resolved: true
            # TLS is required (see smtp_require_tls) but the server
            # certificate is not verified.
            tls_config:
              insecure_skip_verify: true
# Prometheus server component: storage, retention, ingress and the
# Alertmanager instance it sends alerts to.
server:
  # Enable me to delete metrics
  # extraFlags:
  #   - "web.enable-admin-api"
  persistentVolume:
    # enabled: false
    # Use a pre-existing claim rather than having the chart create one.
    existingClaim: prometheus-iscsi-pvc
    # storageClass: rook-cephfs
  retention: "12w"  # ~100GB storage
  # NOTE(review): presumably Recreate so the old pod releases the volume
  # before the replacement starts - confirm.
  strategy:
    type: Recreate
  ingress:
    # A real boolean. The previous quoted "true" only worked because any
    # non-empty string is truthy in the chart templates; "false" would have
    # enabled the ingress as well.
    enabled: true
    annotations:
      # Annotation values must be strings, so these stay quoted.
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Secret (namespace/name) containing the trusted CA certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "prometheus.viktorbarzin.me"
    hosts:
      - "prometheus.viktorbarzin.me"
  # Alerts go to the in-cluster Alertmanager service; its certificate is not
  # verified.
  alertmanagers:
    - static_configs:
        - targets:
            - "prometheus-alertmanager.monitoring.svc.cluster.local"
            # - "alertmanager.viktorbarzin.me"
      tls_config:
        insecure_skip_verify: true
# Files handed to the Prometheus server. Only alerting rules are defined here;
# each rule group holds a single alert and every alert carries severity "page".
serverFiles:
  # prometheus.yml:
  #   alertingaaa:
  #     alertmanagers:
  #       - static_configs:
  #         targets: "alertmanager.viktorbarzin.lan"
  alerting_rules.yml:
    groups:
      # Fires when a target of the kubernetes-nodes job has been down for 1m.
      - name: NodeDown
        rules:
          - alert: NodeDown
            expr: 'up{job="kubernetes-nodes"} == 0'
            for: 1m
            labels:
              severity: page
            annotations:
              summary: "Node down."

      # Fires when the 1-minute load average exceeds 2.
      # NOTE(review): the 1m hold time is marked DEBUG; the intended value
      # appears to be the commented-out 10m - confirm before relying on it.
      - name: NodeHighCPUUsage
        rules:
          - alert: NodeHighCPUUsage
            expr: 'node_load1 > 2'
            # for: 10m
            for: 1m  # DEBUG
            labels:
              severity: page
            annotations:
              summary: "High CPU usage on node."

      # - name: PodStuckNotReady
      #   rules:
      #     - alert: PodStuckNotReady
      #       expr: kube_pod_status_ready{condition="true"} == 0
      #       for: 5m
      #       labels:
      #         severity: page
      #       annotations:
      #         summary: Pod stuck not ready.

      # Fires when a deployment has had fewer available replicas than its spec
      # asks for during 10m.
      - name: ReadyPodsInDeploymentLessThanSpec
        rules:
          - alert: ReadyPodsInDeploymentLessThanSpec
            expr: 'kube_deployment_status_replicas_available - on(namespace, deployment) kube_deployment_spec_replicas < 0'
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "Number of ready pods in deployment is less than what is defined in spec."

      # Fires immediately (no hold time) when a power supply reports an input
      # voltage below 200. The r730_idrac_ prefix is added to the metric name
      # by the snmp-idrac scrape job in extraScrapeConfigs.
      - name: PowerOutage
        rules:
          - alert: PowerOutage
            expr: 'r730_idrac_powerSupplyCurrentInputVoltage < 200'
            labels:
              severity: page
            annotations:
              summary: "Power voltage on a power supply is critically low indicating power outage."
# Additional scrape jobs, passed to Prometheus as one literal string.
#
# - snmp-idrac: scrapes the iDRAC through the SNMP exporter (the device address
#   becomes the "target" URL parameter and the "instance" label, while the
#   request itself goes to the exporter service) and prefixes every metric
#   name with r730_idrac_.
# - openwrt: scrapes the router's exporter directly and prefixes every metric
#   name with openwrt_.
#
# This file is rendered by Terraform's templatefile(), which treats a dollar
# sign followed by an opening brace as an interpolation. The capture-group
# reference in each "replacement" is therefore written with a doubled dollar
# sign so that Terraform emits it literally; unescaped, Terraform substituted
# the number 1 and every metric of a job was renamed to the same
# "<prefix>1" name.
extraScrapeConfigs: |
  - job_name: 'snmp-idrac'
    static_configs:
      - targets:
          - "idrac.viktorbarzin.lan:161"
    metrics_path: '/snmp'
    params:
      module: [dell_idrac]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'prometheus-snmp-exporter.monitoring.svc.cluster.local:9116'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'openwrt'
    static_configs:
      - targets:
          - "home.viktorbarzin.lan:9100"
    metrics_path: '/metrics'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'home.viktorbarzin.lan:9100'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'openwrt_$${1}'