initial
This commit is contained in:
commit
7a7bc34ae3
32 changed files with 4857 additions and 0 deletions
2051
modules/kubernetes/monitoring/grafana_chart_values.yaml
Normal file
2051
modules/kubernetes/monitoring/grafana_chart_values.yaml
Normal file
File diff suppressed because it is too large
Load diff
148
modules/kubernetes/monitoring/main.tf
Normal file
148
modules/kubernetes/monitoring/main.tf
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
# Inputs of the monitoring module.
#
# The first three are forwarded unchanged to the tls_secret module; the last
# one is injected into the Prometheus chart values template.
# `sensitive = true` (Terraform >= 0.14) keeps secret material out of
# plan/apply output.

variable "tls_secret_name" {
  description = "Name of the TLS secret; passed through to the tls_secret module."
  type        = string
}

variable "tls_crt" {
  description = "TLS certificate; passed through to the tls_secret module."
  type        = string
}

variable "tls_key" {
  description = "TLS private key; passed through to the tls_secret module."
  type        = string
  sensitive   = true
}

variable "alertmanager_account_password" {
  description = "Password rendered into prometheus_chart_values.tpl as alertmanager_mail_pass."
  type        = string
  sensitive   = true
}
|
||||
|
||||
# Delegates to the sibling setup_tls_secret module, handing it the three TLS
# inputs of this module together with the "monitoring" namespace.
module "tls_secret" {
  source = "../setup_tls_secret"

  namespace       = "monitoring"
  tls_secret_name = var.tls_secret_name
  tls_key         = var.tls_key
  tls_crt         = var.tls_crt
}
|
||||
|
||||
# Installs the prometheus-community "prometheus" chart as release "prometheus"
# in the "monitoring" namespace (created by Helm if it does not exist yet).
resource "helm_release" "prometheus" {
  name       = "prometheus"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus"

  namespace        = "monitoring"
  create_namespace = true

  # The values file is a Terraform template: it is rendered with the
  # Alertmanager mail password before being handed to Helm.
  values = [
    templatefile(
      "${path.module}/prometheus_chart_values.tpl",
      {
        alertmanager_mail_pass = var.alertmanager_account_password
      }
    ),
  ]
}
|
||||
|
||||
# Terraform gets angry with the 30k values file :/ Use Ansible until this is solved.
|
||||
# resource "helm_release" "prometheus_snmp_exporter" {
|
||||
# namespace = "monitoring"
|
||||
# create_namespace = true
|
||||
# name = "prometheus_exporter"
|
||||
|
||||
# repository = "https://prometheus-community.github.io/helm-charts"
|
||||
# chart = "prometheus-snmp-exporter"
|
||||
|
||||
# values = [file("${path.module}/prometheus_snmp_chart_values.yaml")]
|
||||
# }
|
||||
|
||||
# Grafana datasource provisioning file, stored as an Opaque Secret in the
# "monitoring" namespace and labelled grafana_datasource = "1".
# NOTE(review): the label is presumably what the Grafana chart's datasource
# sidecar selects on — confirm against grafana_chart_values.yaml.
# NOTE(review): nothing in this resource orders it after whatever creates the
# "monitoring" namespace — verify the namespace exists on a fresh apply.
resource "kubernetes_secret" "prometheus_grafana_datasource" {
  type = "Opaque"

  metadata {
    name      = "prometheus-grafana-datasource"
    namespace = "monitoring"

    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    # Indented heredoc ("<<-"): the common leading whitespace is stripped, so
    # the stored document starts at column 0.
    "datasource.yaml" = <<-EOT
      # config file version
      apiVersion: 1

      # list of datasources that should be deleted from the database
      #deleteDatasources:
      #  - name: Prometheus
      #    orgId: 1

      # list of datasources to insert/update depending
      # whats available in the database
      datasources:
        # <string, required> name of the datasource. Required
        - name: Prometheus
          # <string, required> datasource type. Required
          type: prometheus
          # <string, required> access mode. proxy or direct (Server or Browser in the UI). Required
          access: proxy
          # <int> org id. will default to orgId 1 if not specified
          orgId: 1
          # <string> url
          url: http://prometheus-server
          # <string> database password, if used
          password:
          # <string> database user, if used
          user:
          # <string> database name, if used
          database:
          # <bool> enable/disable basic auth
          basicAuth:
          # <string> basic auth username
          basicAuthUser:
          # <string> basic auth password
          basicAuthPassword:
          # <bool> enable/disable with credentials headers
          withCredentials:
          # <bool> mark as default datasource. Max one per org
          isDefault:
          # <map> fields that will be converted to json and stored in json_data
          #jsonData:
          #  graphiteVersion: \"1.1\"
          #  tlsAuth: true
          #  tlsAuthWithCACert: true
          # <string> json object of data that will be encrypted.
          #secureJsonData:
          #  tlsCACert: \"...\"
          #  tlsClientCert: \"...\"
          #  tlsClientKey: \"...\"
          version: 1
          # <bool> allow users to edit datasources from the UI.
          editable: false
    EOT
  }
}
|
||||
|
||||
# Statically provisioned 2Gi iSCSI volume (ext4, LUN 0) named
# "grafana-iscsi-pv", mountable read-write by a single node at a time.
resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
  metadata {
    name = "grafana-iscsi-pv"
  }

  spec {
    access_modes = ["ReadWriteOnce"]

    capacity = {
      storage = "2Gi"
    }

    persistent_volume_source {
      iscsi {
        iqn           = "iqn.2020-12.lan.viktorbarzin:storage:monitoring:grafana"
        target_portal = "iscsi.viktorbarzin.lan:3260"
        lun           = 0
        fs_type       = "ext4"
      }
    }
  }
}
|
||||
# 2Gi ReadWriteOnce claim "grafana-iscsi-pvc" in the "monitoring" namespace.
resource "kubernetes_persistent_volume_claim" "prometheus_grafana_pvc" {
  metadata {
    name      = "grafana-iscsi-pvc"
    namespace = "monitoring"
  }

  spec {
    access_modes = ["ReadWriteOnce"]

    # Pin the claim to the Grafana iSCSI volume declared in this file. Without
    # this the claim may bind to any other compatible PersistentVolume (or be
    # dynamically provisioned) instead of the intended one. Referencing the
    # resource also makes Terraform create the volume before the claim.
    volume_name = kubernetes_persistent_volume.prometheus_grafana_pv.metadata[0].name

    resources {
      requests = {
        storage = "2Gi"
      }
    }
  }
}
|
||||
|
||||
# Installs the upstream "grafana" chart as release "grafana" in the
# "monitoring" namespace (created by Helm if it does not exist yet).
resource "helm_release" "grafana" {
  name       = "grafana"
  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"

  namespace        = "monitoring"
  create_namespace = true

  # Static values file; read verbatim, no template rendering.
  values = [
    file("${path.module}/grafana_chart_values.yaml"),
  ]
}
|
||||
176
modules/kubernetes/monitoring/prometheus_chart_values.tpl
Normal file
176
modules/kubernetes/monitoring/prometheus_chart_values.tpl
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
# Helm values
|
||||
# all values - https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
|
||||
# Alertmanager deployment: storage on a pre-existing claim, Recreate rollout,
# and an nginx ingress that enforces HTTPS and client-certificate auth.
alertmanager:
  persistentVolume:
    # enabled: false
    existingClaim: alertmanager-iscsi-pvc
    # storageClass: rook-cephfs
  strategy:
    type: Recreate
  ingress:
    # Real boolean rather than the string "true": a quoted value is truthy in
    # Helm even when it reads "false".
    enabled: true
    annotations:
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted ca certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "alertmanager.viktorbarzin.me"
    hosts:
      - "alertmanager.viktorbarzin.me"
|
||||
# Alertmanager configuration: every alert is grouped by alertname and mailed
# through the in-cluster mail server.
alertmanagerFiles:
  alertmanager.yml:
    global:
      smtp_from: "alertmanager@viktorbarzin.me"
      # smtp_smarthost: "smtp.viktorbarzin.me:587"
      smtp_smarthost: "mailserver.mailserver.svc.cluster.local:587"
      smtp_auth_username: "alertmanager@viktorbarzin.me"
      # Rendered through Terraform's jsonencode so that quotes or backslashes
      # in the password are escaped; a JSON string is a valid YAML
      # double-quoted scalar.
      smtp_auth_password: ${jsonencode(alertmanager_mail_pass)}
      smtp_require_tls: true
    templates:
      - "/etc/alertmanager/template/*.tmpl"
    route:
      group_by: ["alertname"]
      group_wait: 3s
      group_interval: 5s
      repeat_interval: 1h
      receiver: SMTP_STARTTLS
    receivers:
      - name: 'SMTP_STARTTLS'
        email_configs:
          - to: "me@viktorbarzin.me"
            send_resolved: true
            tls_config:
              insecure_skip_verify: true
|
||||
|
||||
# Prometheus server: storage on a pre-existing claim, 12 weeks of retention,
# Recreate rollout, client-certificate-protected ingress and a static
# Alertmanager target.
server:
  # Enable me to delete metrics
  # extraFlags:
  #   - "web.enable-admin-api"
  persistentVolume:
    # enabled: false
    existingClaim: prometheus-iscsi-pvc
    # storageClass: rook-cephfs
  retention: "12w"  # ~100GB storage
  strategy:
    type: Recreate
  ingress:
    # Real boolean rather than the string "true": a quoted value is truthy in
    # Helm even when it reads "false".
    enabled: true
    annotations:
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Enable client certificate authentication
      nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
      # Create the secret containing the trusted ca certificates
      nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
    tls:
      - secretName: "tls-secret"
        hosts:
          - "prometheus.viktorbarzin.me"
    hosts:
      - "prometheus.viktorbarzin.me"
  alertmanagers:
    - static_configs:
        - targets:
            - "prometheus-alertmanager.monitoring.svc.cluster.local"
            # - "alertmanager.viktorbarzin.me"
      tls_config:
        insecure_skip_verify: true
|
||||
|
||||
# Alerting rules loaded by the Prometheus server. Every rule carries
# severity "page".
serverFiles:
  # prometheus.yml:
  #   alertingaaa:
  #     alertmanagers:
  #       - static_configs:
  #         targets: "alertmanager.viktorbarzin.lan"
  alerting_rules.yml:
    groups:
      # A kubernetes-nodes scrape target has been down for a minute.
      - name: NodeDown
        rules:
          - alert: NodeDown
            expr: 'up{job="kubernetes-nodes"} == 0'
            for: 1m
            labels:
              severity: page
            annotations:
              summary: Node down.

      # 1-minute load average above 2.
      - name: NodeHighCPUUsage
        rules:
          - alert: NodeHighCPUUsage
            expr: 'node_load1 > 2'
            # for: 10m
            for: 1m  # DEBUG
            labels:
              severity: page
            annotations:
              summary: High CPU usage on node.

      # - name: PodStuckNotReady
      #   rules:
      #     - alert: PodStuckNotReady
      #       expr: kube_pod_status_ready{condition="true"} == 0
      #       for: 5m
      #       labels:
      #         severity: page
      #       annotations:
      #         summary: Pod stuck not ready.

      # Fewer available replicas than the deployment spec asks for.
      - name: ReadyPodsInDeploymentLessThanSpec
        rules:
          - alert: ReadyPodsInDeploymentLessThanSpec
            expr: 'kube_deployment_status_replicas_available - on(namespace, deployment) kube_deployment_spec_replicas < 0'
            for: 10m
            labels:
              severity: page
            annotations:
              summary: Number of ready pods in deployment is less than what is defined in spec.

      # PSU input voltage reported by the iDRAC dropped below 200; fires
      # immediately (no "for" delay).
      - name: PowerOutage
        rules:
          - alert: PowerOutage
            expr: 'r730_idrac_powerSupplyCurrentInputVoltage < 200'
            labels:
              severity: page
            annotations:
              summary: Power voltage on a power supply is critically low indicating power outage.
|
||||
|
||||
# Extra scrape jobs, passed to the chart as one literal string.
#
# This file is rendered by Terraform's templatefile before Helm sees it, so a
# single dollar sign followed by an opening brace starts a Terraform
# interpolation. The Prometheus capture-group reference in the "replacement"
# fields is therefore written with a doubled dollar sign, which Terraform
# renders as a literal single one. Unescaped, it evaluates to the number 1 and
# every metric would be renamed to "<prefix>1".
extraScrapeConfigs: |
  - job_name: 'snmp-idrac'
    static_configs:
      - targets:
          - "idrac.viktorbarzin.lan:161"
    metrics_path: '/snmp'
    params:
      module: [dell_idrac]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'prometheus-snmp-exporter.monitoring.svc.cluster.local:9116'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'r730_idrac_$${1}'
  - job_name: 'openwrt'
    static_configs:
      - targets:
          - "home.viktorbarzin.lan:9100"
    metrics_path: '/metrics'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 'home.viktorbarzin.lan:9100'
    metric_relabel_configs:
      - source_labels: [ __name__ ]
        target_label: '__name__'
        action: replace
        regex: '(.*)'
        replacement: 'openwrt_$${1}'
|
||||
Loading…
Add table
Add a link
Reference in a new issue