fix monitoring stack [ci skip]
This commit is contained in:
parent
24bb9aca05
commit
40f4354316
3 changed files with 46 additions and 4 deletions
|
|
@ -17,8 +17,12 @@ alertmanager:
|
||||||
# nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
|
# nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
|
||||||
# Create the secret containing the trusted ca certificates
|
# Create the secret containing the trusted ca certificates
|
||||||
# nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
|
# nginx.ingress.kubernetes.io/auth-tls-secret: "default/ca-secret"
|
||||||
nginx.ingress.kubernetes.io/auth-url: "https://oauth2.viktorbarzin.me/oauth2/auth"
|
# nginx.ingress.kubernetes.io/auth-url: "https://oauth2.viktorbarzin.me/oauth2/auth"
|
||||||
nginx.ingress.kubernetes.io/auth-signin: "https://oauth2.viktorbarzin.me/oauth2/start?rd=/redirect/$http_host$escaped_request_uri"
|
# nginx.ingress.kubernetes.io/auth-signin: "https://oauth2.viktorbarzin.me/oauth2/start?rd=/redirect/$http_host$escaped_request_uri"
|
||||||
|
nginx.ingress.kubernetes.io/auth-url: "http://ak-outpost-authentik-embedded-outpost.authentik.svc.cluster.local:9000/outpost.goauthentik.io/auth/nginx"
|
||||||
|
nginx.ingress.kubernetes.io/auth-signin: "https://authentik.viktorbarzin.me/outpost.goauthentik.io/start?rd=$scheme%3A%2F%2F$host$escaped_request_uri"
|
||||||
|
nginx.ingress.kubernetes.io/auth-response-headers: "Set-Cookie,X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid"
|
||||||
|
nginx.ingress.kubernetes.io/auth-snippet: "proxy_set_header X-Forwarded-Host $http_host;"
|
||||||
tls:
|
tls:
|
||||||
- secretName: "tls-secret"
|
- secretName: "tls-secret"
|
||||||
hosts:
|
hosts:
|
||||||
|
|
@ -31,6 +35,34 @@ alertmanager:
|
||||||
pathType: Prefix
|
pathType: Prefix
|
||||||
serviceName: prometheus-server
|
serviceName: prometheus-server
|
||||||
servicePort: 80
|
servicePort: 80
|
||||||
|
config:
|
||||||
|
enabled: true
|
||||||
|
global:
|
||||||
|
smtp_from: "alertmanager@viktorbarzin.me"
|
||||||
|
# smtp_smarthost: "smtp.viktorbarzin.me:587"
|
||||||
|
smtp_smarthost: "mailserver.mailserver.svc.cluster.local:587"
|
||||||
|
smtp_auth_username: "alertmanager@viktorbarzin.me"
|
||||||
|
smtp_auth_password: "${alertmanager_mail_pass}"
|
||||||
|
smtp_require_tls: true
|
||||||
|
slack_api_url: "${alertmanager_slack_api_url}"
|
||||||
|
templates:
|
||||||
|
- "/etc/alertmanager/template/*.tmpl"
|
||||||
|
route:
|
||||||
|
group_by: ["alertname"]
|
||||||
|
group_wait: 3s
|
||||||
|
group_interval: 5s
|
||||||
|
repeat_interval: 1h
|
||||||
|
receiver: ALL
|
||||||
|
receivers:
|
||||||
|
- name: ALL
|
||||||
|
# email_configs:
|
||||||
|
# - to: "me@viktorbarzin.me"
|
||||||
|
# send_resolved: true
|
||||||
|
# tls_config:
|
||||||
|
# insecure_skip_verify: true
|
||||||
|
slack_configs:
|
||||||
|
- send_resolved: true
|
||||||
|
channel: "#general"
|
||||||
# web.external-url seems to be hardcoded, edited deployment manually
|
# web.external-url seems to be hardcoded, edited deployment manually
|
||||||
# extraArgs:
|
# extraArgs:
|
||||||
# web.external-url: "https://prometheus.viktorbarzin.me"
|
# web.external-url: "https://prometheus.viktorbarzin.me"
|
||||||
|
|
@ -110,7 +142,7 @@ server:
|
||||||
alertmanagers:
|
alertmanagers:
|
||||||
- static_configs:
|
- static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
- "prometheus-alertmanager.monitoring.svc.cluster.local"
|
- "prometheus-alertmanager.monitoring.svc.cluster.local:9093"
|
||||||
# - "alertmanager.viktorbarzin.me"
|
# - "alertmanager.viktorbarzin.me"
|
||||||
tls_config:
|
tls_config:
|
||||||
insecure_skip_verify: true
|
insecure_skip_verify: true
|
||||||
|
|
@ -214,7 +246,7 @@ serverFiles:
|
||||||
- name: HighPowerUsage
|
- name: HighPowerUsage
|
||||||
rules:
|
rules:
|
||||||
- alert: HighPowerUsage
|
- alert: HighPowerUsage
|
||||||
expr: (max(r730_idrac_amperageProbeReading) or on() vector(0)) > 112
|
expr: (max(r730_idrac_redfish_chassis_power_average_consumed_watts) or on() vector(0)) > 127
|
||||||
for: 60m
|
for: 60m
|
||||||
labels:
|
labels:
|
||||||
severity: page
|
severity: page
|
||||||
|
|
@ -238,6 +270,16 @@ serverFiles:
|
||||||
severity: page
|
severity: page
|
||||||
annotations:
|
annotations:
|
||||||
summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
|
summary: No iDRAC amperage reading. Can signal that prometheus is not scraping
|
||||||
|
- name: IngressSuccessRateDrop
|
||||||
|
rules:
|
||||||
|
- alert: IngressSuccessRateDrop
|
||||||
|
expr: (sum(rate(nginx_ingress_controller_requests{status!~"[4-5].*"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_requests[2m])) by (ingress)) < 0.95
|
||||||
|
# for: 10m
|
||||||
|
for: 1m # DEBUG
|
||||||
|
labels:
|
||||||
|
severity: page
|
||||||
|
annotations:
|
||||||
|
summary: Ingress {{ $labels.ingress }} success rate dropped below 95% - {{ $value }}%.
|
||||||
- name: OpenWRT High Memory Usage
|
- name: OpenWRT High Memory Usage
|
||||||
rules:
|
rules:
|
||||||
- alert: OpenWRT High Memory Usage
|
- alert: OpenWRT High Memory Usage
|
||||||
|
|
|
||||||
Binary file not shown.
BIN
terraform.tfvars
BIN
terraform.tfvars
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue