infra/stacks/descheduler/values.yaml

311 lines
8.4 KiB
YAML
Raw Normal View History

# Source from https://github.com/kubernetes-sigs/descheduler/blob/master/charts/descheduler/values.yaml
# Default values for descheduler.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# CronJob or Deployment
kind: CronJob
image:
repository: registry.k8s.io/descheduler/descheduler
# Overrides the image tag whose default is the chart version
tag: ""
pullPolicy: IfNotPresent
imagePullSecrets:
# - name: container-registry-secret
resources:
requests:
cpu: 500m
memory: 256Mi
limits:
memory: 256Mi
ports:
- containerPort: 10258
protocol: TCP
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
# podSecurityContext -- [Security context for pod](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/)
podSecurityContext:
{}
# fsGroup: 1000
nameOverride: ""
fullnameOverride: ""
# -- Override the deployment namespace; defaults to .Release.Namespace
namespaceOverride: ""
# labels that'll be applied to all resources
commonLabels: {}
cronJobApiVersion: "batch/v1"
etcd-load-reduction: remove VPA/Goldilocks, disable kyverno reporting, descheduler hourly The control-plane flap (etcd lease-renewal timeouts) recurred. Rather than move etcd to SSD (code-oflt, deferred again), the chosen direction is to REDUCE etcd load enough that the leader-election-timeout band-aid (renew 10s->30s) becomes removable. These are the big, clean cuts: 1. Remove VPA/Goldilocks (stacks/vpa emptied). All 349 VPAs ran updateMode=Off (no auto-right-sizing) yet cost ~800 etcd objects + continuous recommender writes + a pod-creation admission webhook, purely to feed a dashboard. krr (Dockerized, on-demand) replaces it. Reverses the re-add after memory 2431. 2. Disable kyverno reporting (admission/aggregate/background). policyReports were already off, so the pipeline generated ephemeralreports + an hourly all-resource etcd re-scan for NO user-facing output. Admission enforcement (deny-* policies) and Keel mutation are unaffected; violations surface via Loki->Slack. 3. descheduler */5 -> hourly (fewer list/evict cycles; rebalancing isn't urgent). Deferred (poor ROI / unsafe as planned): ESO refreshInterval 15m->1h is a ~20-stack sprawl for ~0.1 writes/s; keel background=false is invalid for a mutate-existing policy and its churn is apply-time not steady-state. Both filed as follow-up beads. Post-apply: delete the chart-orphaned VPA CRDs to cascade-clean leftover CRs. Then measure etcd apply-latency and revert the timeouts. Docs updated (VPA/Goldilocks -> krr). See memory 5402-5407. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 19:41:22 +00:00
schedule: "0 * * * *" # hourly (was */5; 2026-06-12 etcd-load-reduction — fewer list/evict cycles, rebalancing isn't time-critical)
suspend: false
# startingDeadlineSeconds: 200
successfulJobsHistoryLimit: 10
# failedJobsHistoryLimit: 1
# ttlSecondsAfterFinished 600
# timeZone: Etc/UTC
# Required when running as a Deployment
deschedulingInterval: 5m
# Specifies the replica count for Deployment
# Set leaderElection if you want to use more than 1 replica
# Set affinity.podAntiAffinity rule if you want to schedule onto a node
# only if that node is in the same zone as at least one already-running descheduler
replicas: 1
# Specifies whether Leader Election resources should be created
# Required when running as a Deployment
# NOTE: Leader election can't be activated if DryRun enabled
leaderElection: {}
# enabled: true
# leaseDuration: 15s
# renewDeadline: 10s
# retryPeriod: 2s
# resourceLock: "leases"
# resourceName: "descheduler"
# resourceNamespace: "kube-system"
command:
- "/bin/descheduler"
cmdOptions:
v: 3
# Recommended to use the latest Policy API version supported by the Descheduler app version
deschedulerPolicyAPIVersion: "descheduler/v1alpha2"
# deschedulerPolicy contains the policies the descheduler will execute.
# To use policies stored in an existing configMap use:
# NOTE: The name of the cm should comply to {{ template "descheduler.fullname" . }}
# deschedulerPolicy: {}
deschedulerPolicy:
# nodeSelector: "key1=value1,key2=value2"
# maxNoOfPodsToEvictPerNode: 10
maxNoOfPodsToEvictTotal: 10
# maxNoOfPodsToEvictPerNamespace: 10
# ignorePvcPods: true
# evictLocalStoragePods: true
# evictDaemonSetPods: true
# tracing:
# collectorEndpoint: otel-collector.observability.svc.cluster.local:4317
# transportCert: ""
# serviceName: ""
# serviceNamespace: ""
# sampleRate: 1.0
# fallbackToNoOpProviderOnError: true
metricsCollector:
enabled: true
profiles:
- name: default
pluginConfig:
- name: DefaultEvictor
args:
ignorePvcPods: true
evictLocalStoragePods: true
- name: RemoveDuplicates
- name: RemovePodsHavingTooManyRestarts
args:
podRestartThreshold: 2
includingInitContainers: true
states:
- CrashLoopBackOff
- name: RemovePodsViolatingNodeAffinity
args:
nodeAffinityType:
- requiredDuringSchedulingIgnoredDuringExecution
- name: RemovePodsViolatingNodeTaints
- name: RemovePodsViolatingInterPodAntiAffinity
- name: RemovePodsViolatingTopologySpreadConstraint
- name: LowNodeUtilization
args:
evictableNamespaces:
exclude:
- "dbaas" # let's not meddle with the dbs
thresholds:
cpu: 50
memory: 50
# pods: 20
targetThresholds:
cpu: 80
memory: 80
# pods: 30
metricsUtilization:
metricsServer: true
- name: PodLifeTime
args:
maxPodLifeTimeSeconds: 604800
namespaces:
exclude:
- "dbaas" # let's not meddle with the dbs
- "kube-system"
- "calico-system"
- "calico-apiserver"
- "metallb-system"
- "monitoring"
- "authentik"
- name: "RemoveFailedPods"
args:
reasons:
- "CrashLoopBackOff"
- "Error"
- "ContainerStatusUnknown"
- "ImagePullBackOff"
# exitCodes:
# - 1
includingInitContainers: true
# minPodLifetimeSeconds: 0
plugins:
balance:
enabled:
- RemoveDuplicates
- RemovePodsViolatingTopologySpreadConstraint
- LowNodeUtilization
deschedule:
enabled:
- RemovePodsHavingTooManyRestarts
- RemovePodsViolatingNodeTaints
- RemovePodsViolatingNodeAffinity
- RemovePodsViolatingInterPodAntiAffinity
- PodLifeTime
- RemoveFailedPods
- name: idrac-restart
pluginConfig:
- name: DefaultEvictor
args:
ignorePvcPods: true
evictLocalStoragePods: true
- name: PodLifeTime
args:
maxPodLifeTimeSeconds: 21600
namespaces:
include:
- "monitoring"
labelSelector:
matchLabels:
app: idrac-redfish-exporter
plugins:
deschedule:
enabled:
- PodLifeTime
priorityClassName: system-cluster-critical
nodeSelector: {}
# foo: bar
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/e2e-az-name
# operator: In
# values:
# - e2e-az1
# - e2e-az2
# podAntiAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# - labelSelector:
# matchExpressions:
# - key: app.kubernetes.io/name
# operator: In
# values:
# - descheduler
# topologyKey: "kubernetes.io/hostname"
topologySpreadConstraints: []
# - maxSkew: 1
# topologyKey: kubernetes.io/hostname
# whenUnsatisfiable: DoNotSchedule
# labelSelector:
# matchLabels:
# app.kubernetes.io/name: descheduler
tolerations: []
# - key: 'management'
# operator: 'Equal'
# value: 'tool'
# effect: 'NoSchedule'
rbac:
# Specifies whether RBAC resources should be created
create: true
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: false
# The name of the ServiceAccount to use.
# If not set and create is true, a name is generated using the fullname template
name: "descheduler-sa"
# Specifies custom annotations for the serviceAccount
annotations: {}
podAnnotations: {}
podLabels: {}
dnsConfig: {}
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
service:
enabled: false
# @param service.ipFamilyPolicy [string], support SingleStack, PreferDualStack and RequireDualStack
#
ipFamilyPolicy: ""
# @param service.ipFamilies [array] List of IP families (e.g. IPv4, IPv6) assigned to the service.
# Ref: https://kubernetes.io/docs/concepts/services-networking/dual-stack/
# E.g.
# ipFamilies:
# - IPv6
# - IPv4
ipFamilies: []
serviceMonitor:
enabled: false
# The namespace where Prometheus expects to find service monitors.
# namespace: ""
# Add custom labels to the ServiceMonitor resource
additionalLabels:
{}
# prometheus: kube-prometheus-stack
interval: ""
# honorLabels: true
insecureSkipVerify: true
serverName: null
metricRelabelings:
[]
# - action: keep
# regex: 'descheduler_(build_info|pods_evicted)'
# sourceLabels: [__name__]
relabelings:
[]
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace