update claude knowledge: OpenClaw deployment and tg wrapper learnings [ci skip]
This commit is contained in:
parent
92cc3f01c1
commit
916aa6c6cb
2 changed files with 45 additions and 5 deletions
|
|
@ -20,6 +20,8 @@
|
||||||
|
|
||||||
## Known Issues
|
## Known Issues
|
||||||
- **CrowdSec Helm upgrade times out**: `terragrunt apply` on platform stack causes CrowdSec Helm release to get stuck in `pending-upgrade`. Workaround: `helm rollback crowdsec <rev> -n crowdsec`. Root cause: likely ResourceQuota CPU at 302% preventing pods from passing readiness probes. Needs investigation.
|
- **CrowdSec Helm upgrade times out**: `terragrunt apply` on platform stack causes CrowdSec Helm release to get stuck in `pending-upgrade`. Workaround: `helm rollback crowdsec <rev> -n crowdsec`. Root cause: likely ResourceQuota CPU at 302% preventing pods from passing readiness probes. Needs investigation.
|
||||||
|
- **OpenClaw config is writable**: OpenClaw writes to `openclaw.json` at runtime (doctor --fix, plugin auto-enable). Never use subPath ConfigMap mounts for it — use an init container to copy into a writable volume. Needs 2Gi memory + `NODE_OPTIONS=--max-old-space-size=1536`.
|
||||||
|
- **Goldilocks VPA sets limits**: When increasing memory requests, always set explicit `limits` too — Goldilocks may have added a limit that blocks the change.
|
||||||
|
|
||||||
## User Preferences
|
## User Preferences
|
||||||
- **Calendar**: Nextcloud at `nextcloud.viktorbarzin.me`
|
- **Calendar**: Nextcloud at `nextcloud.viktorbarzin.me`
|
||||||
|
|
|
||||||
|
|
@ -100,6 +100,17 @@ parse_args() {
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
|
KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
|
||||||
|
|
||||||
|
# Auto-source UPTIME_KUMA_PASSWORD from terraform.tfvars if not set
|
||||||
|
if [[ -z "${UPTIME_KUMA_PASSWORD:-}" ]]; then
|
||||||
|
local script_dir tfvars_file
|
||||||
|
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
tfvars_file="${script_dir}/../terraform.tfvars"
|
||||||
|
if [[ -f "$tfvars_file" ]]; then
|
||||||
|
UPTIME_KUMA_PASSWORD=$(grep 'uptime_kuma_password' "$tfvars_file" | head -1 | sed 's/.*= *"\(.*\)"/\1/')
|
||||||
|
export UPTIME_KUMA_PASSWORD
|
||||||
|
fi
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- 1. Node Status ---
|
# --- 1. Node Status ---
|
||||||
|
|
@ -913,7 +924,7 @@ check_helm_releases() {
|
||||||
|
|
||||||
local releases detail="" had_issue=false status="PASS"
|
local releases detail="" had_issue=false status="PASS"
|
||||||
|
|
||||||
releases=$(helm list -A --kubeconfig "$KUBECONFIG_PATH" --all -o json 2>/dev/null) || {
|
releases=$(helm list --all-namespaces --kubeconfig "$KUBECONFIG_PATH" -o json 2>/dev/null) || {
|
||||||
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
|
[[ "$QUIET" == true ]] && section_always 18 "Helm Release Health"
|
||||||
warn "Cannot list Helm releases"
|
warn "Cannot list Helm releases"
|
||||||
json_add "helm_releases" "WARN" "Cannot list"
|
json_add "helm_releases" "WARN" "Cannot list"
|
||||||
|
|
@ -1258,10 +1269,24 @@ check_cloudflare_tunnel() {
|
||||||
# --- 25. Advanced CPU Monitoring (Prometheus) ---
|
# --- 25. Advanced CPU Monitoring (Prometheus) ---
|
||||||
check_prometheus_cpu() {
|
check_prometheus_cpu() {
|
||||||
section 25 "Advanced CPU Monitoring"
|
section 25 "Advanced CPU Monitoring"
|
||||||
local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
|
|
||||||
local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
|
local cpu_query="100%20-%20(avg%20by%20(instance)%20(irate(node_cpu_seconds_total%7Bmode%3D%22idle%22%7D%5B5m%5D))%20*%20100)"
|
||||||
local detail="" had_issue=false status="PASS"
|
local detail="" had_issue=false status="PASS"
|
||||||
|
|
||||||
|
# Use the in-cluster Prometheus DNS name when reachable; otherwise fall back to a kubectl port-forward
|
||||||
|
local prom_url pf_pid=""
|
||||||
|
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
|
||||||
|
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
|
||||||
|
else
|
||||||
|
local pf_port
|
||||||
|
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
|
||||||
|
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
|
||||||
|
pf_pid=$!
|
||||||
|
sleep 2
|
||||||
|
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
|
||||||
|
fi
|
||||||
|
# Clean up the port-forward (if one was started) when this function returns
|
||||||
|
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
|
||||||
|
|
||||||
# Try to query Prometheus for CPU metrics
|
# Try to query Prometheus for CPU metrics
|
||||||
local cpu_data
|
local cpu_data
|
||||||
cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
|
cpu_data=$(curl -s --connect-timeout 10 "${prom_url}?query=${cpu_query}" 2>/dev/null) || {
|
||||||
|
|
@ -1335,9 +1360,22 @@ except Exception as e:
|
||||||
# --- 26. Power Monitoring ---
|
# --- 26. Power Monitoring ---
|
||||||
check_power_monitoring() {
|
check_power_monitoring() {
|
||||||
section 26 "Power Monitoring"
|
section 26 "Power Monitoring"
|
||||||
local prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
|
|
||||||
local detail="" had_issue=false status="PASS"
|
local detail="" had_issue=false status="PASS"
|
||||||
|
|
||||||
|
# Use the in-cluster Prometheus DNS name when reachable; otherwise fall back to a kubectl port-forward
|
||||||
|
local prom_url pf_pid=""
|
||||||
|
if curl -s --connect-timeout 2 "http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=up" &>/dev/null; then
|
||||||
|
prom_url="http://prometheus-server.monitoring.svc.cluster.local/api/v1/query"
|
||||||
|
else
|
||||||
|
local pf_port
|
||||||
|
pf_port=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
|
||||||
|
$KUBECTL port-forward -n monitoring svc/prometheus-server "$pf_port:80" &>/dev/null &
|
||||||
|
pf_pid=$!
|
||||||
|
sleep 2
|
||||||
|
prom_url="http://127.0.0.1:${pf_port}/api/v1/query"
|
||||||
|
fi
|
||||||
|
trap '[[ -n "$pf_pid" ]] && kill $pf_pid 2>/dev/null || true' RETURN
|
||||||
|
|
||||||
# GPU Power monitoring
|
# GPU Power monitoring
|
||||||
local gpu_query="DCGM_FI_DEV_POWER_USAGE"
|
local gpu_query="DCGM_FI_DEV_POWER_USAGE"
|
||||||
local gpu_data
|
local gpu_data
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue