immich: fix slow context search — prewarm clip_index + latency alert/healthcheck
Context (smart) search latency was caused by the 665MB vchord clip_index
decaying out of PG shared_buffers (~33% resident -> ~1.8s cold ANN reads vs
~4ms warm), NOT by yesterday's ML MODEL_TTL/clip-keepalive change (CLIP textual
is warm ~15ms on GPU). The postStart prewarm runs once at pod start and
pg_prewarm.autoprewarm only re-warms at startup, so the index decays under job
buffer-pressure over days.
- clip-index-prewarm CronJob (immich, */5): pg_prewarm('clip_index') keeps the
whole index resident -> searches stay ~4ms.
- immich-search-probe CronJob (immich, */5): times a random-vector ANN query +
reads clip_index residency, pushes gauges to the Pushgateway.
- Prometheus alerts ImmichSmartSearchSlow / ImmichClipIndexColdCache /
ImmichSearchProbeStale (+ inhibition when the probe is stale).
- cluster_healthcheck.sh check #46 check_immich_search (TOTAL_CHECKS 45->46).
- Docs: infra CLAUDE.md immich note, monitoring.md, cluster-health skill.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
38c77048fd
commit
f201e4573e
6 changed files with 308 additions and 5 deletions
|
|
@ -27,7 +27,7 @@ KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
|
|||
[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="$(pwd)/config"
|
||||
KUBECTL=""
|
||||
JSON_RESULTS=()
|
||||
TOTAL_CHECKS=45
|
||||
TOTAL_CHECKS=46
|
||||
|
||||
# Parallel execution settings. Each check function is self-contained — it
|
||||
# only reads cluster state and mutates the in-memory counters / JSON_RESULTS
|
||||
|
|
@ -2961,6 +2961,57 @@ PYEOF
|
|||
fi
|
||||
}
|
||||
|
||||
# --- 46. Immich Smart (Context) Search ---
|
||||
# Smart search = ML embedding (kept warm by clip-keepalive) + a pgvector ANN
|
||||
# query over the vchord clip_index. The index must stay resident in PG
|
||||
# shared_buffers (kept warm by clip-index-prewarm); if it decays out of cache a
|
||||
# query pays a ~1.8s cold storage read instead of ~4ms warm. We measure both
|
||||
# the live ANN latency and the clip_index residency to catch the regression.
|
||||
check_immich_search() {
|
||||
section 46 "Immich Smart Search"
|
||||
local pg pct dur_ms dur detail=""
|
||||
|
||||
pg=$($KUBECTL get pods -n immich --no-headers 2>/dev/null | awk '/^immich-postgresql-/ && $3=="Running"{print $1; exit}')
|
||||
if [[ -z "$pg" ]]; then
|
||||
warn "immich-postgresql pod not running — cannot probe smart search"
|
||||
json_add "immich_search" "WARN" "immich-postgresql pod not running"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# clip_index residency in shared_buffers (single-quoted SQL → pass as one arg)
|
||||
pct=$($KUBECTL exec -n immich -c immich-postgresql "$pg" -- psql -U postgres -d immich -tAc \
|
||||
"SELECT COALESCE(round(100.0*count(*)*8192/greatest(pg_relation_size('clip_index'::regclass),1),1),0) FROM pg_buffercache b JOIN pg_class c ON b.relfilenode=pg_relation_filenode(c.oid) WHERE c.relname='clip_index'" 2>/dev/null | tr -d ' ')
|
||||
|
||||
# Representative random-vector ANN latency, measured in-pod (excludes exec overhead)
|
||||
dur_ms=$($KUBECTL exec -n immich -c immich-postgresql "$pg" -- bash -c \
|
||||
's=$(date +%s%3N); psql -U postgres -d immich -tAc "SELECT count(*) FROM (SELECT \"assetId\" FROM smart_search ORDER BY embedding <=> (SELECT embedding FROM smart_search ORDER BY random() LIMIT 1) LIMIT 100) x" >/dev/null 2>&1; e=$(date +%s%3N); echo $((e-s))' 2>/dev/null | tr -d ' ')
|
||||
|
||||
if ! [[ "$dur_ms" =~ ^[0-9]+$ ]]; then
|
||||
warn "Smart-search probe query failed (clip_index residency: ${pct:-?}%)"
|
||||
json_add "immich_search" "WARN" "probe query failed; residency=${pct:-?}%"
|
||||
return 0
|
||||
fi
|
||||
dur=$(awk "BEGIN{printf \"%.2f\", $dur_ms/1000}")
|
||||
detail="latency=${dur}s clip_index_resident=${pct:-?}%"
|
||||
|
||||
if (( dur_ms > 1500 )); then
|
||||
[[ "$QUIET" == true ]] && section_always 46 "Immich Smart Search"
|
||||
fail "Smart search SLOW: $detail — clip_index likely evicted; check clip-index-prewarm CronJob"
|
||||
json_add "immich_search" "FAIL" "$detail"
|
||||
elif [[ "$pct" =~ ^[0-9.]+$ ]] && awk "BEGIN{exit !($pct < 50)}"; then
|
||||
[[ "$QUIET" == true ]] && section_always 46 "Immich Smart Search"
|
||||
fail "clip_index only ${pct}% resident in PG cache — searches cold ($detail)"
|
||||
json_add "immich_search" "FAIL" "$detail"
|
||||
elif (( dur_ms > 500 )) || { [[ "$pct" =~ ^[0-9.]+$ ]] && awk "BEGIN{exit !($pct < 90)}"; }; then
|
||||
[[ "$QUIET" == true ]] && section_always 46 "Immich Smart Search"
|
||||
warn "Smart search degraded: $detail"
|
||||
json_add "immich_search" "WARN" "$detail"
|
||||
else
|
||||
pass "Smart search healthy: $detail"
|
||||
json_add "immich_search" "PASS" "$detail"
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Summary ---
|
||||
print_summary() {
|
||||
if [[ "$JSON" == true ]]; then
|
||||
|
|
@ -3029,6 +3080,7 @@ main() {
|
|||
check_monitoring_prom_am check_monitoring_vault check_monitoring_css
|
||||
check_external_replicas check_external_divergence check_pve_thermals
|
||||
check_pve_load check_external_traefik_5xx check_ha_status_dashboard
|
||||
check_immich_search
|
||||
)
|
||||
|
||||
# Auto-fix mutates cluster state inside individual checks — keep that
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue