improve search relevance: hybrid scoring + AND-then-OR matching

- Blend BM25/ts_rank relevance with importance instead of sorting by one
  dimension only. Default mode: 40% relevance + 60% importance. Relevance
  mode: 70% relevance + 30% importance.
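- Try AND-match first for precise results, fall back to OR-match when too
  few results are found: the PostgreSQL path broadens when a multi-word
  query returns fewer rows than the requested limit, while the SQLite path
  broadens only when the AND query returns no rows at all. Prevents
  single-word matches from flooding results.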
- Applied to both SQLite (local) and PostgreSQL (API) search paths.
This commit is contained in:
Viktor Barzin 2026-03-15 22:36:35 +00:00
parent 5a73dff622
commit 17206cd855
No known key found for this signature in database
GPG key ID: 0EB088298288D958
2 changed files with 59 additions and 20 deletions

View file

@ -151,10 +151,13 @@ async def recall_memories(body: MemoryRecall, user: AuthUser = Depends(get_curre
query_text = f"{body.context} {body.expanded_query}".strip()
order_clause = "ts_rank(search_vector, query) DESC"
# Hybrid scoring: blend ts_rank relevance (0-1) with importance (0-1)
hybrid_score = "(ts_rank(search_vector, query) * 0.7 + importance * 0.3)"
if body.sort_by == "importance":
order_clause = "importance DESC, ts_rank(search_vector, query) DESC"
elif body.sort_by == "recency":
hybrid_score = "(ts_rank(search_vector, query) * 0.4 + importance * 0.6)"
order_clause = f"{hybrid_score} DESC"
if body.sort_by == "recency":
order_clause = "created_at DESC"
category_filter = ""
@ -164,6 +167,8 @@ async def recall_memories(body: MemoryRecall, user: AuthUser = Depends(get_curre
params.append(body.category)
async with pool.acquire() as conn:
# Try AND-match first (plainto_tsquery ANDs by default), fall back to
# OR-match via individual word disjunction for broader results
rows = await conn.fetch(
f"""
SELECT id, content, category, tags, importance, is_sensitive,
@ -180,6 +185,35 @@ async def recall_memories(body: MemoryRecall, user: AuthUser = Depends(get_curre
*params,
)
# If AND-match returned too few results, broaden to OR-match
if len(rows) < body.limit and query_text:
words = query_text.split()
if len(words) > 1:
or_tsquery = " | ".join(w for w in words if w)
or_params: list = [user.user_id, or_tsquery, body.limit]
or_cat_filter = ""
if body.category:
or_cat_filter = "AND category = $4"
or_params.append(body.category)
seen_ids = {r["id"] for r in rows}
or_rows = await conn.fetch(
f"""
SELECT id, content, category, tags, importance, is_sensitive,
ts_rank(search_vector, query) AS rank,
created_at, updated_at
FROM memories, to_tsquery('english', $2) query
WHERE user_id = $1
AND deleted_at IS NULL
AND search_vector @@ query
{or_cat_filter}
ORDER BY {order_clause}
LIMIT $3
""",
*or_params,
)
rows = list(rows) + [r for r in or_rows if r["id"] not in seen_ids]
rows = rows[:body.limit]
results = []
for row in rows:
content = row["content"]

View file

@ -435,30 +435,35 @@ class MemoryServer:
import sqlite3
all_terms = f"{context} {expanded_query}".strip()
words = all_terms.split()
fts_query = " OR ".join(f'"{w.replace(chr(34), "")}"' for w in words if w)
words = [w.replace(chr(34), "") for w in all_terms.split() if w]
and_query = " AND ".join(f'"{w}"' for w in words)
or_query = " OR ".join(f'"{w}"' for w in words)
# Hybrid scoring: blend BM25 relevance with importance
# bm25() returns negative values (lower = better match), so negate it
order = (
"bm25(memories_fts), m.importance DESC"
"(-bm25(memories_fts) * 0.7 + m.importance * 0.3) DESC"
if sort_by == "relevance"
else "m.importance DESC, m.created_at DESC"
else "(-bm25(memories_fts) * 0.4 + m.importance * 0.6) DESC"
)
base_select = (
f"SELECT m.id, m.content, m.category, m.tags, m.importance, m.created_at "
f"FROM memories m JOIN memories_fts fts ON m.id = fts.rowid "
)
cursor = self.sqlite_conn.cursor()
try:
if category:
# Try AND first for precise matches, fall back to OR for broader results
cat_filter = "AND m.category = ?" if category else ""
for fts_query in (and_query, or_query):
params = [fts_query, category, limit] if category else [fts_query, limit]
cursor.execute(
f"SELECT m.id, m.content, m.category, m.tags, m.importance, m.created_at "
f"FROM memories m JOIN memories_fts fts ON m.id = fts.rowid "
f"WHERE memories_fts MATCH ? AND m.category = ? ORDER BY {order} LIMIT ?",
(fts_query, category, limit),
f"{base_select}WHERE memories_fts MATCH ? {cat_filter} ORDER BY {order} LIMIT ?",
tuple(p for p in params if p is not None),
)
else:
cursor.execute(
f"SELECT m.id, m.content, m.category, m.tags, m.importance, m.created_at "
f"FROM memories m JOIN memories_fts fts ON m.id = fts.rowid "
f"WHERE memories_fts MATCH ? ORDER BY {order} LIMIT ?",
(fts_query, limit),
)
rows = cursor.fetchall()
rows = cursor.fetchall()
if rows:
break
except sqlite3.OperationalError:
like = f"%{context}%"
if category: