wiki search people tested pipeline

This commit is contained in:
Alvis
2026-03-05 11:22:34 +00:00
parent ea77b2308b
commit ec45d255f0
19 changed files with 1717 additions and 257 deletions

View File

@@ -13,16 +13,19 @@ Tests:
8. Name recall — "what is your name?" → reply contains <RandomName>
9. Timing profile + bottleneck report
10. Easy benchmark — 10 easy questions → all must route to light
11. Medium benchmark — 10 medium questions → must route to medium (or light, never complex)
11. Medium benchmark — 11 medium questions → must route to medium (or light, never complex)
12. Hard benchmark — 10 /think questions → all must route to complex; VRAM flush verified
13. Memory benchmark — store 5 facts, recall with 10 questions → verify keyword presence
14. Dedup test — same fact sent twice → Qdrant must not grow by 2
Usage:
python3 test_pipeline.py [--chat-id CHAT_ID]
[--bench-only] skip sections 1-9, run 10+11+12
[--easy-only] skip 1-9 and 11+12, run only section 10
[--medium-only] skip 1-9 and 10+12, run only section 11
[--hard-only] skip 1-9 and 10+11, run only section 12
[--no-bench] skip sections 10-12
[--bench-only] skip sections 1-9, run 10+11+12+13
[--easy-only] skip 1-9 and 11+12+13, run only section 10
[--medium-only] skip 1-9 and 10+12+13, run only section 11
[--hard-only] skip 1-9 and 10+11+13, run only section 12
[--memory-only] skip 1-9 and 10+11+12, run only section 13
[--no-bench] skip sections 10-13
Timing is extracted from deepagents container logs, not estimated from sleeps.
"""
@@ -79,6 +82,7 @@ BENCHMARK = {
"do you remember what we talked about before?",
"search for the best coffee shops in Tokyo",
"what is happening in the tech industry this week?",
"what's the weather like today?",
],
"hard": [
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
@@ -187,18 +191,13 @@ def parse_run_block(lines, msg_prefix):
reply_data = None
for j, line in enumerate(block):
# Track last non-tool AIMessage (the final reply)
# Track last non-tool AIMessage (the final reply) — truncated at 150 chars in logs,
# used only as fallback if reply_text line is absent (older server versions)
if "AIMessage:" in line and "" not in line:
txt = line.split("AIMessage:", 1)[-1].strip()
if txt:
last_ai_text = txt
# For light tier: router reply is stored in _conversation_buffers directly
# so there may be no AIMessage log — grab from tier=light line
if "[agent] tier=light" in line and "message=" in line:
# Extract preview text logged elsewhere if available
pass
m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
if m:
# Extract optional tier tag at end of line
@@ -209,13 +208,21 @@ def parse_run_block(lines, msg_prefix):
"llm": float(m.group(2)),
"send": float(m.group(3)),
"tier": tier,
"reply_text": last_ai_text,
"reply_text": last_ai_text, # may be overwritten by reply_text line below
"memory_s": None,
"memory_error": False,
"_j": j,
}
break
# Read full reply_text from dedicated log line (written immediately after replied-in line)
if reply_data is not None:
next_lines = block[reply_data["_j"] + 1: reply_data["_j"] + 3]
for line in next_lines:
if line.startswith("[agent] reply_text:"):
reply_data["reply_text"] = line[len("[agent] reply_text:"):].strip()
break
if reply_data is None:
return None # reply not in logs yet
@@ -281,16 +288,19 @@ parser.add_argument("--medium-only", action="store_true",
help="Skip sections 1-9 and 10, run only section 11 (medium benchmark)")
parser.add_argument("--hard-only", action="store_true",
help="Skip sections 1-9 and 10+11, run only section 12 (hard benchmark)")
parser.add_argument("--memory-only", action="store_true",
help="Skip sections 1-9 and 10+11+12, run only section 13 (memory benchmark)")
parser.add_argument("--no-bench", action="store_true",
help="Skip sections 10-12 (all benchmarks)")
help="Skip sections 10-13 (all benchmarks)")
args = parser.parse_args()
CHAT_ID = args.chat_id
# Derived flags for readability
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only or args.memory_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only and not args.memory_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only and not args.memory_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only and not args.memory_only
_run_memory = not args.no_bench and not args.easy_only and not args.medium_only and not args.hard_only
random_name = random.choice(NAMES)
@@ -880,6 +890,263 @@ if _run_hard:
)
# ── 13. Memory benchmark — store facts, recall with keyword verification ───────
if _run_memory:
    # Every fact is randomised per run so memories left over from a previous
    # run can never satisfy this run's recall questions.
    _mem_name = random.choice([
        "Alice", "Bruno", "Camille", "Diego", "Elena",
        "Farid", "Greta", "Hiroshi", "Irina", "Jonas",
    ])
    _mem_city = random.choice([
        "Tokyo", "Berlin", "Cairo", "Sydney", "Oslo",
        "Nairobi", "Lisbon", "Seoul", "Montreal", "Bangkok",
    ])
    _mem_allergy = random.choice(["nuts", "gluten", "dairy", "shellfish", "eggs"])
    _mem_job = random.choice([
        ("software engineer", "startup"),
        ("data scientist", "research lab"),
        ("product manager", "tech company"),
        ("DevOps engineer", "cloud provider"),
    ])
    _mem_lang = random.choice(["Python", "Rust", "Go", "TypeScript", "Kotlin"])
    _mem_pet_name = random.choice([
        "Whiskers", "Biscuit", "Mango", "Pebble", "Shadow",
        "Noodle", "Cheddar", "Cosmo", "Pippin", "Ziggy",
    ])
    print(f"\n[{INFO}] 13. Memory benchmark")
    print(f" name={_mem_name} city={_mem_city} allergy={_mem_allergy} "
          f"job={_mem_job[0]}@{_mem_job[1]} lang={_mem_lang} pet={_mem_pet_name}")
    print(f" Storing 5 facts, then querying with 10 recall questions")
    print(f" Chat ID: {CHAT_ID}")
    print()
    # Wipe Qdrant collection and restart openmemory to eliminate stale data interference.
    # Deleting the collection alone causes 404s — openmemory holds a live reference to it.
    try:
        import urllib.request as _ur
        _req = _ur.Request(f"{QDRANT}/collections/adolf_memories", method="DELETE")
        with _ur.urlopen(_req, timeout=5):
            pass
        print(f" [{INFO}] Wiped adolf_memories collection")
    except Exception as e:
        # Best-effort: a failed wipe only risks stale-memory false positives.
        print(f" [{WARN}] Could not wipe collection: {e}")
    try:
        subprocess.run(
            ["docker", "compose", "-f", COMPOSE_FILE, "restart", "openmemory"],
            capture_output=True, timeout=30,
        )
        time.sleep(6)  # wait for openmemory to reinitialize and recreate collection
        print(f" [{INFO}] Restarted openmemory — fresh collection ready")
    except Exception as e:
        print(f" [{WARN}] Could not restart openmemory: {e}")
    MEMORY_FACTS = [
        f"My name is {_mem_name} and I live in {_mem_city}",
        f"I prefer vegetarian food and I'm allergic to {_mem_allergy}",
        f"I work as a {_mem_job[0]} at a {_mem_job[1]}",
        f"My favorite programming language is {_mem_lang}",
        f"I have a cat named {_mem_pet_name}",
    ]
    MEMORY_RECALLS = [
        # (question, [keywords that must appear in reply])
        ("What is my name?", [_mem_name.lower()]),
        ("Where do I live?", [_mem_city.lower()]),
        ("Do I have any food allergies?", [_mem_allergy.lower()]),
        ("What is my job?", [_mem_job[0].split()[0].lower()]),
        ("What programming language do I prefer?", [_mem_lang.lower()]),
        ("Do I have any pets?", [_mem_pet_name.lower()]),
        ("Am I vegetarian or do I eat meat?", ["vegetarian"]),
        ("What city am I in?", [_mem_city.lower()]),
        ("Tell me what you know about me", [_mem_name.lower(), _mem_city.lower()]),
        ("What's the name of my pet?", [_mem_pet_name.lower()]),
    ]
    MEMORY_STORE_TIMEOUT = 180   # seconds per fact
    MEMORY_RECALL_TIMEOUT = 180  # seconds per question
    # ── Store facts ──────────────────────────────────────────────────────────
    print(f" Storing {len(MEMORY_FACTS)} facts...")
    store_ok = 0
    for i, fact in enumerate(MEMORY_FACTS, 1):
        print(f" [mem-store-{i:02d}] {fact!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": fact, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            continue
        # need_memory=True: the run only counts once the memory write is logged.
        found = wait_for(f"mem-store-{i:02d}", fact, timeout_s=MEMORY_STORE_TIMEOUT, need_memory=True)
        if found:
            store_ok += 1
            print(f" → [{PASS}] stored tier={found['tier']} mem={found['memory_s']}s")
        else:
            print(f" → [{FAIL}] timeout")
    report(f"All memory facts stored ({store_ok}/{len(MEMORY_FACTS)})",
           store_ok == len(MEMORY_FACTS))
    # Wait for async memory extraction to settle — poll Qdrant until point count stabilises
    print(f"\n Waiting for memory extraction to settle (up to 60s)...")
    _prev_count = -1
    _stable_ticks = 0
    for _ in range(30):
        time.sleep(2)
        try:
            _, body = get(f"{QDRANT}/collections/adolf_memories")
            _cur_count = json.loads(body).get("result", {}).get("points_count", 0)
        except Exception:
            _cur_count = _prev_count  # transient Qdrant error: treat as "no change"
        if _cur_count == _prev_count:
            _stable_ticks += 1
            if _stable_ticks >= 3:  # stable for 6s
                break
        else:
            _stable_ticks = 0
        _prev_count = _cur_count
    print(f" Memory settled: {_cur_count} points in Qdrant")
    # ── Recall questions ─────────────────────────────────────────────────────
    print(f"\n Querying with {len(MEMORY_RECALLS)} recall questions...")
    recall_results = []  # (question, keywords, reply_text, passed)
    for i, (question, keywords) in enumerate(MEMORY_RECALLS, 1):
        print(f" [mem-recall-{i:02d}] {question!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                recall_results.append((question, keywords, None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            recall_results.append((question, keywords, None, False))
            continue
        # Poll container logs for the reply to this exact question.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEMORY_RECALL_TIMEOUT:
            # +30s pad so log lines written just before the POST are included.
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(2)
        if not found:
            print(f" → [{FAIL}] timeout")
            recall_results.append((question, keywords, None, False))
            continue
        reply_text = (found.get("reply_text") or "").lower()
        # A question passes only if every expected keyword appears in the reply.
        missing = [kw for kw in keywords if kw.lower() not in reply_text]
        passed = not missing
        tag_str = PASS if passed else WARN
        detail = f"tier={found['tier']} lat={found['reply_total']:.1f}s"
        if missing:
            detail += f" missing keywords: {missing}"
        print(f" → [{tag_str}] {detail}")
        recall_results.append((question, keywords, found.get("reply_text"), passed))
        time.sleep(1)
    # Summary
    print(f"\n {'#':<4} {'Pass':<5} {'Question':<45} {'Keywords'}")
    print(f" {'─'*4} {'─'*5} {'─'*45} {'─'*30}")
    for idx, (q, kws, reply, ok) in enumerate(recall_results, 1):
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {'yes' if ok else 'no':<5} {q[:45]:<45} {kws}")
    recall_pass = sum(1 for _, _, _, ok in recall_results if ok)
    total_recall = len(recall_results)
    print(f"\n Memory recall score: {recall_pass}/{total_recall}")
    report(f"Memory recall ({recall_pass}/{total_recall} keywords found)",
           recall_pass == total_recall,
           f"{recall_pass}/{total_recall} questions had all expected keywords in reply")
# ── 14. Deduplication test — same fact stored twice must not grow Qdrant by 2 ─
if _run_memory:
    print(f"\n[{INFO}] 14. Memory deduplication test")
    print(f" Sends the same fact twice — Qdrant point count must not increase by 2")
    print(f" Chat ID: {CHAT_ID}")
    print()
    DEDUP_TIMEOUT = 120  # seconds to wait for each send to show up in the logs
    # Random number keeps the fact unique per run, so a memory left over from
    # an earlier run cannot trigger a spurious dedup NOOP.
    _dedup_fact = f"My lucky number is {random.randint(1000, 9999)}"
    print(f" Fact: {_dedup_fact!r}")

    def _qdrant_count_dedup():
        """Return the current points_count of adolf_memories; 0 if Qdrant is unreachable."""
        try:
            _, body = get(f"{QDRANT}/collections/adolf_memories")
            return json.loads(body).get("result", {}).get("points_count", 0)
        except Exception:
            return 0

    pts_before = _qdrant_count_dedup()
    print(f" Qdrant points before: {pts_before}")
    # Send fact the first time
    print(f" [dedup-1] sending fact (first time)")
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
        if status != 202:
            report("Dedup: first POST accepted", False, f"status={status}")
        else:
            found1 = wait_for("dedup-1", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
            if found1:
                print(f" [dedup-1] stored tier={found1['tier']} mem={found1['memory_s']}s")
            else:
                print(f" [dedup-1] timeout")
    except Exception as e:
        report("Dedup: first POST accepted", False, str(e))
        found1 = None
    pts_after_first = _qdrant_count_dedup()
    new_first = pts_after_first - pts_before
    print(f" Qdrant after first send: {pts_before} → {pts_after_first} (+{new_first})")
    # Send exact same fact again
    print(f" [dedup-2] sending same fact (second time)")
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
        if status != 202:
            report("Dedup: second POST accepted", False, f"status={status}")
        else:
            found2 = wait_for("dedup-2", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
            if found2:
                print(f" [dedup-2] stored tier={found2['tier']} mem={found2['memory_s']}s")
            else:
                print(f" [dedup-2] timeout")
    except Exception as e:
        report("Dedup: second POST accepted", False, str(e))
    pts_after_second = _qdrant_count_dedup()
    new_second = pts_after_second - pts_after_first
    print(f" Qdrant after second send: {pts_after_first} → {pts_after_second} (+{new_second})")
    # Pass: second store added no MORE points than the first (NOOP or UPDATE, not ADD)
    # If first send stored 0 points (fact too trivial), dedup is vacuously satisfied.
    dedup_ok = new_second <= new_first
    report(
        "Deduplication: second identical fact not added to Qdrant",
        dedup_ok,
        f"first send: +{new_first} pts, second send: +{new_second} pts (want second ≤ first)",
    )
# ── summary ───────────────────────────────────────────────────────────────────
# Horizontal rule before the final pass/fail tally of all recorded results.
print(f"\n{'─'*55}")
total = len(results)