wiki search people tested pipeline
This commit is contained in:
305
test_pipeline.py
305
test_pipeline.py
@@ -13,16 +13,19 @@ Tests:
|
||||
8. Name recall — "what is your name?" → reply contains <RandomName>
|
||||
9. Timing profile + bottleneck report
|
||||
10. Easy benchmark — 10 easy questions → all must route to light
|
||||
11. Medium benchmark — 10 medium questions → must route to medium (or light, never complex)
|
||||
11. Medium benchmark — 11 medium questions → must route to medium (or light, never complex)
|
||||
12. Hard benchmark — 10 /think questions → all must route to complex; VRAM flush verified
|
||||
13. Memory benchmark — store 5 facts, recall with 10 questions → verify keyword presence
|
||||
14. Dedup test — same fact sent twice → Qdrant must not grow by 2
|
||||
|
||||
Usage:
|
||||
python3 test_pipeline.py [--chat-id CHAT_ID]
|
||||
[--bench-only] skip sections 1-9, run 10+11+12
|
||||
[--easy-only] skip 1-9 and 11+12, run only section 10
|
||||
[--medium-only] skip 1-9 and 10+12, run only section 11
|
||||
[--hard-only] skip 1-9 and 10+11, run only section 12
|
||||
[--no-bench] skip sections 10-12
|
||||
[--bench-only] skip sections 1-9, run 10+11+12+13
|
||||
[--easy-only] skip 1-9 and 11+12+13, run only section 10
|
||||
[--medium-only] skip 1-9 and 10+12+13, run only section 11
|
||||
[--hard-only] skip 1-9 and 10+11+13, run only section 12
|
||||
[--memory-only] skip 1-9 and 10+11+12, run only section 13
|
||||
[--no-bench] skip sections 10-13
|
||||
|
||||
Timing is extracted from deepagents container logs, not estimated from sleeps.
|
||||
"""
|
||||
@@ -79,6 +82,7 @@ BENCHMARK = {
|
||||
"do you remember what we talked about before?",
|
||||
"search for the best coffee shops in Tokyo",
|
||||
"what is happening in the tech industry this week?",
|
||||
"what's the weather like today?",
|
||||
],
|
||||
"hard": [
|
||||
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
|
||||
@@ -187,18 +191,13 @@ def parse_run_block(lines, msg_prefix):
|
||||
reply_data = None
|
||||
|
||||
for j, line in enumerate(block):
|
||||
# Track last non-tool AIMessage (the final reply)
|
||||
# Track last non-tool AIMessage (the final reply) — truncated at 150 chars in logs,
|
||||
# used only as fallback if reply_text line is absent (older server versions)
|
||||
if "AIMessage:" in line and "→" not in line:
|
||||
txt = line.split("AIMessage:", 1)[-1].strip()
|
||||
if txt:
|
||||
last_ai_text = txt
|
||||
|
||||
# For light tier: router reply is stored in _conversation_buffers directly
|
||||
# so there may be no AIMessage log — grab from tier=light line
|
||||
if "[agent] tier=light" in line and "message=" in line:
|
||||
# Extract preview text logged elsewhere if available
|
||||
pass
|
||||
|
||||
m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
|
||||
if m:
|
||||
# Extract optional tier tag at end of line
|
||||
@@ -209,13 +208,21 @@ def parse_run_block(lines, msg_prefix):
|
||||
"llm": float(m.group(2)),
|
||||
"send": float(m.group(3)),
|
||||
"tier": tier,
|
||||
"reply_text": last_ai_text,
|
||||
"reply_text": last_ai_text, # may be overwritten by reply_text line below
|
||||
"memory_s": None,
|
||||
"memory_error": False,
|
||||
"_j": j,
|
||||
}
|
||||
break
|
||||
|
||||
# Read full reply_text from dedicated log line (written immediately after replied-in line)
|
||||
if reply_data is not None:
|
||||
next_lines = block[reply_data["_j"] + 1: reply_data["_j"] + 3]
|
||||
for line in next_lines:
|
||||
if line.startswith("[agent] reply_text:"):
|
||||
reply_data["reply_text"] = line[len("[agent] reply_text:"):].strip()
|
||||
break
|
||||
|
||||
if reply_data is None:
|
||||
return None # reply not in logs yet
|
||||
|
||||
@@ -281,16 +288,19 @@ parser.add_argument("--medium-only", action="store_true",
|
||||
help="Skip sections 1-9 and 10, run only section 11 (medium benchmark)")
|
||||
parser.add_argument("--hard-only", action="store_true",
|
||||
help="Skip sections 1-9 and 10+11, run only section 12 (hard benchmark)")
|
||||
parser.add_argument("--memory-only", action="store_true",
|
||||
help="Skip sections 1-9 and 10+11+12, run only section 13 (memory benchmark)")
|
||||
parser.add_argument("--no-bench", action="store_true",
|
||||
help="Skip sections 10-12 (all benchmarks)")
|
||||
help="Skip sections 10-13 (all benchmarks)")
|
||||
args = parser.parse_args()
|
||||
CHAT_ID = args.chat_id
|
||||
|
||||
# Derived flags for readability
|
||||
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only
|
||||
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only
|
||||
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only
|
||||
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only
|
||||
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only or args.memory_only
|
||||
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only and not args.memory_only
|
||||
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only and not args.memory_only
|
||||
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only and not args.memory_only
|
||||
_run_memory = not args.no_bench and not args.easy_only and not args.medium_only and not args.hard_only
|
||||
|
||||
random_name = random.choice(NAMES)
|
||||
|
||||
@@ -880,6 +890,263 @@ if _run_hard:
|
||||
)
|
||||
|
||||
|
||||
# ── 13. Memory benchmark — store facts, recall with keyword verification ───────
|
||||
if _run_memory:
|
||||
_mem_name = random.choice([
|
||||
"Alice", "Bruno", "Camille", "Diego", "Elena",
|
||||
"Farid", "Greta", "Hiroshi", "Irina", "Jonas",
|
||||
])
|
||||
_mem_city = random.choice([
|
||||
"Tokyo", "Berlin", "Cairo", "Sydney", "Oslo",
|
||||
"Nairobi", "Lisbon", "Seoul", "Montreal", "Bangkok",
|
||||
])
|
||||
_mem_allergy = random.choice(["nuts", "gluten", "dairy", "shellfish", "eggs"])
|
||||
_mem_job = random.choice([
|
||||
("software engineer", "startup"),
|
||||
("data scientist", "research lab"),
|
||||
("product manager", "tech company"),
|
||||
("DevOps engineer", "cloud provider"),
|
||||
])
|
||||
_mem_lang = random.choice(["Python", "Rust", "Go", "TypeScript", "Kotlin"])
|
||||
_mem_pet_name = random.choice([
|
||||
"Whiskers", "Biscuit", "Mango", "Pebble", "Shadow",
|
||||
"Noodle", "Cheddar", "Cosmo", "Pippin", "Ziggy",
|
||||
])
|
||||
|
||||
print(f"\n[{INFO}] 13. Memory benchmark")
|
||||
print(f" name={_mem_name} city={_mem_city} allergy={_mem_allergy} "
|
||||
f"job={_mem_job[0]}@{_mem_job[1]} lang={_mem_lang} pet={_mem_pet_name}")
|
||||
print(f" Storing 5 facts, then querying with 10 recall questions")
|
||||
print(f" Chat ID: {CHAT_ID}")
|
||||
print()
|
||||
|
||||
# Wipe Qdrant collection and restart openmemory to eliminate stale data interference.
|
||||
# Deleting the collection alone causes 404s — openmemory holds a live reference to it.
|
||||
try:
|
||||
import urllib.request as _ur
|
||||
_req = _ur.Request(f"{QDRANT}/collections/adolf_memories", method="DELETE")
|
||||
with _ur.urlopen(_req, timeout=5):
|
||||
pass
|
||||
print(f" [{INFO}] Wiped adolf_memories collection")
|
||||
except Exception as e:
|
||||
print(f" [{WARN}] Could not wipe collection: {e}")
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
["docker", "compose", "-f", COMPOSE_FILE, "restart", "openmemory"],
|
||||
capture_output=True, timeout=30,
|
||||
)
|
||||
time.sleep(6) # wait for openmemory to reinitialize and recreate collection
|
||||
print(f" [{INFO}] Restarted openmemory — fresh collection ready")
|
||||
except Exception as e:
|
||||
print(f" [{WARN}] Could not restart openmemory: {e}")
|
||||
|
||||
MEMORY_FACTS = [
|
||||
f"My name is {_mem_name} and I live in {_mem_city}",
|
||||
f"I prefer vegetarian food and I'm allergic to {_mem_allergy}",
|
||||
f"I work as a {_mem_job[0]} at a {_mem_job[1]}",
|
||||
f"My favorite programming language is {_mem_lang}",
|
||||
f"I have a cat named {_mem_pet_name}",
|
||||
]
|
||||
|
||||
MEMORY_RECALLS = [
|
||||
# (question, [keywords that must appear in reply])
|
||||
("What is my name?", [_mem_name.lower()]),
|
||||
("Where do I live?", [_mem_city.lower()]),
|
||||
("Do I have any food allergies?", [_mem_allergy.lower()]),
|
||||
("What is my job?", [_mem_job[0].split()[0].lower()]),
|
||||
("What programming language do I prefer?", [_mem_lang.lower()]),
|
||||
("Do I have any pets?", [_mem_pet_name.lower()]),
|
||||
("Am I vegetarian or do I eat meat?", ["vegetarian"]),
|
||||
("What city am I in?", [_mem_city.lower()]),
|
||||
("Tell me what you know about me", [_mem_name.lower(), _mem_city.lower()]),
|
||||
("What's the name of my pet?", [_mem_pet_name.lower()]),
|
||||
]
|
||||
|
||||
MEMORY_STORE_TIMEOUT = 180 # seconds per fact
|
||||
MEMORY_RECALL_TIMEOUT = 180 # seconds per question
|
||||
|
||||
# ── Store facts ──────────────────────────────────────────────────────────
|
||||
print(f" Storing {len(MEMORY_FACTS)} facts...")
|
||||
store_ok = 0
|
||||
for i, fact in enumerate(MEMORY_FACTS, 1):
|
||||
print(f" [mem-store-{i:02d}] {fact!r}")
|
||||
try:
|
||||
status, _ = post_json(f"{DEEPAGENTS}/chat",
|
||||
{"message": fact, "chat_id": CHAT_ID}, timeout=5)
|
||||
if status != 202:
|
||||
print(f" → [{FAIL}] POST returned {status}")
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f" → [{FAIL}] POST error: {e}")
|
||||
continue
|
||||
|
||||
found = wait_for(f"mem-store-{i:02d}", fact, timeout_s=MEMORY_STORE_TIMEOUT, need_memory=True)
|
||||
if found:
|
||||
store_ok += 1
|
||||
print(f" → [{PASS}] stored tier={found['tier']} mem={found['memory_s']}s")
|
||||
else:
|
||||
print(f" → [{FAIL}] timeout")
|
||||
|
||||
report(f"All memory facts stored ({store_ok}/{len(MEMORY_FACTS)})",
|
||||
store_ok == len(MEMORY_FACTS))
|
||||
|
||||
# Wait for async memory extraction to settle — poll Qdrant until point count stabilises
|
||||
print(f"\n Waiting for memory extraction to settle (up to 60s)...")
|
||||
_prev_count = -1
|
||||
_stable_ticks = 0
|
||||
for _ in range(30):
|
||||
time.sleep(2)
|
||||
try:
|
||||
_, body = get(f"{QDRANT}/collections/adolf_memories")
|
||||
_cur_count = json.loads(body).get("result", {}).get("points_count", 0)
|
||||
except Exception:
|
||||
_cur_count = _prev_count
|
||||
if _cur_count == _prev_count:
|
||||
_stable_ticks += 1
|
||||
if _stable_ticks >= 3: # stable for 6s
|
||||
break
|
||||
else:
|
||||
_stable_ticks = 0
|
||||
_prev_count = _cur_count
|
||||
print(f" Memory settled: {_cur_count} points in Qdrant")
|
||||
|
||||
# ── Recall questions ─────────────────────────────────────────────────────
|
||||
print(f"\n Querying with {len(MEMORY_RECALLS)} recall questions...")
|
||||
recall_results = [] # (question, keywords, reply_text, passed)
|
||||
|
||||
for i, (question, keywords) in enumerate(MEMORY_RECALLS, 1):
|
||||
print(f" [mem-recall-{i:02d}] {question!r}")
|
||||
|
||||
try:
|
||||
status, _ = post_json(f"{DEEPAGENTS}/chat",
|
||||
{"message": question, "chat_id": CHAT_ID}, timeout=5)
|
||||
if status != 202:
|
||||
print(f" → [{FAIL}] POST returned {status}")
|
||||
recall_results.append((question, keywords, None, False))
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f" → [{FAIL}] POST error: {e}")
|
||||
recall_results.append((question, keywords, None, False))
|
||||
continue
|
||||
|
||||
t_start = time.monotonic()
|
||||
found = None
|
||||
while time.monotonic() - t_start < MEMORY_RECALL_TIMEOUT:
|
||||
since = int(time.monotonic() - t_start) + 30
|
||||
lines = fetch_logs(since_s=since)
|
||||
found = parse_run_block(lines, question)
|
||||
if found:
|
||||
break
|
||||
time.sleep(2)
|
||||
|
||||
if not found:
|
||||
print(f" → [{FAIL}] timeout")
|
||||
recall_results.append((question, keywords, None, False))
|
||||
continue
|
||||
|
||||
reply_text = (found.get("reply_text") or "").lower()
|
||||
hit_keywords = [kw for kw in keywords if kw.lower() in reply_text]
|
||||
passed = len(hit_keywords) == len(keywords)
|
||||
tag_str = PASS if passed else WARN
|
||||
missing = [kw for kw in keywords if kw.lower() not in reply_text]
|
||||
detail = f"tier={found['tier']} lat={found['reply_total']:.1f}s"
|
||||
if missing:
|
||||
detail += f" missing keywords: {missing}"
|
||||
print(f" → [{tag_str}] {detail}")
|
||||
recall_results.append((question, keywords, found.get("reply_text"), passed))
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# Summary
|
||||
print(f"\n {'#':<4} {'Pass':<5} {'Question':<45} {'Keywords'}")
|
||||
print(f" {'─'*4} {'─'*5} {'─'*45} {'─'*30}")
|
||||
for idx, (q, kws, reply, ok) in enumerate(recall_results, 1):
|
||||
ok_str = "✓" if ok else "✗"
|
||||
print(f" {ok_str} {idx:<3} {'yes' if ok else 'no':<5} {q[:45]:<45} {kws}")
|
||||
|
||||
recall_pass = sum(1 for _, _, _, ok in recall_results if ok)
|
||||
total_recall = len(recall_results)
|
||||
print(f"\n Memory recall score: {recall_pass}/{total_recall}")
|
||||
|
||||
report(f"Memory recall ({recall_pass}/{total_recall} keywords found)",
|
||||
recall_pass == total_recall,
|
||||
f"{recall_pass}/{total_recall} questions had all expected keywords in reply")
|
||||
|
||||
|
||||
# ── 14. Deduplication test — same fact stored twice must not grow Qdrant by 2 ─
|
||||
if _run_memory:
|
||||
print(f"\n[{INFO}] 14. Memory deduplication test")
|
||||
print(f" Sends the same fact twice — Qdrant point count must not increase by 2")
|
||||
print(f" Chat ID: {CHAT_ID}")
|
||||
print()
|
||||
|
||||
DEDUP_TIMEOUT = 120
|
||||
|
||||
_dedup_fact = f"My lucky number is {random.randint(1000, 9999)}"
|
||||
print(f" Fact: {_dedup_fact!r}")
|
||||
|
||||
def _qdrant_count_dedup():
|
||||
try:
|
||||
_, body = get(f"{QDRANT}/collections/adolf_memories")
|
||||
return json.loads(body).get("result", {}).get("points_count", 0)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
pts_before = _qdrant_count_dedup()
|
||||
print(f" Qdrant points before: {pts_before}")
|
||||
|
||||
# Send fact the first time
|
||||
print(f" [dedup-1] sending fact (first time)")
|
||||
try:
|
||||
status, _ = post_json(f"{DEEPAGENTS}/chat",
|
||||
{"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
|
||||
if status != 202:
|
||||
report("Dedup: first POST accepted", False, f"status={status}")
|
||||
else:
|
||||
found1 = wait_for("dedup-1", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
|
||||
if found1:
|
||||
print(f" [dedup-1] stored tier={found1['tier']} mem={found1['memory_s']}s")
|
||||
else:
|
||||
print(f" [dedup-1] timeout")
|
||||
except Exception as e:
|
||||
report("Dedup: first POST accepted", False, str(e))
|
||||
found1 = None
|
||||
|
||||
pts_after_first = _qdrant_count_dedup()
|
||||
new_first = pts_after_first - pts_before
|
||||
print(f" Qdrant after first send: {pts_before} → {pts_after_first} (+{new_first})")
|
||||
|
||||
# Send exact same fact again
|
||||
print(f" [dedup-2] sending same fact (second time)")
|
||||
try:
|
||||
status, _ = post_json(f"{DEEPAGENTS}/chat",
|
||||
{"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
|
||||
if status != 202:
|
||||
report("Dedup: second POST accepted", False, f"status={status}")
|
||||
else:
|
||||
found2 = wait_for("dedup-2", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
|
||||
if found2:
|
||||
print(f" [dedup-2] stored tier={found2['tier']} mem={found2['memory_s']}s")
|
||||
else:
|
||||
print(f" [dedup-2] timeout")
|
||||
except Exception as e:
|
||||
report("Dedup: second POST accepted", False, str(e))
|
||||
|
||||
pts_after_second = _qdrant_count_dedup()
|
||||
new_second = pts_after_second - pts_after_first
|
||||
print(f" Qdrant after second send: {pts_after_first} → {pts_after_second} (+{new_second})")
|
||||
|
||||
# Pass: second store added no MORE points than the first (NOOP or UPDATE, not ADD)
|
||||
# If first send stored 0 points (fact too trivial), dedup is vacuously satisfied.
|
||||
dedup_ok = new_second <= new_first
|
||||
report(
|
||||
"Deduplication: second identical fact not added to Qdrant",
|
||||
dedup_ok,
|
||||
f"first send: +{new_first} pts, second send: +{new_second} pts (want second ≤ first)",
|
||||
)
|
||||
|
||||
|
||||
# ── summary ───────────────────────────────────────────────────────────────────
|
||||
print(f"\n{'─'*55}")
|
||||
total = len(results)
|
||||
|
||||
Reference in New Issue
Block a user