wiki search people tested pipeline

This commit is contained in:
Alvis
2026-03-05 11:22:34 +00:00
parent ea77b2308b
commit ec45d255f0
19 changed files with 1717 additions and 257 deletions

View File

@@ -13,16 +13,19 @@ Tests:
8. Name recall — "what is your name?" → reply contains <RandomName>
9. Timing profile + bottleneck report
10. Easy benchmark — 10 easy questions → all must route to light
11. Medium benchmark — 10 medium questions → must route to medium (or light, never complex)
11. Medium benchmark — 11 medium questions → must route to medium (or light, never complex)
12. Hard benchmark — 10 /think questions → all must route to complex; VRAM flush verified
13. Memory benchmark — store 5 facts, recall with 10 questions → verify keyword presence
14. Dedup test — same fact sent twice → Qdrant must not grow by 2
Usage:
python3 test_pipeline.py [--chat-id CHAT_ID]
[--bench-only] skip sections 1-9, run 10+11+12
[--easy-only] skip 1-9 and 11+12, run only section 10
[--medium-only] skip 1-9 and 10+12, run only section 11
[--hard-only] skip 1-9 and 10+11, run only section 12
[--no-bench] skip sections 10-12
[--bench-only] skip sections 1-9, run 10+11+12+13
[--easy-only] skip 1-9 and 11+12+13, run only section 10
[--medium-only] skip 1-9 and 10+12+13, run only section 11
[--hard-only] skip 1-9 and 10+11+13, run only section 12
[--memory-only] skip 1-9 and 10+11+12, run only section 13
[--no-bench] skip sections 10-13
Timing is extracted from deepagents container logs, not estimated from sleeps.
"""
@@ -79,6 +82,7 @@ BENCHMARK = {
"do you remember what we talked about before?",
"search for the best coffee shops in Tokyo",
"what is happening in the tech industry this week?",
"what's the weather like today?",
],
"hard": [
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
@@ -187,18 +191,13 @@ def parse_run_block(lines, msg_prefix):
reply_data = None
for j, line in enumerate(block):
# Track last non-tool AIMessage (the final reply)
# Track last non-tool AIMessage (the final reply) — truncated at 150 chars in logs,
# used only as fallback if reply_text line is absent (older server versions)
if "AIMessage:" in line and "" not in line:
txt = line.split("AIMessage:", 1)[-1].strip()
if txt:
last_ai_text = txt
# For light tier: router reply is stored in _conversation_buffers directly
# so there may be no AIMessage log — grab from tier=light line
if "[agent] tier=light" in line and "message=" in line:
# Extract preview text logged elsewhere if available
pass
m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
if m:
# Extract optional tier tag at end of line
@@ -209,13 +208,21 @@ def parse_run_block(lines, msg_prefix):
"llm": float(m.group(2)),
"send": float(m.group(3)),
"tier": tier,
"reply_text": last_ai_text,
"reply_text": last_ai_text, # may be overwritten by reply_text line below
"memory_s": None,
"memory_error": False,
"_j": j,
}
break
# Read full reply_text from dedicated log line (written immediately after replied-in line)
if reply_data is not None:
next_lines = block[reply_data["_j"] + 1: reply_data["_j"] + 3]
for line in next_lines:
if line.startswith("[agent] reply_text:"):
reply_data["reply_text"] = line[len("[agent] reply_text:"):].strip()
break
if reply_data is None:
return None # reply not in logs yet
@@ -281,16 +288,19 @@ parser.add_argument("--medium-only", action="store_true",
help="Skip sections 1-9 and 10, run only section 11 (medium benchmark)")
parser.add_argument("--hard-only", action="store_true",
help="Skip sections 1-9 and 10+11, run only section 12 (hard benchmark)")
parser.add_argument("--memory-only", action="store_true",
help="Skip sections 1-9 and 10+11+12, run only section 13 (memory benchmark)")
parser.add_argument("--no-bench", action="store_true",
help="Skip sections 10-12 (all benchmarks)")
help="Skip sections 10-13 (all benchmarks)")
args = parser.parse_args()
CHAT_ID = args.chat_id
# Derived flags for readability
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only or args.memory_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only and not args.memory_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only and not args.memory_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only and not args.memory_only
_run_memory = not args.no_bench and not args.easy_only and not args.medium_only and not args.hard_only
random_name = random.choice(NAMES)
@@ -880,6 +890,263 @@ if _run_hard:
)
# ── 13. Memory benchmark — store facts, recall with keyword verification ───────
if _run_memory:
    # Every fact is randomised per run so memories left over from a previous
    # run can never satisfy this run's recall questions.
    _mem_name = random.choice([
        "Alice", "Bruno", "Camille", "Diego", "Elena",
        "Farid", "Greta", "Hiroshi", "Irina", "Jonas",
    ])
    _mem_city = random.choice([
        "Tokyo", "Berlin", "Cairo", "Sydney", "Oslo",
        "Nairobi", "Lisbon", "Seoul", "Montreal", "Bangkok",
    ])
    _mem_allergy = random.choice(["nuts", "gluten", "dairy", "shellfish", "eggs"])
    _mem_job = random.choice([
        ("software engineer", "startup"),
        ("data scientist", "research lab"),
        ("product manager", "tech company"),
        ("DevOps engineer", "cloud provider"),
    ])
    _mem_lang = random.choice(["Python", "Rust", "Go", "TypeScript", "Kotlin"])
    _mem_pet_name = random.choice([
        "Whiskers", "Biscuit", "Mango", "Pebble", "Shadow",
        "Noodle", "Cheddar", "Cosmo", "Pippin", "Ziggy",
    ])
    print(f"\n[{INFO}] 13. Memory benchmark")
    print(f" name={_mem_name} city={_mem_city} allergy={_mem_allergy} "
          f"job={_mem_job[0]}@{_mem_job[1]} lang={_mem_lang} pet={_mem_pet_name}")
    print(f" Storing 5 facts, then querying with 10 recall questions")
    print(f" Chat ID: {CHAT_ID}")
    print()
    # Wipe Qdrant collection and restart openmemory to eliminate stale data interference.
    # Deleting the collection alone causes 404s — openmemory holds a live reference to it.
    try:
        import urllib.request as _ur
        _req = _ur.Request(f"{QDRANT}/collections/adolf_memories", method="DELETE")
        with _ur.urlopen(_req, timeout=5):
            pass
        print(f" [{INFO}] Wiped adolf_memories collection")
    except Exception as e:
        # Best-effort: a failed wipe only risks stale-memory false positives.
        print(f" [{WARN}] Could not wipe collection: {e}")
    try:
        subprocess.run(
            ["docker", "compose", "-f", COMPOSE_FILE, "restart", "openmemory"],
            capture_output=True, timeout=30,
        )
        time.sleep(6)  # wait for openmemory to reinitialize and recreate collection
        print(f" [{INFO}] Restarted openmemory — fresh collection ready")
    except Exception as e:
        print(f" [{WARN}] Could not restart openmemory: {e}")
    MEMORY_FACTS = [
        f"My name is {_mem_name} and I live in {_mem_city}",
        f"I prefer vegetarian food and I'm allergic to {_mem_allergy}",
        f"I work as a {_mem_job[0]} at a {_mem_job[1]}",
        f"My favorite programming language is {_mem_lang}",
        f"I have a cat named {_mem_pet_name}",
    ]
    MEMORY_RECALLS = [
        # (question, [keywords that must appear in reply])
        ("What is my name?", [_mem_name.lower()]),
        ("Where do I live?", [_mem_city.lower()]),
        ("Do I have any food allergies?", [_mem_allergy.lower()]),
        ("What is my job?", [_mem_job[0].split()[0].lower()]),
        ("What programming language do I prefer?", [_mem_lang.lower()]),
        ("Do I have any pets?", [_mem_pet_name.lower()]),
        ("Am I vegetarian or do I eat meat?", ["vegetarian"]),
        ("What city am I in?", [_mem_city.lower()]),
        ("Tell me what you know about me", [_mem_name.lower(), _mem_city.lower()]),
        ("What's the name of my pet?", [_mem_pet_name.lower()]),
    ]
    MEMORY_STORE_TIMEOUT = 180   # seconds per fact
    MEMORY_RECALL_TIMEOUT = 180  # seconds per question
    # ── Store facts ──────────────────────────────────────────────────────────
    print(f" Storing {len(MEMORY_FACTS)} facts...")
    store_ok = 0
    for i, fact in enumerate(MEMORY_FACTS, 1):
        print(f" [mem-store-{i:02d}] {fact!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": fact, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            continue
        # need_memory=True: the run only counts once the memory write is logged.
        found = wait_for(f"mem-store-{i:02d}", fact, timeout_s=MEMORY_STORE_TIMEOUT, need_memory=True)
        if found:
            store_ok += 1
            print(f" → [{PASS}] stored tier={found['tier']} mem={found['memory_s']}s")
        else:
            print(f" → [{FAIL}] timeout")
    report(f"All memory facts stored ({store_ok}/{len(MEMORY_FACTS)})",
           store_ok == len(MEMORY_FACTS))
    # Wait for async memory extraction to settle — poll Qdrant until point count stabilises
    print(f"\n Waiting for memory extraction to settle (up to 60s)...")
    _prev_count = -1
    _stable_ticks = 0
    for _ in range(30):
        time.sleep(2)
        try:
            _, body = get(f"{QDRANT}/collections/adolf_memories")
            _cur_count = json.loads(body).get("result", {}).get("points_count", 0)
        except Exception:
            _cur_count = _prev_count  # transient Qdrant error: treat as "no change"
        if _cur_count == _prev_count:
            _stable_ticks += 1
            if _stable_ticks >= 3:  # stable for 6s
                break
        else:
            _stable_ticks = 0
        _prev_count = _cur_count
    print(f" Memory settled: {_cur_count} points in Qdrant")
    # ── Recall questions ─────────────────────────────────────────────────────
    print(f"\n Querying with {len(MEMORY_RECALLS)} recall questions...")
    recall_results = []  # (question, keywords, reply_text, passed)
    for i, (question, keywords) in enumerate(MEMORY_RECALLS, 1):
        print(f" [mem-recall-{i:02d}] {question!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                recall_results.append((question, keywords, None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            recall_results.append((question, keywords, None, False))
            continue
        # Poll container logs for the reply to this exact question.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEMORY_RECALL_TIMEOUT:
            # +30s pad so log lines written just before the POST are included.
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(2)
        if not found:
            print(f" → [{FAIL}] timeout")
            recall_results.append((question, keywords, None, False))
            continue
        reply_text = (found.get("reply_text") or "").lower()
        # A question passes only if every expected keyword appears in the reply.
        missing = [kw for kw in keywords if kw.lower() not in reply_text]
        passed = not missing
        tag_str = PASS if passed else WARN
        detail = f"tier={found['tier']} lat={found['reply_total']:.1f}s"
        if missing:
            detail += f" missing keywords: {missing}"
        print(f" → [{tag_str}] {detail}")
        recall_results.append((question, keywords, found.get("reply_text"), passed))
        time.sleep(1)
    # Summary
    print(f"\n {'#':<4} {'Pass':<5} {'Question':<45} {'Keywords'}")
    print(f" {'─'*4} {'─'*5} {'─'*45} {'─'*30}")
    for idx, (q, kws, reply, ok) in enumerate(recall_results, 1):
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {'yes' if ok else 'no':<5} {q[:45]:<45} {kws}")
    recall_pass = sum(1 for _, _, _, ok in recall_results if ok)
    total_recall = len(recall_results)
    print(f"\n Memory recall score: {recall_pass}/{total_recall}")
    report(f"Memory recall ({recall_pass}/{total_recall} keywords found)",
           recall_pass == total_recall,
           f"{recall_pass}/{total_recall} questions had all expected keywords in reply")
# ── 14. Deduplication test — same fact stored twice must not grow Qdrant by 2 ─
if _run_memory:
    print(f"\n[{INFO}] 14. Memory deduplication test")
    print(f" Sends the same fact twice — Qdrant point count must not increase by 2")
    print(f" Chat ID: {CHAT_ID}")
    print()
    DEDUP_TIMEOUT = 120  # seconds to wait for each send to show up in the logs
    # Random number keeps the fact unique per run, so a memory left over from
    # an earlier run cannot trigger a spurious dedup NOOP.
    _dedup_fact = f"My lucky number is {random.randint(1000, 9999)}"
    print(f" Fact: {_dedup_fact!r}")

    def _qdrant_count_dedup():
        """Return the current points_count of adolf_memories; 0 if Qdrant is unreachable."""
        try:
            _, body = get(f"{QDRANT}/collections/adolf_memories")
            return json.loads(body).get("result", {}).get("points_count", 0)
        except Exception:
            return 0

    pts_before = _qdrant_count_dedup()
    print(f" Qdrant points before: {pts_before}")
    # Send fact the first time
    print(f" [dedup-1] sending fact (first time)")
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
        if status != 202:
            report("Dedup: first POST accepted", False, f"status={status}")
        else:
            found1 = wait_for("dedup-1", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
            if found1:
                print(f" [dedup-1] stored tier={found1['tier']} mem={found1['memory_s']}s")
            else:
                print(f" [dedup-1] timeout")
    except Exception as e:
        report("Dedup: first POST accepted", False, str(e))
        found1 = None
    pts_after_first = _qdrant_count_dedup()
    new_first = pts_after_first - pts_before
    print(f" Qdrant after first send: {pts_before} → {pts_after_first} (+{new_first})")
    # Send exact same fact again
    print(f" [dedup-2] sending same fact (second time)")
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": _dedup_fact, "chat_id": CHAT_ID}, timeout=5)
        if status != 202:
            report("Dedup: second POST accepted", False, f"status={status}")
        else:
            found2 = wait_for("dedup-2", _dedup_fact, timeout_s=DEDUP_TIMEOUT, need_memory=True)
            if found2:
                print(f" [dedup-2] stored tier={found2['tier']} mem={found2['memory_s']}s")
            else:
                print(f" [dedup-2] timeout")
    except Exception as e:
        report("Dedup: second POST accepted", False, str(e))
    pts_after_second = _qdrant_count_dedup()
    new_second = pts_after_second - pts_after_first
    print(f" Qdrant after second send: {pts_after_first} → {pts_after_second} (+{new_second})")
    # Pass: second store added no MORE points than the first (NOOP or UPDATE, not ADD)
    # If first send stored 0 points (fact too trivial), dedup is vacuously satisfied.
    dedup_ok = new_second <= new_first
    report(
        "Deduplication: second identical fact not added to Qdrant",
        dedup_ok,
        f"first send: +{new_first} pts, second send: +{new_second} pts (want second ≤ first)",
    )
# ── summary ───────────────────────────────────────────────────────────────────
# Horizontal rule before the final pass/fail tally of all recorded results.
print(f"\n{'─'*55}")
total = len(results)