Split monolithic test_pipeline.py into focused integration test scripts
- common.py: shared config, URL constants, benchmark questions, and all helpers (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy-only/--medium-only/--hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
317
tests/integration/test_routing.py
Normal file
317
tests/integration/test_routing.py
Normal file
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
"""
Adolf tier routing benchmark.

Tests:
  easy   — 10 questions that must route to 'light' tier
  medium — 11 questions that must route to 'medium' (light acceptable for some; complex = fail)
  hard   — 10 /think questions that must route to 'complex' (medium fallback acceptable)

Usage:
  python3 test_routing.py [--chat-id CHAT_ID]
                          [--easy-only]    # only easy benchmark
                          [--medium-only]  # only medium benchmark
                          [--hard-only]    # only hard benchmark
"""

import argparse
import sys
import time

from common import (
    DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
    BENCHMARK,
    INFO, PASS, FAIL, WARN,
    report, print_summary,
    post_json, fetch_logs,
    parse_run_block,
)

# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
parser.add_argument("--easy-only", action="store_true")
parser.add_argument("--medium-only", action="store_true")
parser.add_argument("--hard-only", action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id
# If any --*-only flag is given, run just the selected benchmark(s);
# with no flags, all three sections run.
_only = args.easy_only or args.medium_only or args.hard_only
_run_easy = not _only or args.easy_only
_run_medium = not _only or args.medium_only
_run_hard = not _only or args.hard_only

# Accumulates (description, passed) entries via report(); consumed by
# print_summary() and the final exit code at the bottom of the script.
results = []
# ── easy benchmark ────────────────────────────────────────────────────────────
# Fires each easy question at the agent and verifies it was answered via the
# 'light' tier. Replies are asynchronous: POST returns 202 immediately, and the
# outcome (tier + timings) is recovered by polling the service logs with
# parse_run_block(). Fix vs. original: dropped the unused `t_send` local.
if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()

    bench_results = []  # (question, tier, latency_s | None, passed)
    LIGHT_TIMEOUT = 60  # seconds to wait for a light-tier reply

    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")

        # Submit the question; anything other than 202 Accepted is a failure.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue

        # Poll the logs until this question's run block appears or we time out.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            # Look back slightly further than we have waited so the start of
            # the run is always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)

        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)  # brief gap so consecutive runs don't interleave in the logs

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    # Score: only questions that actually went via light count toward latency.
    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0

    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── medium benchmark ──────────────────────────────────────────────────────────
# Same async POST-then-poll flow as the easy benchmark, but with a three-way
# verdict: 'medium' is a pass, 'light' passes only for the whitelisted simple
# questions, and 'complex' is always a wrong escalation.
# Fix vs. original: dropped the unused `t_send` local.
if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Questions simple enough that a light-tier answer is also acceptable.
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }

    med_results = []      # (question, tier, latency_s | None, correct)
    MEDIUM_TIMEOUT = 120  # seconds to wait for a medium-tier reply

    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")

        # Submit the question; anything other than 202 Accepted is a failure.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue

        # Poll the logs until this question's run block appears or we time out.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            # Look back further than the elapsed wait so the run start is
            # always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 60
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)

        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE

        # Classify the routing decision.
        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)  # brief gap so consecutive runs don't interleave in the logs

    # Per-question results table ('~' marks light answers, pass or not).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    # Light answers outside the whitelist are misroutes (warned, not failed).
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]

    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Hard failure condition is any complex escalation; timeouts get their
    # own report line so they show up separately in the summary.
    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results, f"Medium questions: all completed within {MEDIUM_TIMEOUT}s", False,
               f"{timeout_count} question(s) timed out")
# ── hard benchmark ────────────────────────────────────────────────────────────
# /think questions must escalate to the 'complex' tier; 'medium' is tolerated
# as a VRAM-eviction fallback, while 'light' or a timeout is a hard failure.
# Fixes vs. original: removed the unused `_VRAM_EXIT` constant and hoisted the
# loop-invariant '/think '-stripping out of the log-polling loop.
if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()

    hard_results = []      # (question, tier, latency_s | None, ok)
    COMPLEX_TIMEOUT = 300  # seconds to wait for a complex-tier reply
    _VRAM_ENTER = "[vram] enter_complex_mode"  # log marker for the VRAM flush

    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        # Strip the '/think ' prefix once; the logs record the bare question.
        bare_q = question[len("/think "):].strip()
        print(f" [{tag}] /think {bare_q[:60]!r}")

        # Submit the full '/think ...' message; non-202 is a failure.
        t_send = time.monotonic()
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue

        # Poll the logs (coarse 5s interval — complex runs are slow).
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            # Look back further than the elapsed wait so the run start is
            # always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 90
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, bare_q)
            if found:
                break
            time.sleep(5)

        # Wall-clock since the POST, used to size the vram-log window below.
        elapsed = time.monotonic() - t_send

        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")

        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"

        # For true complex runs, confirm the VRAM-flush marker appeared in the
        # recent logs; its absence is flagged but does not fail the question.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent
        vram_note = ""
        if tier == "complex":
            vram_note = " [vram:flush✓]" if vram_enter_seen else f" [{WARN}:no vram flush log]"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)  # generous gap so back-to-back complex runs don't overlap

    # Per-question results table ('~' marks medium fallbacks).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]

    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Fail only on light routing or timeouts; medium fallbacks count as routed.
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           light_count == 0 and timeout_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}")
# ── summary ───────────────────────────────────────────────────────────────────
# Print the aggregated report and exit non-zero if any check failed, so CI
# pipelines can gate on this script.
print_summary(results)
exit_code = 0 if all(passed for _, passed in results) else 1
sys.exit(exit_code)
Reference in New Issue
Block a user