Split monolithic test_pipeline.py into focused integration test scripts
- common.py: shared config, URL constants, benchmark questions, and all helpers (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy-only/--medium-only/--hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
317
tests/integration/test_routing.py
Normal file
317
tests/integration/test_routing.py
Normal file
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
"""
Adolf tier routing benchmark.

Tests:
  easy   — 10 questions that must route to 'light' tier
  medium — 11 questions that must route to 'medium' (light acceptable for some; complex = fail)
  hard   — 10 /think questions that must route to 'complex' (medium fallback acceptable)

Usage:
  python3 test_routing.py [--chat-id CHAT_ID]
                          [--easy-only]    # only easy benchmark
                          [--medium-only]  # only medium benchmark
                          [--hard-only]    # only hard benchmark
"""

import argparse
import sys
import time

from common import (
    DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
    BENCHMARK,
    INFO, PASS, FAIL, WARN,
    report, print_summary,
    post_json, fetch_logs,
    parse_run_block,
)

# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
parser.add_argument("--easy-only", action="store_true")
parser.add_argument("--medium-only", action="store_true")
parser.add_argument("--hard-only", action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id
# If any --*-only flag is given, run just the selected benchmark(s);
# with no flags, all three sections run.
_only = args.easy_only or args.medium_only or args.hard_only
_run_easy = not _only or args.easy_only
_run_medium = not _only or args.medium_only
_run_hard = not _only or args.hard_only

# Accumulates (description, passed) entries via report(); consumed by
# print_summary() and the final exit code at the bottom of the script.
results = []
# ── easy benchmark ────────────────────────────────────────────────────────────
# Fires each easy question at the agent and verifies it was answered via the
# 'light' tier. Replies are asynchronous: POST returns 202 immediately, and the
# outcome (tier + timings) is recovered by polling the service logs with
# parse_run_block(). Fix vs. original: dropped the unused `t_send` local.
if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()

    bench_results = []  # (question, tier, latency_s | None, passed)
    LIGHT_TIMEOUT = 60  # seconds to wait for a light-tier reply

    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")

        # Submit the question; anything other than 202 Accepted is a failure.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue

        # Poll the logs until this question's run block appears or we time out.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            # Look back slightly further than we have waited so the start of
            # the run is always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)

        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)  # brief gap so consecutive runs don't interleave in the logs

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    # Score: only questions that actually went via light count toward latency.
    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0

    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── medium benchmark ──────────────────────────────────────────────────────────
# Same async POST-then-poll flow as the easy benchmark, but with a three-way
# verdict: 'medium' is a pass, 'light' passes only for the whitelisted simple
# questions, and 'complex' is always a wrong escalation.
# Fix vs. original: dropped the unused `t_send` local.
if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Questions simple enough that a light-tier answer is also acceptable.
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }

    med_results = []      # (question, tier, latency_s | None, correct)
    MEDIUM_TIMEOUT = 120  # seconds to wait for a medium-tier reply

    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")

        # Submit the question; anything other than 202 Accepted is a failure.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue

        # Poll the logs until this question's run block appears or we time out.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            # Look back further than the elapsed wait so the run start is
            # always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 60
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)

        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE

        # Classify the routing decision.
        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)  # brief gap so consecutive runs don't interleave in the logs

    # Per-question results table ('~' marks light answers, pass or not).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    # Light answers outside the whitelist are misroutes (warned, not failed).
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]

    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Hard failure condition is any complex escalation; timeouts get their
    # own report line so they show up separately in the summary.
    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results, f"Medium questions: all completed within {MEDIUM_TIMEOUT}s", False,
               f"{timeout_count} question(s) timed out")
# ── hard benchmark ────────────────────────────────────────────────────────────
# /think questions must escalate to the 'complex' tier; 'medium' is tolerated
# as a VRAM-eviction fallback, while 'light' or a timeout is a hard failure.
# Fixes vs. original: removed the unused `_VRAM_EXIT` constant and hoisted the
# loop-invariant '/think '-stripping out of the log-polling loop.
if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()

    hard_results = []      # (question, tier, latency_s | None, ok)
    COMPLEX_TIMEOUT = 300  # seconds to wait for a complex-tier reply
    _VRAM_ENTER = "[vram] enter_complex_mode"  # log marker for the VRAM flush

    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        # Strip the '/think ' prefix once; the logs record the bare question.
        bare_q = question[len("/think "):].strip()
        print(f" [{tag}] /think {bare_q[:60]!r}")

        # Submit the full '/think ...' message; non-202 is a failure.
        t_send = time.monotonic()
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue

        # Poll the logs (coarse 5s interval — complex runs are slow).
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            # Look back further than the elapsed wait so the run start is
            # always inside the fetched log window.
            since = int(time.monotonic() - t_start) + 90
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, bare_q)
            if found:
                break
            time.sleep(5)

        # Wall-clock since the POST, used to size the vram-log window below.
        elapsed = time.monotonic() - t_send

        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")

        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"

        # For true complex runs, confirm the VRAM-flush marker appeared in the
        # recent logs; its absence is flagged but does not fail the question.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent
        vram_note = ""
        if tier == "complex":
            vram_note = " [vram:flush✓]" if vram_enter_seen else f" [{WARN}:no vram flush log]"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)  # generous gap so back-to-back complex runs don't overlap

    # Per-question results table ('~' marks medium fallbacks).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]

    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Fail only on light routing or timeouts; medium fallbacks count as routed.
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           light_count == 0 and timeout_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}")
# ── summary ───────────────────────────────────────────────────────────────────
# Print the aggregated report and exit non-zero if any check failed, so CI
# pipelines can gate on this script.
print_summary(results)
exit_code = 0 if all(passed for _, passed in results) else 1
sys.exit(exit_code)
Reference in New Issue
Block a user