Files
adolf/tests/integration/test_routing.py
Alvis 021104f510 Split monolithic test_pipeline.py into focused integration test scripts
- common.py: shared config, URL constants, benchmark questions, all helpers
  (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy/medium/hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-12 16:02:57 +00:00

318 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Adolf tier routing benchmark.
Tests:
easy — 10 questions that must route to 'light' tier
medium — 11 questions that must route to 'medium' (light acceptable for some; complex = fail)
hard — 10 /think questions that must route to 'complex' (medium fallback acceptable)
Usage:
python3 test_routing.py [--chat-id CHAT_ID]
[--easy-only] # only easy benchmark
[--medium-only] # only medium benchmark
[--hard-only] # only hard benchmark
"""
import argparse
import sys
import time
from common import (
DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
BENCHMARK,
INFO, PASS, FAIL, WARN,
report, print_summary,
post_json, fetch_logs,
parse_run_block,
)
# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
for _flag in ("--easy-only", "--medium-only", "--hard-only"):
    parser.add_argument(_flag, action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id

# No --*-only flag means run everything; otherwise run only the selected tiers.
_only = args.easy_only or args.medium_only or args.hard_only
_run_easy = args.easy_only or not _only
_run_medium = args.medium_only or not _only
_run_hard = args.hard_only or not _only

results = []
# ── easy benchmark ────────────────────────────────────────────────────────────
if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()
    bench_results = []  # (question, tier, latency_s | None, passed)
    LIGHT_TIMEOUT = 60  # max seconds to wait for a light-tier reply
    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue
        # Poll the service logs until this question's run block appears.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            since = int(time.monotonic() - t_start) + 30  # +30s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)
        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0
    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── medium benchmark ──────────────────────────────────────────────────────────
if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()
    # Search-style questions the router may legitimately answer via 'light'.
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }
    med_results = []  # (question, tier, latency_s | None, correct)
    MEDIUM_TIMEOUT = 120  # max seconds to wait for a medium-tier reply
    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue
        # Poll the service logs until this question's run block appears.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            since = int(time.monotonic() - t_start) + 60  # +60s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)
        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE
        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"
        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/~/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]
    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results, f"Medium questions: all completed within {MEDIUM_TIMEOUT}s", False,
               f"{timeout_count} question(s) timed out")
# ── hard benchmark ────────────────────────────────────────────────────────────
if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()
    hard_results = []  # (question, tier, latency_s | None, ok)
    COMPLEX_TIMEOUT = 300  # max seconds to wait for a complex-tier reply
    _VRAM_ENTER = "[vram] enter_complex_mode"  # log marker: VRAM flush started
    _VRAM_EXIT = "[vram] exit_complex_mode"  # log marker: VRAM restored (currently unchecked)
    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        short_q = question[len("/think "):].strip()[:60]
        print(f" [{tag}] /think {short_q!r}")
        t_send = time.monotonic()  # start of end-to-end timing (used for elapsed below)
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue
        # Poll the logs; the run block is keyed on the question *without* the /think prefix.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            since = int(time.monotonic() - t_start) + 90  # +90s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question[len("/think "):].strip())
            if found:
                break
            time.sleep(5)
        elapsed = time.monotonic() - t_send
        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"
        # For complex-tier answers, confirm the VRAM flush marker appeared in recent logs.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent
        vram_note = ""
        if tier == "complex":
            vram_note = " [vram:flush✓]" if vram_enter_seen else f" [{WARN}:no vram flush log]"
        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/~/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]
    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           light_count == 0 and timeout_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}")
# ── summary ───────────────────────────────────────────────────────────────────
print_summary(results)
# Exit non-zero when any recorded check failed.
sys.exit(1 if any(not ok for _, ok in results) else 0)