- common.py: shared config, URL constants, benchmark questions, and all helpers (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy-only/--medium-only/--hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
318 lines · 14 KiB · Python
#!/usr/bin/env python3
"""
Adolf tier routing benchmark.

Tests:
    easy   — 10 questions that must route to 'light' tier
    medium — 11 questions that must route to 'medium' (light acceptable for some; complex = fail)
    hard   — 10 /think questions that must route to 'complex' (medium fallback acceptable)

Usage:
    python3 test_routing.py [--chat-id CHAT_ID]
                            [--easy-only]    # only easy benchmark
                            [--medium-only]  # only medium benchmark
                            [--hard-only]    # only hard benchmark
"""
import argparse
|
|
import sys
|
|
import time
|
|
|
|
from common import (
|
|
DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
|
|
BENCHMARK,
|
|
INFO, PASS, FAIL, WARN,
|
|
report, print_summary,
|
|
post_json, fetch_logs,
|
|
parse_run_block,
|
|
)
|
|
|
|
# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
# The three selector flags are identical store_true switches.
for _flag in ("--easy-only", "--medium-only", "--hard-only"):
    parser.add_argument(_flag, action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id

# With no --*-only flag every benchmark runs; otherwise only the selected
# one(s) run (passing several flags runs each of them).
_only = any((args.easy_only, args.medium_only, args.hard_only))
_run_easy = args.easy_only or not _only
_run_medium = args.medium_only or not _only
_run_hard = args.hard_only or not _only

# Accumulates (check-name, passed) pairs consumed by print_summary at exit.
results = []
# ── easy benchmark ────────────────────────────────────────────────────────────
# Sends each easy question to the chat service and verifies (via the service
# logs) that the router picked the 'light' tier for every one of them.
if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()

    bench_results = []   # (question, tier, latency_s | None, passed)
    LIGHT_TIMEOUT = 60   # seconds to wait for the run to appear in the logs

    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")

        # Async POST: the service acks with 202 and processes in the background.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue

        # Poll the logs until parse_run_block finds the finished run for this
        # question, or LIGHT_TIMEOUT elapses. The fetch window grows with the
        # elapsed time plus 30s slack for log flush lag.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)

        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)

    # Per-question result table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    # Aggregate: the check passes only if every question went via light.
    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0

    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── medium benchmark ──────────────────────────────────────────────────────────
# Sends each medium question and checks the routed tier: 'medium' is a pass,
# 'light' passes only for whitelisted simple lookups, 'complex' is a hard fail.
if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Questions that may acceptably be answered on the light path; any other
    # question landing on light is a misroute (warned, scored wrong).
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }

    med_results = []     # (question, tier, latency_s | None, correct)
    MEDIUM_TIMEOUT = 120 # seconds to wait for the run to appear in the logs

    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")

        # Async POST: the service acks with 202 and processes in the background.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue

        # Poll the logs until parse_run_block sees the finished run (+60s
        # window slack for log flush lag).
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            since = int(time.monotonic() - t_start) + 60
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)

        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE

        # Classify the routing outcome for this question.
        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)

    # Per-question result table ('~' marks a light-tier answer).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    # Aggregate breakdown per tier.
    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]

    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Pass criterion: no question escalated to complex; timeouts are reported
    # as a separate failing check.
    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results, f"Medium questions: all completed within {MEDIUM_TIMEOUT}s", False,
               f"{timeout_count} question(s) timed out")
# ── hard benchmark ────────────────────────────────────────────────────────────
# Sends each /think question and checks it reached the 'complex' tier;
# 'medium' is an accepted VRAM-eviction fallback, 'light' or timeout fails.
if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()

    hard_results = []      # (question, tier, latency_s | None, ok)
    COMPLEX_TIMEOUT = 300  # seconds to wait — complex-tier replies are slow
    # Log markers emitted around complex-mode VRAM handling.
    _VRAM_ENTER = "[vram] enter_complex_mode"
    _VRAM_EXIT = "[vram] exit_complex_mode"  # NOTE(review): not referenced below — confirm whether still needed

    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        # Display form: the question with its "/think " prefix stripped.
        short_q = question[len("/think "):].strip()[:60]
        print(f" [{tag}] /think {short_q!r}")

        t_send = time.monotonic()
        # Async POST: the service acks with 202 and processes in the background.
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue

        # Poll the logs; the run block is matched on the question *without*
        # the /think prefix (+90s window slack for log flush lag).
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            since = int(time.monotonic() - t_start) + 90
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question[len("/think "):].strip())
            if found:
                break
            time.sleep(5)

        elapsed = time.monotonic() - t_send  # wall time since the POST

        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")

        # complex = pass; medium = accepted VRAM fallback (warn); else fail.
        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"

        # For genuine complex runs, also confirm the VRAM enter marker showed
        # up in the last 200 lines of the run's log window.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent
        vram_note = ""
        if tier == "complex":
            vram_note = " [vram:flush✓]" if vram_enter_seen else f" [{WARN}:no vram flush log]"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)

    # Per-question result table ('~' marks an accepted medium fallback).
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    # Aggregate breakdown per tier.
    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]

    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    # Pass criterion: nothing routed to light and nothing timed out
    # (medium fallback is tolerated).
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           light_count == 0 and timeout_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}")
# ── summary ───────────────────────────────────────────────────────────────────
# Print the aggregate pass/fail table, then exit non-zero when any check
# failed so CI can gate on this script's exit code.
print_summary(results)
failed = [name for name, ok in results if not ok]
sys.exit(1 if failed else 0)