#!/usr/bin/env python3
"""
Adolf tier routing benchmark.

Tests:
  easy   — 10 questions that must route to 'light' tier
  medium — 11 questions that must route to 'medium'
           (light acceptable for some; complex = fail)
  hard   — 10 /think questions that must route to 'complex'
           (medium fallback acceptable)

Usage:
  python3 test_routing.py [--chat-id CHAT_ID]
                          [--easy-only]    # only easy benchmark
                          [--medium-only]  # only medium benchmark
                          [--hard-only]    # only hard benchmark

The process exits with status 0 only when every reported check passed.
"""

import argparse
import sys
import time

from common import (
    DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
    BENCHMARK,
    INFO, PASS, FAIL, WARN,
    report, print_summary,
    post_json, fetch_logs,
    parse_run_block,
)


# ── helpers ───────────────────────────────────────────────────────────────────

_THINK_PREFIX = "/think "


def _strip_think(question):
    """Return *question* without a leading '/think ' prefix, stripped.

    Questions that do not start with the prefix are only stripped, instead of
    blindly losing their first characters.
    """
    if question.startswith(_THINK_PREFIX):
        question = question[len(_THINK_PREFIX):]
    return question.strip()


def _submit_and_wait(question, match_text, chat_id, timeout_s, lookback_s, poll_s):
    """POST one question and poll the logs until its run block shows up.

    Args:
        question:   message text sent to the /chat endpoint.
        match_text: text handed to ``parse_run_block`` to locate the run.
        chat_id:    chat id sent along with the message.
        timeout_s:  how long to keep polling before giving up.
        lookback_s: extra seconds added to the log window on every poll.
        poll_s:     pause between two polls.

    Returns:
        ``(run, fail_tier, elapsed)``. ``run`` is the truthy value returned by
        ``parse_run_block`` or ``None`` on failure. ``fail_tier`` is ``None``
        on success, ``"?"`` when the POST failed or was not answered with 202,
        and ``"timeout"`` when no run block appeared in time. ``elapsed`` is
        the number of seconds since the POST was started.

    Failures are printed here so every benchmark reports them identically.
    """
    sent_at = time.monotonic()
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": question, "chat_id": chat_id},
                              timeout=5)
    except Exception as e:  # any transport error counts as a failed question
        print(f" → [{FAIL}] POST error: {e}")
        return None, "?", time.monotonic() - sent_at
    if status != 202:
        print(f" → [{FAIL}] POST returned {status}")
        return None, "?", time.monotonic() - sent_at

    poll_start = time.monotonic()
    while time.monotonic() - poll_start < timeout_s:
        # Widen the log window as time passes so the run block stays in view.
        since = int(time.monotonic() - poll_start) + lookback_s
        run = parse_run_block(fetch_logs(since_s=since), match_text)
        if run:
            return run, None, time.monotonic() - sent_at
        time.sleep(poll_s)

    print(f" → [{FAIL}] no reply within {timeout_s}s")
    return None, "timeout", time.monotonic() - sent_at


def _print_latency_stats(lats, label="Avg latency"):
    """Print average/min/max of *lats*; prints nothing for an empty list."""
    if lats:
        print(f" {label}: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")


# ── args ──────────────────────────────────────────────────────────────────────

parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
parser.add_argument("--easy-only", action="store_true")
parser.add_argument("--medium-only", action="store_true")
parser.add_argument("--hard-only", action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id

# Without any --*-only flag all three benchmarks run.
_only = args.easy_only or args.medium_only or args.hard_only
_run_easy = not _only or args.easy_only
_run_medium = not _only or args.medium_only
_run_hard = not _only or args.hard_only

# Filled by report(); the exit status is derived from it at the end.
results = []


# ── easy benchmark ────────────────────────────────────────────────────────────

if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Rows of (question, tier, latency or None, passed).
    bench_results = []
    LIGHT_TIMEOUT = 60

    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")

        found, fail_tier, _ = _submit_and_wait(
            question, question, CHAT_ID,
            timeout_s=LIGHT_TIMEOUT, lookback_s=30, poll_s=1)
        if found is None:
            bench_results.append((question, fail_tier, None, False))
            continue

        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)

    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    # Latency statistics only cover questions that actually went via light.
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0

    print(f"\n Light-path score: {light_count}/{total_bench}")
    _print_latency_stats(lats, "Avg latency (light)")

    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")


# ── medium benchmark ──────────────────────────────────────────────────────────

if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Questions for which a 'light' answer is counted as correct.
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }

    # Rows of (question, tier, latency or None, correct).
    med_results = []
    MEDIUM_TIMEOUT = 120

    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")

        found, fail_tier, _ = _submit_and_wait(
            question, question, CHAT_ID,
            timeout_s=MEDIUM_TIMEOUT, lookback_s=60, poll_s=3)
        if found is None:
            med_results.append((question, fail_tier, None, False))
            continue

        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE

        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)

    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    # POST failures ("?") and unrecognised tiers: everything not counted above.
    error_count = total_med - medium_count - light_count - complex_count - timeout_count
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]

    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    _print_latency_stats(lats)

    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results,
               f"Medium questions: all completed within {MEDIUM_TIMEOUT}s",
               False,
               f"{timeout_count} question(s) timed out")
    if error_count:
        # Without this check a run where requests could not even be posted
        # would still be reported as "no complex escalation" and pass.
        report(results,
               "Medium questions: all requests accepted with a known tier",
               False,
               f"{error_count} question(s) failed to POST or reported an unknown tier")


# ── hard benchmark ────────────────────────────────────────────────────────────

if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Rows of (question, tier, latency or None, ok).
    hard_results = []
    COMPLEX_TIMEOUT = 300
    _VRAM_ENTER = "[vram] enter_complex_mode"
    _VRAM_EXIT = "[vram] exit_complex_mode"  # not referenced elsewhere in this script

    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        body = _strip_think(question)
        print(f" [{tag}] /think {body[:60]!r}")

        # The full question (with prefix) is sent; the logs are searched for
        # the text without the prefix.
        found, fail_tier, elapsed = _submit_and_wait(
            question, body, CHAT_ID,
            timeout_s=COMPLEX_TIMEOUT, lookback_s=90, poll_s=5)
        if found is None:
            hard_results.append((question, fail_tier, None, False))
            continue

        tier = found.get("tier", "unknown")

        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"

        vram_note = ""
        if tier == "complex":
            # Only a complex run is expected to log a VRAM flush, so the
            # extra log fetch is skipped for every other tier.
            lines_block = fetch_logs(since_s=int(elapsed) + 120)
            recent = "\n".join(lines_block[-200:])
            if _VRAM_ENTER in recent:
                vram_note = " [vram:flush✓]"
            else:
                vram_note = f" [{WARN}:no vram flush log]"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)

    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = _strip_think(q)[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    # POST failures ("?") and unrecognised tiers: everything not counted above.
    other_count = total_hard - complex_count - medium_fb - light_count - timeout_count
    failed_count = sum(1 for _, _, _, ok in hard_results if not ok)
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]

    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    _print_latency_stats(lats)

    # Pass only when every question ended in an accepted tier. Checking just
    # light/timeout counts would let POST failures and unknown tiers pass.
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           failed_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} "
           f"timeout={timeout_count} other={other_count}")


# ── summary ───────────────────────────────────────────────────────────────────

print_summary(results)
sys.exit(0 if all(ok for _, ok in results) else 1)