Files
adolf/tests/integration/test_routing.py
Alvis 021104f510 Split monolithic test_pipeline.py into focused integration test scripts
- common.py: shared config, URL constants, benchmark questions, all helpers
  (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy/medium/hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-12 16:02:57 +00:00

318 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Adolf tier routing benchmark.
Tests:
easy — 10 questions that must route to 'light' tier
medium — 11 questions that must route to 'medium' (light acceptable for some; complex = fail)
hard — 10 /think questions that must route to 'complex' (medium fallback acceptable)
Usage:
python3 test_routing.py [--chat-id CHAT_ID]
[--easy-only] # only easy benchmark
[--medium-only] # only medium benchmark
[--hard-only] # only hard benchmark
"""
import argparse
import sys
import time
from common import (
DEEPAGENTS, COMPOSE_FILE, DEFAULT_CHAT_ID,
BENCHMARK,
INFO, PASS, FAIL, WARN,
report, print_summary,
post_json, fetch_logs,
parse_run_block,
)
# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf routing benchmark")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
for _flag in ("--easy-only", "--medium-only", "--hard-only"):
    parser.add_argument(_flag, action="store_true")
args = parser.parse_args()

CHAT_ID = args.chat_id

# No --*-only flag means run everything; otherwise run only the selected tiers.
_only = args.easy_only or args.medium_only or args.hard_only
_run_easy = args.easy_only or not _only
_run_medium = args.medium_only or not _only
_run_hard = args.hard_only or not _only

results = []
# ── easy benchmark ────────────────────────────────────────────────────────────
if _run_easy:
    print(f"\n[{INFO}] Easy routing benchmark")
    print(f" {len(BENCHMARK['easy'])} questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()
    bench_results = []  # (question, tier, latency_s | None, passed)
    LIGHT_TIMEOUT = 60  # max seconds to wait for a light-tier reply
    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        print(f" [{tag}] {question[:55]!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue
        # Poll the service logs until this question's run block appears.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            since = int(time.monotonic() - t_start) + 30  # +30s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)
        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))
        time.sleep(1)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0
    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results, f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── medium benchmark ──────────────────────────────────────────────────────────
if _run_medium:
    print(f"\n[{INFO}] Medium routing benchmark")
    print(f" {len(BENCHMARK['medium'])} questions — must route to medium (light ok for some; complex = fail)")
    print(f" Chat ID: {CHAT_ID}")
    print()
    # Search-style questions the router may legitimately answer via 'light'.
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }
    med_results = []  # (question, tier, latency_s | None, correct)
    MEDIUM_TIMEOUT = 120  # max seconds to wait for a medium-tier reply
    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        print(f" [{tag}] {question[:60]!r}")
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue
        # Poll the service logs until this question's run block appears.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            since = int(time.monotonic() - t_start) + 60  # +60s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)
        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE
        if tier == "medium":
            correct, label, note = True, PASS, "medium ✓"
        elif tier == "light":
            correct = light_ok
            label = PASS if light_ok else WARN
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct, label, note = False, FAIL, "complex — wrong escalation"
        else:
            correct, label, note = False, FAIL, f"unknown tier {tier!r}"
        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))
        time.sleep(1)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/~/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    light_misroute = sum(1 for q, tier, _, _ in med_results
                         if tier == "light" and q not in LIGHT_ACCEPTABLE)
    lats = [lat for _, _, lat, _ in med_results if lat is not None]
    print(f"\n Breakdown: medium={medium_count} light={light_count} "
          f"complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results,
           f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
           complex_count == 0,
           f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if timeout_count:
        report(results, f"Medium questions: all completed within {MEDIUM_TIMEOUT}s", False,
               f"{timeout_count} question(s) timed out")
# ── hard benchmark ────────────────────────────────────────────────────────────
if _run_hard:
    print(f"\n[{INFO}] Hard routing benchmark")
    print(f" {len(BENCHMARK['hard'])} /think questions — must route to 'complex'")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()
    hard_results = []  # (question, tier, latency_s | None, ok)
    COMPLEX_TIMEOUT = 300  # max seconds to wait for a complex-tier reply
    _VRAM_ENTER = "[vram] enter_complex_mode"  # log marker: VRAM flush started
    _VRAM_EXIT = "[vram] exit_complex_mode"  # log marker: VRAM restored (currently unchecked)
    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        short_q = question[len("/think "):].strip()[:60]
        print(f" [{tag}] /think {short_q!r}")
        t_send = time.monotonic()  # start of end-to-end timing (used for elapsed below)
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue
        # Poll the logs; the run block is keyed on the question *without* the /think prefix.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            since = int(time.monotonic() - t_start) + 90  # +90s slack for log timestamp skew
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question[len("/think "):].strip())
            if found:
                break
            time.sleep(5)
        elapsed = time.monotonic() - t_send
        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue
        tier = found.get("tier", "unknown")
        if tier == "complex":
            ok, label, note = True, PASS, "complex ✓"
        elif tier == "medium":
            ok, label, note = True, WARN, "medium (VRAM fallback — check [vram] logs)"
        else:
            ok, label, note = False, FAIL, f"tier={tier} — unexpected"
        # For complex-tier answers, confirm the VRAM flush marker appeared in recent logs.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent
        vram_note = ""
        if tier == "complex":
            vram_note = " [vram:flush✓]" if vram_enter_seen else f" [{WARN}:no vram flush log]"
        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))
        time.sleep(5)

    # Per-question results table.
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    # NOTE(review): separator/status glyphs were garbled in the original
    # (''*n is empty); restored as '─' rules and ✓/~/✗ marks — confirm upstream.
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]
    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} "
          f"light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
    report(results,
           f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
           light_count == 0 and timeout_count == 0,
           f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}")
# ── summary ───────────────────────────────────────────────────────────────────
print_summary(results)
# Exit non-zero when any recorded check failed.
sys.exit(1 if any(not ok for _, ok in results) else 0)