Move benchmark scripts into benchmarks/ subdir
- benchmarks/run_benchmark.py (was run_benchmark.py)
- benchmarks/run_voice_benchmark.py (was run_voice_benchmark.py)
- Scripts use Path(__file__).parent so paths resolve correctly in the subdir
- .gitignore updated: ignore benchmarks/benchmark.json, results_latest.json, voice_results*.json, voice_audio/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
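For reference, this is the pattern that makes the relocation safe (a minimal sketch mirroring the constants in the diff below): paths are anchored to the script's own directory rather than the current working directory.

    from pathlib import Path

    # Resolves next to this script, so it works no matter where it is invoked from
    DATASET = Path(__file__).parent / "benchmark.json"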
benchmarks/run_benchmark.py (new file, 318 lines)
@@ -0,0 +1,318 @@
#!/usr/bin/env python3
"""
Adolf routing benchmark.

Sends each query to Adolf's /message endpoint, waits briefly for the routing
decision to appear in docker logs, then records the actual tier.

Usage:
    python3 run_benchmark.py [options]
    python3 run_benchmark.py --tier light|medium|complex
    python3 run_benchmark.py --category <name>
    python3 run_benchmark.py --ids 1,2,3
    python3 run_benchmark.py --list-categories
    python3 run_benchmark.py --dry-run   # complex queries use medium model (no API cost)

IMPORTANT: Always check GPU is free before running. This script does it automatically.

Adolf must be running at http://localhost:8000.
"""

import argparse
import asyncio
import json
import re
import subprocess
import sys
import time
from pathlib import Path

import httpx

ADOLF_URL = "http://localhost:8000"
OLLAMA_URL = "http://localhost:11436"  # GPU Ollama
DATASET = Path(__file__).parent / "benchmark.json"
RESULTS = Path(__file__).parent / "results_latest.json"

# Max time to wait for each query to fully complete via SSE stream
QUERY_TIMEOUT = 300  # seconds — generous to handle GPU semaphore waits

# Memory thresholds
MIN_FREE_RAM_MB = 1500  # abort if less than this is free
MIN_FREE_VRAM_MB = 500  # warn if less than this is free on GPU


# ── Pre-flight checks ──────────────────────────────────────────────────────────

def check_ram() -> tuple[bool, str]:
    """Check available system RAM. Returns (ok, message)."""
    try:
        with open("/proc/meminfo") as f:
            info = {}
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
        free_mb = info.get("MemAvailable", 0) // 1024
        total_mb = info.get("MemTotal", 0) // 1024
        msg = f"RAM: {free_mb} MB free / {total_mb} MB total"
        if free_mb < MIN_FREE_RAM_MB:
            return False, f"CRITICAL: {msg} — need at least {MIN_FREE_RAM_MB} MB free"
        return True, msg
    except Exception as e:
        return True, f"RAM check failed (non-fatal): {e}"
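
# Note: /proc/meminfo reports sizes in kB (e.g. "MemAvailable:  16123456 kB"),
# which is why check_ram() divides the parsed values by 1024 to get MB.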


def check_gpu() -> tuple[bool, str]:
    """Check GPU VRAM via Ollama /api/ps. Returns (ok, message)."""
    try:
        r = httpx.get(f"{OLLAMA_URL}/api/ps", timeout=5)
        r.raise_for_status()
        data = r.json()
        models = data.get("models", [])
        if models:
            names = [m.get("name", "?") for m in models]
            sizes_mb = [m.get("size_vram", 0) // (1024 * 1024) for m in models]
            loaded = ", ".join(f"{n} ({s}MB)" for n, s in zip(names, sizes_mb))
            total_vram = sum(sizes_mb)
            if total_vram > 7000:
                return False, f"GPU BUSY: models loaded = {loaded} — total VRAM used {total_vram}MB. Wait for models to unload."
            return True, f"GPU: models loaded = {loaded} (total {total_vram}MB VRAM)"
        return True, "GPU: idle (no models loaded)"
    except httpx.ConnectError:
        return True, "GPU check skipped (Ollama not reachable at localhost:11436)"
    except Exception as e:
        return True, f"GPU check failed (non-fatal): {e}"
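
# Ollama's /api/ps endpoint lists currently loaded models; each entry carries
# size_vram in bytes, hence the // (1024 * 1024) conversion to MB above.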


def preflight_checks(skip_gpu_check: bool = False) -> bool:
    """Run all pre-flight checks. Returns True if safe to proceed."""
    print("\n── Pre-flight checks ──────────────────────────────────────────")

    ram_ok, ram_msg = check_ram()
    print(f" {'✓' if ram_ok else '✗'} {ram_msg}")
    if not ram_ok:
        print("\nABORTING: not enough RAM. Free up memory before running benchmark.")
        return False

    if not skip_gpu_check:
        gpu_ok, gpu_msg = check_gpu()
        print(f" {'✓' if gpu_ok else '✗'} {gpu_msg}")
        if not gpu_ok:
            print("\nABORTING: GPU is busy. Wait for current inference to finish, then retry.")
            return False

    print(" All checks passed.\n")
    return True


# ── Log helpers ────────────────────────────────────────────────────────────────

def get_log_tail(n: int = 50) -> str:
    result = subprocess.run(
        ["docker", "logs", "deepagents", "--tail", str(n)],
        capture_output=True, text=True,
    )
    return result.stdout + result.stderr
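
# docker logs emits the container's stdout and stderr on the corresponding host
# streams, so both are captured and concatenated to avoid missing tier lines.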


def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
    """Find new tier= lines that appeared after we sent the query."""
    before_lines = set(logs_before.splitlines())
    new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
    for line in reversed(new_lines):
        m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
        if m:
            tier_raw = m.group(1)
            # Normalise: "complex (dry-run)" → "complex"
            return tier_raw.split()[0]
    return None
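
# A matching log line would look something like this (format assumed here for
# illustration; only the "tier=..." token matters to the regex):
#   [router] session=benchmark-7 tier=complex (dry-run)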


# ── Request helpers ────────────────────────────────────────────────────────────

async def post_message(
    client: httpx.AsyncClient,
    query_id: int,
    query: str,
    dry_run: bool = False,
) -> bool:
    payload = {
        "text": query,
        "session_id": f"benchmark-{query_id}",
        "channel": "cli",
        "user_id": "benchmark",
        "metadata": {"dry_run": dry_run, "benchmark": True},
    }
    try:
        r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
        r.raise_for_status()
        return True
    except Exception as e:
        print(f" POST_ERROR: {e}", end="")
        return False
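
# post_message() only confirms that /message accepted the request (10 s timeout);
# actual completion is observed later via the per-session SSE stream in run().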


# ── Dataset ────────────────────────────────────────────────────────────────────

def load_dataset() -> list[dict]:
    with open(DATASET) as f:
        return json.load(f)["queries"]


def filter_queries(queries, tier, category, ids):
    if tier:
        queries = [q for q in queries if q["tier"] == tier]
    if category:
        queries = [q for q in queries if q["category"] == category]
    if ids:
        queries = [q for q in queries if q["id"] in ids]
    return queries
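
# The filters compose, e.g. filter_queries(queries, "light", None, [1, 2, 3])
# keeps only light-tier queries whose id is 1, 2 or 3.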


# ── Main run ───────────────────────────────────────────────────────────────────

async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
    results = []

    async with httpx.AsyncClient() as client:
        try:
            r = await client.get(f"{ADOLF_URL}/health", timeout=5)
            r.raise_for_status()
        except Exception as e:
            print(f"ERROR: Adolf not reachable: {e}", file=sys.stderr)
            sys.exit(1)

        total = len(queries)
        correct = 0

        dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
        print(f"\nRunning {total} queries{dry_label}\n")
        print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
        print("─" * 110)

        for q in queries:
            qid = q["id"]
            expected = q["tier"]
            category = q["category"]
            query_text = q["query"]

            # In dry-run, complex queries still use complex classification (logged), but medium infers
            send_dry = dry_run and expected == "complex"
            session_id = f"benchmark-{qid}"

            print(f"{qid:>3} {expected:8} ", end="", flush=True)

            logs_before = get_log_tail(80)
            t0 = time.monotonic()

            ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
            if not ok_post:
                print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
                results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
                continue

            # Wait for query to complete via SSE stream (handles GPU semaphore waits)
            try:
                async with client.stream(
                    "GET", f"{ADOLF_URL}/stream/{session_id}", timeout=QUERY_TIMEOUT
                ) as sse:
                    async for line in sse.aiter_lines():
                        if "data: [DONE]" in line:
                            break
            except Exception:
                pass  # timeout or connection issue — check logs anyway

            # Now the query is done — check logs for tier
            await asyncio.sleep(0.3)
            logs_after = get_log_tail(80)
            actual = extract_tier_from_logs(logs_before, logs_after)

            elapsed = time.monotonic() - t0
            match = actual == expected or (actual == "fast" and expected == "medium")
            if match:
                correct += 1

            mark = "✓" if match else "✗"
            actual_str = actual or "?"
            print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}")

            results.append({
                "id": qid,
                "expected": expected,
                "actual": actual_str,
                "ok": match,
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
                "dry_run": send_dry,
            })

        print("─" * 110)
        accuracy = correct / total * 100 if total else 0
        print(f"\nAccuracy: {correct}/{total} ({accuracy:.0f}%)")

        for tier_name in ["light", "medium", "complex"]:
            tier_qs = [r for r in results if r["expected"] == tier_name]
            if tier_qs:
                tier_ok = sum(1 for r in tier_qs if r["ok"])
                print(f" {tier_name:8}: {tier_ok}/{len(tier_qs)}")

        wrong = [r for r in results if not r["ok"]]
        if wrong:
            print(f"\nMisclassified ({len(wrong)}):")
            for r in wrong:
                print(f" id={r['id']:3} expected={r['expected']:8} actual={r['actual']:8} {r['query'][:60]}")

        with open(RESULTS, "w") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {RESULTS}")

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Adolf routing benchmark",
        epilog="IMPORTANT: Always check GPU is free before running. This is done automatically.",
    )
    parser.add_argument("--tier", choices=["light", "medium", "complex"])
    parser.add_argument("--category")
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
    )
    parser.add_argument(
        "--skip-gpu-check",
        action="store_true",
        help="Skip GPU availability check (use only if you know GPU is free)",
    )
    args = parser.parse_args()

    queries = load_dataset()

    if args.list_categories:
        cats = sorted(set(q["category"] for q in queries))
        tiers = {t: sum(1 for q in queries if q["tier"] == t) for t in ["light", "medium", "complex"]}
        print(f"Total: {len(queries)} | Tiers: {tiers}")
        print(f"Categories: {cats}")
        return

    # ALWAYS check GPU and RAM before running
    if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
        sys.exit(1)

    ids = [int(i) for i in args.ids.split(",")] if args.ids else None
    queries = filter_queries(queries, args.tier, args.category, ids)
    if not queries:
        print("No queries match filters.")
        sys.exit(1)

    asyncio.run(run(queries, dry_run=args.dry_run))


if __name__ == "__main__":
    main()
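
A typical invocation after the move, matching the usage block in the docstring:

    python3 benchmarks/run_benchmark.py --dry-run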