Files
adolf/run_benchmark.py
Alvis ab68bba935 Add routing benchmark scripts; gitignore dataset and results
- run_benchmark.py: sends queries to /message, extracts tier= from docker
  logs, reports per-tier accuracy, saves results_latest.json
- run_voice_benchmark.py: voice path benchmark
- .gitignore: ignore benchmark.json (dataset) and results_latest.json
  (runtime output); benchmark scripts are tracked, data files are not

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 02:00:17 +00:00

319 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Adolf routing benchmark.
Sends each query to Adolf's /message endpoint, waits for the reply to finish on
the /stream/<session_id> SSE stream, then reads the routing decision (tier=...)
from the docker logs and records the actual tier.
Usage:
python3 run_benchmark.py [options]
python3 run_benchmark.py --tier light|medium|complex
python3 run_benchmark.py --category <name>
python3 run_benchmark.py --ids 1,2,3
python3 run_benchmark.py --list-categories
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost)
IMPORTANT: Always check GPU is free before running. This script does it automatically.
Adolf must be running at http://localhost:8000.
"""
import argparse
import asyncio
import json
import re
import subprocess
import sys
import time
from collections import Counter
from pathlib import Path

import httpx
# Base URL of the Adolf service under test; the script calls /health, /message
# and /stream/<session_id> on it.
ADOLF_URL = "http://localhost:8000"
# Base URL of the GPU Ollama instance; check_gpu() queries /api/ps on it.
OLLAMA_URL = "http://localhost:11436"  # GPU Ollama
# Benchmark dataset: a JSON file next to this script with a top-level "queries" list.
DATASET = Path(__file__).parent / "benchmark.json"
# Per-query results of the latest run; opened in "w" mode, so overwritten each time.
RESULTS = Path(__file__).parent / "results_latest.json"
# Timeout (seconds) handed to the SSE stream request that waits for a query to
# finish — generous to handle GPU semaphore waits.
# NOTE(review): this is passed as httpx `timeout=`, which httpx applies per
# operation (connect/read/...), not as a total per-query deadline — confirm intent.
QUERY_TIMEOUT = 300
# Memory thresholds
# check_ram() reports failure (and the run aborts) below this much available RAM.
MIN_FREE_RAM_MB = 1500
# Intended as a "low free VRAM" warning level.
# NOTE(review): not referenced anywhere in the code visible here; check_gpu()
# uses its own 7000 MB "VRAM in use" threshold instead.
MIN_FREE_VRAM_MB = 500
# ── Pre-flight checks ──────────────────────────────────────────────────────────
def check_ram() -> tuple[bool, str]:
    """Report whether enough system RAM is available to run the benchmark.

    Reads /proc/meminfo and compares ``MemAvailable`` against
    ``MIN_FREE_RAM_MB``.

    Returns:
        ``(ok, message)``. ``ok`` is False only when the available RAM could
        be read and is below the threshold. Any failure to read or parse
        /proc/meminfo is treated as non-fatal and reported with ``ok=True``.
    """
    try:
        with open("/proc/meminfo") as meminfo:
            rows = [raw.split() for raw in meminfo]
        # Rows look like "MemTotal:  16316412 kB"; values are in KiB.
        kib = {cols[0].rstrip(":"): int(cols[1]) for cols in rows if len(cols) >= 2}
        available_mb = kib.get("MemAvailable", 0) // 1024
        total_mb = kib.get("MemTotal", 0) // 1024
        summary = f"RAM: {available_mb} MB free / {total_mb} MB total"
        if available_mb >= MIN_FREE_RAM_MB:
            return True, summary
        return False, f"CRITICAL: {summary} — need at least {MIN_FREE_RAM_MB} MB free"
    except Exception as exc:
        # Best effort: an unreadable /proc/meminfo must not block the benchmark.
        return True, f"RAM check failed (non-fatal): {exc}"
def check_gpu() -> tuple[bool, str]:
    """Check how much VRAM is held by models loaded in Ollama (GET /api/ps).

    Returns:
        ``(ok, message)``. ``ok`` is False only when the loaded models
        together occupy more than 7000 MB of VRAM. An unreachable Ollama or
        any other error is treated as non-fatal and reported with ``ok=True``.
    """
    # Total "size_vram" above which the GPU is considered busy.
    busy_threshold_mb = 7000
    try:
        r = httpx.get(f"{OLLAMA_URL}/api/ps", timeout=5)
        r.raise_for_status()
        models = r.json().get("models", [])
        if not models:
            return True, "GPU: idle (no models loaded)"
        names = [m.get("name", "?") for m in models]
        # size_vram is reported in bytes; convert each model to whole MB.
        sizes_mb = [m.get("size_vram", 0) // (1024 * 1024) for m in models]
        loaded = ", ".join(f"{n} ({s}MB)" for n, s in zip(names, sizes_mb))
        total_vram = sum(sizes_mb)
        if total_vram > busy_threshold_mb:
            return False, f"GPU BUSY: models loaded = {loaded} — total VRAM used {total_vram}MB. Wait for models to unload."
        return True, f"GPU: models loaded = {loaded} (total {total_vram}MB VRAM)"
    except httpx.ConnectError:
        # Report the configured URL so the message stays correct if OLLAMA_URL changes.
        return True, f"GPU check skipped (Ollama not reachable at {OLLAMA_URL})"
    except Exception as e:
        # Best effort: a malformed response must not block the benchmark.
        return True, f"GPU check failed (non-fatal): {e}"
def preflight_checks(skip_gpu_check: bool = False) -> bool:
    """Run the pre-flight checks and print one status line per check.

    Args:
        skip_gpu_check: When True, only the RAM check is performed.

    Returns:
        True if it is safe to proceed, False if a check failed (an
        "ABORTING" explanation has already been printed in that case).
    """
    print("\n── Pre-flight checks ──────────────────────────────────────────")
    ram_ok, ram_msg = check_ram()
    # Distinct pass/fail markers so a failed check is visible at a glance.
    print(f" {'✓' if ram_ok else '✗'} {ram_msg}")
    if not ram_ok:
        print("\nABORTING: not enough RAM. Free up memory before running benchmark.")
        return False
    if not skip_gpu_check:
        gpu_ok, gpu_msg = check_gpu()
        print(f" {'✓' if gpu_ok else '✗'} {gpu_msg}")
        if not gpu_ok:
            print("\nABORTING: GPU is busy. Wait for current inference to finish, then retry.")
            return False
    print(" All checks passed.\n")
    return True
# ── Log helpers ────────────────────────────────────────────────────────────────
def get_log_tail(n: int = 50) -> str:
    """Return the last ``n`` log lines of the ``deepagents`` docker container.

    Runs ``docker logs deepagents --tail <n>`` and returns the captured
    stdout followed by the captured stderr as one string.

    Args:
        n: Number of trailing log lines to request.

    Returns:
        The captured log text. The exit status of ``docker`` is not checked.
    """
    result = subprocess.run(
        ["docker", "logs", "deepagents", "--tail", str(n)],
        capture_output=True,
        # Decode explicitly and leniently: a stray non-UTF-8 byte in the
        # container output must not abort the whole benchmark run.
        encoding="utf-8",
        errors="replace",
    )
    return result.stdout + result.stderr
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
"""Find new tier= lines that appeared after we sent the query."""
before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
for line in reversed(new_lines):
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
if m:
tier_raw = m.group(1)
# Normalise: "complex (dry-run)" → "complex"
return tier_raw.split()[0]
return None
# ── Request helpers ────────────────────────────────────────────────────────────
async def post_message(
    client: httpx.AsyncClient,
    query_id: int,
    query: str,
    dry_run: bool = False,
) -> bool:
    """Submit one benchmark query to Adolf's /message endpoint.

    Args:
        client: Shared async HTTP client.
        query_id: Dataset ID; used to build the ``benchmark-<id>`` session ID.
        query: Query text to send.
        dry_run: Forwarded to the service in the request metadata.

    Returns:
        True when the POST succeeded with a non-error status, False
        otherwise (the error is printed inline, without a newline).
    """
    body = {
        "text": query,
        "session_id": f"benchmark-{query_id}",
        "channel": "cli",
        "user_id": "benchmark",
        "metadata": {"dry_run": dry_run, "benchmark": True},
    }
    try:
        response = await client.post(f"{ADOLF_URL}/message", json=body, timeout=10)
        response.raise_for_status()
    except Exception as exc:
        # No newline: the caller finishes the current table row itself.
        print(f" POST_ERROR: {exc}", end="")
        return False
    else:
        return True
# ── Dataset ────────────────────────────────────────────────────────────────────
def load_dataset() -> list[dict]:
with open(DATASET) as f:
return json.load(f)["queries"]
def filter_queries(queries, tier, category, ids):
    """Narrow ``queries`` down by tier, category and/or ID.

    Every filter is optional: a falsy ``tier``, ``category`` or ``ids``
    disables that filter. Active filters are applied one after another, so a
    query must satisfy all of them. With no active filter the input is
    returned as is.
    """
    checks = []
    if tier:
        checks.append(lambda entry: entry["tier"] == tier)
    if category:
        checks.append(lambda entry: entry["category"] == category)
    if ids:
        checks.append(lambda entry: entry["id"] in ids)
    for keep in checks:
        queries = [entry for entry in queries if keep(entry)]
    return queries
# ── Main run ───────────────────────────────────────────────────────────────────
def _print_summary(results: list[dict], correct: int, total: int) -> None:
    """Print overall accuracy, the per-tier breakdown and every miss."""
    accuracy = correct / total * 100 if total else 0
    print(f"\nAccuracy: {correct}/{total} ({accuracy:.0f}%)")
    for tier_name in ["light", "medium", "complex"]:
        tier_qs = [r for r in results if r["expected"] == tier_name]
        if tier_qs:
            tier_ok = sum(1 for r in tier_qs if r["ok"])
            print(f" {tier_name:8}: {tier_ok}/{len(tier_qs)}")
    wrong = [r for r in results if not r["ok"]]
    if wrong:
        print(f"\nMisclassified ({len(wrong)}):")
        for r in wrong:
            # "actual" is None for queries whose POST failed.
            actual = r["actual"] or "?"
            print(f" id={r['id']:3} expected={r['expected']:8} actual={actual:8} {r['query'][:60]}")


async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
    """Send every query to Adolf, record the tier it was routed to, and report.

    For each query the function snapshots the docker log tail, POSTs the
    query, waits on the session's SSE stream until ``data: [DONE]`` (or an
    error/timeout), snapshots the log tail again and extracts the tier from
    the newly added lines. A table row is printed per query, followed by an
    accuracy summary; the results are also written to ``RESULTS`` as JSON.

    Args:
        queries: Dataset entries with ``id``, ``tier``, ``category`` and
            ``query`` keys.
        dry_run: When True, queries whose expected tier is ``complex`` are
            sent with ``dry_run`` set in the request metadata.

    Returns:
        One result dict per query, in input order. ``actual`` is None when
        the POST failed and ``"?"`` when no tier was found in the logs.

    Exits the process with status 1 if Adolf's /health endpoint is not
    reachable.
    """
    results = []
    async with httpx.AsyncClient() as client:
        try:
            r = await client.get(f"{ADOLF_URL}/health", timeout=5)
            r.raise_for_status()
        except Exception as e:
            print(f"ERROR: Adolf not reachable: {e}", file=sys.stderr)
            sys.exit(1)
        total = len(queries)
        correct = 0
        dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
        print(f"\nRunning {total} queries{dry_label}\n")
        print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
        print("─" * 110)
        for q in queries:
            qid = q["id"]
            expected = q["tier"]
            category = q["category"]
            query_text = q["query"]
            # In dry-run, complex queries still use complex classification (logged), but medium infers
            send_dry = dry_run and expected == "complex"
            session_id = f"benchmark-{qid}"
            print(f"{qid:>3} {expected:8} ", end="", flush=True)
            logs_before = get_log_tail(80)
            t0 = time.monotonic()
            ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
            if not ok_post:
                print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
                # Same keys as a normal record so the summary and any consumer
                # of the JSON file can treat every entry uniformly.
                results.append({
                    "id": qid,
                    "expected": expected,
                    "actual": None,
                    "ok": False,
                    "elapsed": round(time.monotonic() - t0, 1),
                    "category": category,
                    "query": query_text,
                    "dry_run": send_dry,
                })
                continue
            # Wait for query to complete via SSE stream (handles GPU semaphore waits)
            try:
                async with client.stream(
                    "GET", f"{ADOLF_URL}/stream/{session_id}", timeout=QUERY_TIMEOUT
                ) as sse:
                    async for line in sse.aiter_lines():
                        if "data: [DONE]" in line:
                            break
            except Exception:
                # Deliberate best effort: on timeout or a dropped connection the
                # routing decision may still have been logged, so check anyway.
                pass
            # Give the log line a moment to be flushed before reading the tail.
            await asyncio.sleep(0.3)
            logs_after = get_log_tail(80)
            actual = extract_tier_from_logs(logs_before, logs_after)
            elapsed = time.monotonic() - t0
            # A "fast" routing decision is accepted for queries expected to be "medium".
            match = actual == expected or (actual == "fast" and expected == "medium")
            if match:
                correct += 1
            mark = "✓" if match else "✗"
            actual_str = actual or "?"
            print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}")
            results.append({
                "id": qid,
                "expected": expected,
                "actual": actual_str,
                "ok": match,
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
                "dry_run": send_dry,
            })
    print("─" * 110)
    _print_summary(results, correct, total)
    # ensure_ascii=False writes non-ASCII text verbatim, so pin the encoding.
    with open(RESULTS, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {RESULTS}")
    return results
def _parse_ids(raw: str) -> list[int]:
    """Parse a comma-separated ID list, ignoring blank entries ("1, 2,")."""
    return [int(part) for part in raw.split(",") if part.strip()]


def main():
    """Command-line entry point: parse options, run checks, run the benchmark.

    Exits with status 1 when the dataset file is missing, a pre-flight check
    fails or no query matches the filters, and with argparse's usage error
    when ``--ids`` is not a list of integers.
    """
    parser = argparse.ArgumentParser(
        description="Adolf routing benchmark",
        epilog="IMPORTANT: Always check GPU is free before running. This is done automatically."
    )
    parser.add_argument("--tier", choices=["light", "medium", "complex"])
    parser.add_argument("--category")
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
    )
    parser.add_argument(
        "--skip-gpu-check",
        action="store_true",
        help="Skip GPU availability check (use only if you know GPU is free)",
    )
    args = parser.parse_args()
    try:
        queries = load_dataset()
    except FileNotFoundError:
        # Report a missing dataset as a one-line error instead of a traceback.
        print(f"ERROR: dataset not found: {DATASET}", file=sys.stderr)
        sys.exit(1)
    if args.list_categories:
        cats = sorted(set(q["category"] for q in queries))
        tiers = {t: sum(1 for q in queries if q["tier"] == t) for t in ["light", "medium", "complex"]}
        print(f"Total: {len(queries)} | Tiers: {tiers}")
        print(f"Categories: {cats}")
        return
    # Validate --ids before the pre-flight checks so a typo fails fast with a
    # usage message instead of a ValueError traceback.
    ids = None
    if args.ids:
        try:
            ids = _parse_ids(args.ids)
        except ValueError:
            parser.error(f"--ids must be a comma-separated list of integers, got {args.ids!r}")
        if not ids:
            # An empty list would silently disable the filter and run everything.
            parser.error("--ids did not contain any IDs")
    # ALWAYS check GPU and RAM before running
    if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
        sys.exit(1)
    queries = filter_queries(queries, args.tier, args.category, ids)
    if not queries:
        print("No queries match filters.")
        sys.exit(1)
    asyncio.run(run(queries, dry_run=args.dry_run))


if __name__ == "__main__":
    main()