From 77db73981915ebc2f4c0dce3b2616732e89fac3d Mon Sep 17 00:00:00 2001 From: alvis Date: Tue, 24 Mar 2026 03:49:09 +0000 Subject: [PATCH] Rename --dry-run to --no-inference, apply to all tiers in run_benchmark.py No-inference mode now skips LLM for all tiers (not just complex), GPU check is auto-skipped, and the metadata key matches agent.py. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/run_benchmark.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 6ea1af5..6142321 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -11,7 +11,7 @@ Usage: python3 run_benchmark.py --category python3 run_benchmark.py --ids 1,2,3 python3 run_benchmark.py --list-categories - python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost) + python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers IMPORTANT: Always check GPU is free before running. This script does it automatically. 
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: before_lines = set(logs_before.splitlines()) new_lines = [l for l in logs_after.splitlines() if l not in before_lines] for line in new_lines: - m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) + m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line) if m: tier_raw = m.group(1) - # Normalise: "complex (dry-run)" → "complex" + # Normalise: "complex (no-inference)" → "complex" return tier_raw.split()[0] return None @@ -135,14 +135,14 @@ async def post_message( client: httpx.AsyncClient, query_id: int, query: str, - dry_run: bool = False, + no_inference: bool = False, ) -> bool: payload = { "text": query, "session_id": f"benchmark-{query_id}", "channel": "cli", "user_id": "benchmark", - "metadata": {"dry_run": dry_run, "benchmark": True}, + "metadata": {"no_inference": no_inference, "benchmark": True}, } try: r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) @@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids): # ── Main run ─────────────────────────────────────────────────────────────────── -async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: +async def run(queries: list[dict], no_inference: bool = False) -> list[dict]: results = [] async with httpx.AsyncClient() as client: @@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: total = len(queries) correct = 0 - dry_label = " [DRY-RUN: complex→medium]" if dry_run else "" + dry_label = " [NO-INFERENCE: routing only]" if no_inference else "" print(f"\nRunning {total} queries{dry_label}\n") print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY") print("─" * 110) @@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: category = q["category"] query_text = q["query"] - # In dry-run, complex queries still use complex classification (logged), but medium 
infers - send_dry = dry_run and expected == "complex" session_id = f"benchmark-{qid}" print(f"{qid:>3} {expected:8} ", end="", flush=True) @@ -206,7 +204,7 @@ logs_before = get_log_tail(300) t0 = time.monotonic() - ok_post = await post_message(client, qid, query_text, dry_run=send_dry) + ok_post = await post_message(client, qid, query_text, no_inference=no_inference) if not ok_post: print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}") results.append({"id": qid, "expected": expected, "actual": None, "ok": False}) @@ -245,7 +243,7 @@ "elapsed": round(elapsed, 1), "category": category, "query": query_text, - "dry_run": send_dry, + "no_inference": no_inference, }) print("─" * 110) @@ -281,9 +279,9 @@ parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--list-categories", action="store_true") parser.add_argument( - "--dry-run", + "--no-inference", action="store_true", - help="For complex queries: route classification is tested but medium model is used for inference (no API cost)", + help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)", ) parser.add_argument( "--skip-gpu-check", action="store_true", @@ -302,7 +300,7 @@ return # ALWAYS check GPU and RAM before running - if not preflight_checks(skip_gpu_check=args.skip_gpu_check): + if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference): sys.exit(1) ids = [int(i) for i in args.ids.split(",")] if args.ids else None @@ -311,7 +309,7 @@ print("No queries match filters.") sys.exit(1) - asyncio.run(run(queries, dry_run=args.dry_run)) + asyncio.run(run(queries, no_inference=args.no_inference)) if __name__ == "__main__":