From 77db73981915ebc2f4c0dce3b2616732e89fac3d Mon Sep 17 00:00:00 2001 From: alvis Date: Tue, 24 Mar 2026 03:49:09 +0000 Subject: [PATCH] Rename --dry-run to --no-inference, apply to all tiers in run_benchmark.py No-inference mode now skips LLM for all tiers (not just complex), GPU check is auto-skipped, and the metadata key matches agent.py. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/run_benchmark.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 6ea1af5..6142321 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -11,7 +11,7 @@ Usage: python3 run_benchmark.py --category python3 run_benchmark.py --ids 1,2,3 python3 run_benchmark.py --list-categories - python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost) + python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers IMPORTANT: Always check GPU is free before running. This script does it automatically. 
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: before_lines = set(logs_before.splitlines()) new_lines = [l for l in logs_after.splitlines() if l not in before_lines] for line in new_lines: - m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) + m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line) if m: tier_raw = m.group(1) - # Normalise: "complex (dry-run)" → "complex" + # Normalise: "complex (no-inference)" → "complex" return tier_raw.split()[0] return None @@ -135,14 +135,14 @@ async def post_message( client: httpx.AsyncClient, query_id: int, query: str, - dry_run: bool = False, + no_inference: bool = False, ) -> bool: payload = { "text": query, "session_id": f"benchmark-{query_id}", "channel": "cli", "user_id": "benchmark", - "metadata": {"dry_run": dry_run, "benchmark": True}, + "metadata": {"no_inference": no_inference, "benchmark": True}, } try: r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) @@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids): # ── Main run ─────────────────────────────────────────────────────────────────── -async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: +async def run(queries: list[dict], no_inference: bool = False) -> list[dict]: results = [] async with httpx.AsyncClient() as client: @@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: total = len(queries) correct = 0 - dry_label = " [DRY-RUN: complex→medium]" if dry_run else "" + dry_label = " [NO-INFERENCE: routing only]" if no_inference else "" print(f"\nRunning {total} queries{dry_label}\n") print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY") print("─" * 110) @@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: category = q["category"] query_text = q["query"] - # In dry-run, complex queries still use complex classification (logged), but medium 
infers - send_dry = dry_run and expected == "complex" session_id = f"benchmark-{qid}" print(f"{qid:>3} {expected:8} ", end="", flush=True) @@ -206,7 +204,7 @@ logs_before = get_log_tail(300) t0 = time.monotonic() - ok_post = await post_message(client, qid, query_text, dry_run=send_dry) + ok_post = await post_message(client, qid, query_text, no_inference=no_inference) if not ok_post: print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}") results.append({"id": qid, "expected": expected, "actual": None, "ok": False}) @@ -245,7 +243,7 @@ "elapsed": round(elapsed, 1), "category": category, "query": query_text, - "dry_run": send_dry, + "no_inference": no_inference, }) print("─" * 110) @@ -281,9 +279,9 @@ parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--list-categories", action="store_true") parser.add_argument( - "--dry-run", + "--no-inference", action="store_true", - help="For complex queries: route classification is tested but medium model is used for inference (no API cost)", + help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)", ) parser.add_argument( "--skip-gpu-check", action="store_true", @@ -302,7 +300,7 @@ return # ALWAYS check GPU and RAM before running - if not preflight_checks(skip_gpu_check=args.skip_gpu_check): + if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference): sys.exit(1) ids = [int(i) for i in args.ids.split(",")] if args.ids else None @@ -311,7 +309,7 @@ print("No queries match filters.") sys.exit(1) - asyncio.run(run(queries, dry_run=args.dry_run)) + asyncio.run(run(queries, no_inference=args.no_inference)) if __name__ == "__main__":