Compare commits

...

2 Commits

Author SHA1 Message Date
47a1166be6 Merge pull request 'feat: rename --dry-run to --no-inference in run_benchmark.py' (#18) from feat/no-inference-benchmark into main
Reviewed-on: #18
2026-03-24 07:26:44 +00:00
77db739819 Rename --dry-run to --no-inference, apply to all tiers in run_benchmark.py
No-inference mode now skips LLM for all tiers (not just complex),
GPU check is auto-skipped, and the metadata key matches agent.py.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 03:49:09 +00:00

View File

@@ -11,7 +11,7 @@ Usage:
python3 run_benchmark.py --category <name> python3 run_benchmark.py --category <name>
python3 run_benchmark.py --ids 1,2,3 python3 run_benchmark.py --ids 1,2,3
python3 run_benchmark.py --list-categories python3 run_benchmark.py --list-categories
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost) python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers
IMPORTANT: Always check GPU is free before running. This script does it automatically. IMPORTANT: Always check GPU is free before running. This script does it automatically.
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
before_lines = set(logs_before.splitlines()) before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines] new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
for line in new_lines: for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m: if m:
tier_raw = m.group(1) tier_raw = m.group(1)
# Normalise: "complex (dry-run)" → "complex" # Normalise: "complex (no-inference)" → "complex"
return tier_raw.split()[0] return tier_raw.split()[0]
return None return None
@@ -135,14 +135,14 @@ async def post_message(
client: httpx.AsyncClient, client: httpx.AsyncClient,
query_id: int, query_id: int,
query: str, query: str,
dry_run: bool = False, no_inference: bool = False,
) -> bool: ) -> bool:
payload = { payload = {
"text": query, "text": query,
"session_id": f"benchmark-{query_id}", "session_id": f"benchmark-{query_id}",
"channel": "cli", "channel": "cli",
"user_id": "benchmark", "user_id": "benchmark",
"metadata": {"dry_run": dry_run, "benchmark": True}, "metadata": {"no_inference": no_inference, "benchmark": True},
} }
try: try:
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
# ── Main run ─────────────────────────────────────────────────────────────────── # ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
results = [] results = []
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
total = len(queries) total = len(queries)
correct = 0 correct = 0
dry_label = " [DRY-RUN: complex→medium]" if dry_run else "" dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
print(f"\nRunning {total} queries{dry_label}\n") print(f"\nRunning {total} queries{dry_label}\n")
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY") print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
print("─" * 110) print("─" * 110)
@@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
category = q["category"] category = q["category"]
query_text = q["query"] query_text = q["query"]
# In dry-run, complex queries still use complex classification (logged), but medium infers
send_dry = dry_run and expected == "complex"
session_id = f"benchmark-{qid}" session_id = f"benchmark-{qid}"
print(f"{qid:>3} {expected:8} ", end="", flush=True) print(f"{qid:>3} {expected:8} ", end="", flush=True)
@@ -206,7 +204,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
logs_before = get_log_tail(300) logs_before = get_log_tail(300)
t0 = time.monotonic() t0 = time.monotonic()
ok_post = await post_message(client, qid, query_text, dry_run=send_dry) ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
if not ok_post: if not ok_post:
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}") print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
results.append({"id": qid, "expected": expected, "actual": None, "ok": False}) results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
"elapsed": round(elapsed, 1), "elapsed": round(elapsed, 1),
"category": category, "category": category,
"query": query_text, "query": query_text,
"dry_run": send_dry, "no_inference": no_inference,
}) })
print("─" * 110) print("─" * 110)
@@ -281,9 +279,9 @@ def main():
parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--ids", help="Comma-separated IDs")
parser.add_argument("--list-categories", action="store_true") parser.add_argument("--list-categories", action="store_true")
parser.add_argument( parser.add_argument(
"--dry-run", "--no-inference",
action="store_true", action="store_true",
help="For complex queries: route classification is tested but medium model is used for inference (no API cost)", help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
) )
parser.add_argument( parser.add_argument(
"--skip-gpu-check", "--skip-gpu-check",
@@ -302,7 +300,7 @@ def main():
return return
# ALWAYS check GPU and RAM before running # ALWAYS check GPU and RAM before running
if not preflight_checks(skip_gpu_check=args.skip_gpu_check): if not preflight_checks(skip_gpu_check=args.no_inference):
sys.exit(1) sys.exit(1)
ids = [int(i) for i in args.ids.split(",")] if args.ids else None ids = [int(i) for i in args.ids.split(",")] if args.ids else None
@@ -311,7 +309,7 @@ def main():
print("No queries match filters.") print("No queries match filters.")
sys.exit(1) sys.exit(1)
asyncio.run(run(queries, dry_run=args.dry_run)) asyncio.run(run(queries, no_inference=args.no_inference))
if __name__ == "__main__": if __name__ == "__main__":