Compare commits
2 Commits
74e5b1758d
...
47a1166be6
| Author | SHA1 | Date | |
|---|---|---|---|
| 47a1166be6 | |||
| 77db739819 |
@@ -11,7 +11,7 @@ Usage:
|
||||
python3 run_benchmark.py --category <name>
|
||||
python3 run_benchmark.py --ids 1,2,3
|
||||
python3 run_benchmark.py --list-categories
|
||||
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost)
|
||||
python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers
|
||||
|
||||
IMPORTANT: Always check that the GPU is free before running. This script does so automatically.
|
||||
|
||||
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
||||
before_lines = set(logs_before.splitlines())
|
||||
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
||||
for line in new_lines:
|
||||
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
|
||||
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
|
||||
if m:
|
||||
tier_raw = m.group(1)
|
||||
# Normalise: "complex (dry-run)" → "complex"
|
||||
# Normalise: "complex (no-inference)" → "complex"
|
||||
return tier_raw.split()[0]
|
||||
return None
|
||||
|
||||
@@ -135,14 +135,14 @@ async def post_message(
|
||||
client: httpx.AsyncClient,
|
||||
query_id: int,
|
||||
query: str,
|
||||
dry_run: bool = False,
|
||||
no_inference: bool = False,
|
||||
) -> bool:
|
||||
payload = {
|
||||
"text": query,
|
||||
"session_id": f"benchmark-{query_id}",
|
||||
"channel": "cli",
|
||||
"user_id": "benchmark",
|
||||
"metadata": {"dry_run": dry_run, "benchmark": True},
|
||||
"metadata": {"no_inference": no_inference, "benchmark": True},
|
||||
}
|
||||
try:
|
||||
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
||||
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
|
||||
|
||||
# ── Main run ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
||||
async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
|
||||
results = []
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
||||
total = len(queries)
|
||||
correct = 0
|
||||
|
||||
dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
|
||||
dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
|
||||
print(f"\nRunning {total} queries{dry_label}\n")
|
||||
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
|
||||
print("─" * 110)
|
||||
@@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
||||
category = q["category"]
|
||||
query_text = q["query"]
|
||||
|
||||
# In dry-run, complex queries still use complex classification (logged), but medium infers
|
||||
send_dry = dry_run and expected == "complex"
|
||||
session_id = f"benchmark-{qid}"
|
||||
|
||||
print(f"{qid:>3} {expected:8} ", end="", flush=True)
|
||||
@@ -206,7 +204,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
||||
logs_before = get_log_tail(300)
|
||||
t0 = time.monotonic()
|
||||
|
||||
ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
|
||||
ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
|
||||
if not ok_post:
|
||||
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
|
||||
results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
|
||||
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
||||
"elapsed": round(elapsed, 1),
|
||||
"category": category,
|
||||
"query": query_text,
|
||||
"dry_run": send_dry,
|
||||
"no_inference": no_inference,
|
||||
})
|
||||
|
||||
print("─" * 110)
|
||||
@@ -281,9 +279,9 @@ def main():
|
||||
parser.add_argument("--ids", help="Comma-separated IDs")
|
||||
parser.add_argument("--list-categories", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
"--no-inference",
|
||||
action="store_true",
|
||||
help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
|
||||
help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-gpu-check",
|
||||
@@ -302,7 +300,7 @@ def main():
|
||||
return
|
||||
|
||||
# ALWAYS check GPU and RAM before running
|
||||
if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
|
||||
if not preflight_checks(skip_gpu_check=args.no_inference):
|
||||
sys.exit(1)
|
||||
|
||||
ids = [int(i) for i in args.ids.split(",")] if args.ids else None
|
||||
@@ -311,7 +309,7 @@ def main():
|
||||
print("No queries match filters.")
|
||||
sys.exit(1)
|
||||
|
||||
asyncio.run(run(queries, dry_run=args.dry_run))
|
||||
asyncio.run(run(queries, no_inference=args.no_inference))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user