Merge pull request 'feat: rename --dry-run to --no-inference in run_benchmark.py' (#18) from feat/no-inference-benchmark into main
Reviewed-on: #18
This commit was merged in pull request #18.
This commit is contained in:
@@ -11,7 +11,7 @@ Usage:
|
|||||||
python3 run_benchmark.py --category <name>
|
python3 run_benchmark.py --category <name>
|
||||||
python3 run_benchmark.py --ids 1,2,3
|
python3 run_benchmark.py --ids 1,2,3
|
||||||
python3 run_benchmark.py --list-categories
|
python3 run_benchmark.py --list-categories
|
||||||
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost)
|
python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers
|
||||||
|
|
||||||
IMPORTANT: Always check GPU is free before running. This script does it automatically.
|
IMPORTANT: Always check GPU is free before running. This script does it automatically.
|
||||||
|
|
||||||
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
|||||||
before_lines = set(logs_before.splitlines())
|
before_lines = set(logs_before.splitlines())
|
||||||
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
||||||
for line in new_lines:
|
for line in new_lines:
|
||||||
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
|
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
|
||||||
if m:
|
if m:
|
||||||
tier_raw = m.group(1)
|
tier_raw = m.group(1)
|
||||||
# Normalise: "complex (dry-run)" → "complex"
|
# Normalise: "complex (no-inference)" → "complex"
|
||||||
return tier_raw.split()[0]
|
return tier_raw.split()[0]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -135,14 +135,14 @@ async def post_message(
|
|||||||
client: httpx.AsyncClient,
|
client: httpx.AsyncClient,
|
||||||
query_id: int,
|
query_id: int,
|
||||||
query: str,
|
query: str,
|
||||||
dry_run: bool = False,
|
no_inference: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
payload = {
|
payload = {
|
||||||
"text": query,
|
"text": query,
|
||||||
"session_id": f"benchmark-{query_id}",
|
"session_id": f"benchmark-{query_id}",
|
||||||
"channel": "cli",
|
"channel": "cli",
|
||||||
"user_id": "benchmark",
|
"user_id": "benchmark",
|
||||||
"metadata": {"dry_run": dry_run, "benchmark": True},
|
"metadata": {"no_inference": no_inference, "benchmark": True},
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
||||||
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
|
|||||||
|
|
||||||
# ── Main run ───────────────────────────────────────────────────────────────────
|
# ── Main run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
total = len(queries)
|
total = len(queries)
|
||||||
correct = 0
|
correct = 0
|
||||||
|
|
||||||
dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
|
dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
|
||||||
print(f"\nRunning {total} queries{dry_label}\n")
|
print(f"\nRunning {total} queries{dry_label}\n")
|
||||||
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
|
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
|
||||||
print("─" * 110)
|
print("─" * 110)
|
||||||
@@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
category = q["category"]
|
category = q["category"]
|
||||||
query_text = q["query"]
|
query_text = q["query"]
|
||||||
|
|
||||||
# In dry-run, complex queries still use complex classification (logged), but medium infers
|
|
||||||
send_dry = dry_run and expected == "complex"
|
|
||||||
session_id = f"benchmark-{qid}"
|
session_id = f"benchmark-{qid}"
|
||||||
|
|
||||||
print(f"{qid:>3} {expected:8} ", end="", flush=True)
|
print(f"{qid:>3} {expected:8} ", end="", flush=True)
|
||||||
@@ -206,7 +204,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
logs_before = get_log_tail(300)
|
logs_before = get_log_tail(300)
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
|
ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
|
||||||
if not ok_post:
|
if not ok_post:
|
||||||
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
|
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
|
||||||
results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
|
results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
|
||||||
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
"elapsed": round(elapsed, 1),
|
"elapsed": round(elapsed, 1),
|
||||||
"category": category,
|
"category": category,
|
||||||
"query": query_text,
|
"query": query_text,
|
||||||
"dry_run": send_dry,
|
"no_inference": no_inference,
|
||||||
})
|
})
|
||||||
|
|
||||||
print("─" * 110)
|
print("─" * 110)
|
||||||
@@ -281,9 +279,9 @@ def main():
|
|||||||
parser.add_argument("--ids", help="Comma-separated IDs")
|
parser.add_argument("--ids", help="Comma-separated IDs")
|
||||||
parser.add_argument("--list-categories", action="store_true")
|
parser.add_argument("--list-categories", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dry-run",
|
"--no-inference",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
|
help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-gpu-check",
|
"--skip-gpu-check",
|
||||||
@@ -302,7 +300,7 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
# ALWAYS check GPU and RAM before running
|
# ALWAYS check GPU and RAM before running
|
||||||
if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
|
if not preflight_checks(skip_gpu_check=args.no_inference):
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
ids = [int(i) for i in args.ids.split(",")] if args.ids else None
|
ids = [int(i) for i in args.ids.split(",")] if args.ids else None
|
||||||
@@ -311,7 +309,7 @@ def main():
|
|||||||
print("No queries match filters.")
|
print("No queries match filters.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
asyncio.run(run(queries, dry_run=args.dry_run))
|
asyncio.run(run(queries, no_inference=args.no_inference))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user