diff --git a/benchmarks/run_voice_benchmark.py b/benchmarks/run_voice_benchmark.py index 8382ab6..525f84d 100644 --- a/benchmarks/run_voice_benchmark.py +++ b/benchmarks/run_voice_benchmark.py @@ -12,7 +12,7 @@ Usage: python3 run_voice_benchmark.py [options] python3 run_voice_benchmark.py --tier light|medium|complex python3 run_voice_benchmark.py --ids 1,2,3 - python3 run_voice_benchmark.py --dry-run # complex queries use medium model + python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers IMPORTANT: Always check GPU is free before running. Done automatically. @@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str: def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: before_lines = set(logs_before.splitlines()) - new_lines = [l for l in logs_after.splitlines() if l not in before_lines] - for line in reversed(new_lines): - m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) + new_lines = [line for line in logs_after.splitlines() if line not in before_lines] + for line in reversed(new_lines): + m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line) if m: return m.group(1).split()[0] return None @@ -222,14 +222,14 @@ async def post_to_adolf( client: httpx.AsyncClient, query_id: int, text: str, - dry_run: bool = False, + no_inference: bool = False, ) -> bool: payload = { "text": text, "session_id": f"voice-bench-{query_id}", "channel": "cli", "user_id": "benchmark", - "metadata": {"dry_run": dry_run, "benchmark": True, "voice": True}, + "metadata": {"no_inference": no_inference, "benchmark": True, "voice": True}, } try: r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) @@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids): # ── Main run ─────────────────────────────────────────────────────────────────── -async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None: +async def run(queries: list[dict], no_inference: bool = False, save_audio: 
bool = False) -> None: async with httpx.AsyncClient() as client: # Check Adolf try: @@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal total = len(queries) results = [] - dry_label = " [DRY-RUN]" if dry_run else "" + dry_label = " [NO-INFERENCE: routing only]" if no_inference else "" print(f"Voice benchmark: {total} queries{dry_label}\n") print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}") print("─" * 100) @@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal wer_count += 1 # Step 3: Send to Adolf - send_dry = dry_run and expected == "complex" logs_before = get_log_tail(60) t0 = time.monotonic() - ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry) + ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference) if not ok_post: print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}") results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript}) @@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal "original": original, "transcript": transcript, "elapsed": round(elapsed, 1), - "dry_run": send_dry, + "no_inference": no_inference, }) await asyncio.sleep(0.5) @@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal if wrong: print(f"\nMisclassified after voice ({len(wrong)}):") for r in wrong: - print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}") + print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}") high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3] if high_wer: @@ -402,14 +401,14 @@ def main(): parser.add_argument("--tier", choices=["light", "medium", "complex"]) 
parser.add_argument("--category") parser.add_argument("--ids", help="Comma-separated IDs") - parser.add_argument("--dry-run", action="store_true", - help="Complex queries use medium model for inference (no API cost)") + parser.add_argument("--no-inference", action="store_true", + help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)") parser.add_argument("--save-audio", action="store_true", help="Save synthesized WAV files to voice_audio/ directory") parser.add_argument("--skip-gpu-check", action="store_true") args = parser.parse_args() - if not preflight_checks(skip_gpu_check=args.skip_gpu_check): + if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference): sys.exit(1) queries = load_dataset() @@ -419,7 +418,7 @@ def main(): print("No queries match filters.") sys.exit(1) - asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio)) + asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio)) if __name__ == "__main__":