voice benchmark: rename --dry-run → --no-inference, fix log extraction

- --no-inference applies to all tiers (not just complex)
- metadata key: dry_run → no_inference
- extract_tier_from_logs: forward iteration (not reversed), updated regex
- GPU check skipped when --no-inference
- Fix TypeError in misclassified print when actual=None

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 07:58:05 +00:00
parent 4e6d3090c2
commit 887d4b8d90

View File

@@ -12,7 +12,7 @@ Usage:
python3 run_voice_benchmark.py [options] python3 run_voice_benchmark.py [options]
python3 run_voice_benchmark.py --tier light|medium|complex python3 run_voice_benchmark.py --tier light|medium|complex
python3 run_voice_benchmark.py --ids 1,2,3 python3 run_voice_benchmark.py --ids 1,2,3
python3 run_voice_benchmark.py --dry-run # complex queries use medium model python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers
IMPORTANT: Always check GPU is free before running. Done automatically. IMPORTANT: Always check GPU is free before running. Done automatically.
@@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str:
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
before_lines = set(logs_before.splitlines()) before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines] new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
for line in reversed(new_lines): for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m: if m:
return m.group(1).split()[0] return m.group(1).split()[0]
return None return None
@@ -222,14 +222,14 @@ async def post_to_adolf(
client: httpx.AsyncClient, client: httpx.AsyncClient,
query_id: int, query_id: int,
text: str, text: str,
dry_run: bool = False, no_inference: bool = False,
) -> bool: ) -> bool:
payload = { payload = {
"text": text, "text": text,
"session_id": f"voice-bench-{query_id}", "session_id": f"voice-bench-{query_id}",
"channel": "cli", "channel": "cli",
"user_id": "benchmark", "user_id": "benchmark",
"metadata": {"dry_run": dry_run, "benchmark": True, "voice": True}, "metadata": {"no_inference": no_inference, "benchmark": True, "voice": True},
} }
try: try:
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids):
# ── Main run ─────────────────────────────────────────────────────────────────── # ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None: async def run(queries: list[dict], no_inference: bool = False, save_audio: bool = False) -> None:
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
# Check Adolf # Check Adolf
try: try:
@@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
total = len(queries) total = len(queries)
results = [] results = []
dry_label = " [DRY-RUN]" if dry_run else "" dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
print(f"Voice benchmark: {total} queries{dry_label}\n") print(f"Voice benchmark: {total} queries{dry_label}\n")
print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}") print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}")
    print("─" * 100) print("─" * 100)
@@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
wer_count += 1 wer_count += 1
# Step 3: Send to Adolf # Step 3: Send to Adolf
send_dry = dry_run and expected == "complex"
logs_before = get_log_tail(60) logs_before = get_log_tail(60)
t0 = time.monotonic() t0 = time.monotonic()
ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry) ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference)
if not ok_post: if not ok_post:
print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}") print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}")
results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript}) results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript})
@@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
"original": original, "original": original,
"transcript": transcript, "transcript": transcript,
"elapsed": round(elapsed, 1), "elapsed": round(elapsed, 1),
"dry_run": send_dry, "no_inference": no_inference,
}) })
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
@@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
if wrong: if wrong:
print(f"\nMisclassified after voice ({len(wrong)}):") print(f"\nMisclassified after voice ({len(wrong)}):")
for r in wrong: for r in wrong:
print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}") print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}")
high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3] high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3]
if high_wer: if high_wer:
@@ -402,14 +401,14 @@ def main():
parser.add_argument("--tier", choices=["light", "medium", "complex"]) parser.add_argument("--tier", choices=["light", "medium", "complex"])
parser.add_argument("--category") parser.add_argument("--category")
parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--ids", help="Comma-separated IDs")
parser.add_argument("--dry-run", action="store_true", parser.add_argument("--no-inference", action="store_true",
help="Complex queries use medium model for inference (no API cost)") help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)")
parser.add_argument("--save-audio", action="store_true", parser.add_argument("--save-audio", action="store_true",
help="Save synthesized WAV files to voice_audio/ directory") help="Save synthesized WAV files to voice_audio/ directory")
parser.add_argument("--skip-gpu-check", action="store_true") parser.add_argument("--skip-gpu-check", action="store_true")
args = parser.parse_args() args = parser.parse_args()
if not preflight_checks(skip_gpu_check=args.skip_gpu_check): if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference):
sys.exit(1) sys.exit(1)
queries = load_dataset() queries = load_dataset()
@@ -419,7 +418,7 @@ def main():
print("No queries match filters.") print("No queries match filters.")
sys.exit(1) sys.exit(1)
asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio)) asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio))
if __name__ == "__main__": if __name__ == "__main__":