voice benchmark: rename --dry-run → --no-inference, fix log extraction

- --no-inference applies to all tiers (not just complex)
- metadata key: dry_run → no_inference
- extract_tier_from_logs: forward iteration (not reversed), updated regex
- GPU check skipped when --no-inference
- Fix TypeError in misclassified print when actual=None

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 07:58:05 +00:00
parent 4e6d3090c2
commit 887d4b8d90

View File

@@ -12,7 +12,7 @@ Usage:
python3 run_voice_benchmark.py [options] python3 run_voice_benchmark.py [options]
python3 run_voice_benchmark.py --tier light|medium|complex python3 run_voice_benchmark.py --tier light|medium|complex
python3 run_voice_benchmark.py --ids 1,2,3 python3 run_voice_benchmark.py --ids 1,2,3
python3 run_voice_benchmark.py --dry-run # complex queries use medium model python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers
IMPORTANT: Always check GPU is free before running. Done automatically. IMPORTANT: Always check GPU is free before running. Done automatically.
@@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str:
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
before_lines = set(logs_before.splitlines()) before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines] new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
for line in reversed(new_lines): for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m: if m:
return m.group(1).split()[0] return m.group(1).split()[0]
return None return None
@@ -222,14 +222,14 @@ async def post_to_adolf(
client: httpx.AsyncClient, client: httpx.AsyncClient,
query_id: int, query_id: int,
text: str, text: str,
dry_run: bool = False, no_inference: bool = False,
) -> bool: ) -> bool:
payload = { payload = {
"text": text, "text": text,
"session_id": f"voice-bench-{query_id}", "session_id": f"voice-bench-{query_id}",
"channel": "cli", "channel": "cli",
"user_id": "benchmark", "user_id": "benchmark",
"metadata": {"dry_run": dry_run, "benchmark": True, "voice": True}, "metadata": {"no_inference": no_inference, "benchmark": True, "voice": True},
} }
try: try:
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids):
# ── Main run ─────────────────────────────────────────────────────────────────── # ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None: async def run(queries: list[dict], no_inference: bool = False, save_audio: bool = False) -> None:
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
# Check Adolf # Check Adolf
try: try:
@@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
total = len(queries) total = len(queries)
results = [] results = []
dry_label = " [DRY-RUN]" if dry_run else "" dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
print(f"Voice benchmark: {total} queries{dry_label}\n") print(f"Voice benchmark: {total} queries{dry_label}\n")
print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}") print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}")
    print("─" * 100) print("─" * 100)
@@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
wer_count += 1 wer_count += 1
# Step 3: Send to Adolf # Step 3: Send to Adolf
send_dry = dry_run and expected == "complex"
logs_before = get_log_tail(60) logs_before = get_log_tail(60)
t0 = time.monotonic() t0 = time.monotonic()
ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry) ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference)
if not ok_post: if not ok_post:
print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}") print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}")
results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript}) results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript})
@@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
"original": original, "original": original,
"transcript": transcript, "transcript": transcript,
"elapsed": round(elapsed, 1), "elapsed": round(elapsed, 1),
"dry_run": send_dry, "no_inference": no_inference,
}) })
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
@@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
if wrong: if wrong:
print(f"\nMisclassified after voice ({len(wrong)}):") print(f"\nMisclassified after voice ({len(wrong)}):")
for r in wrong: for r in wrong:
print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}") print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}")
high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3] high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3]
if high_wer: if high_wer:
@@ -402,14 +401,14 @@ def main():
parser.add_argument("--tier", choices=["light", "medium", "complex"]) parser.add_argument("--tier", choices=["light", "medium", "complex"])
parser.add_argument("--category") parser.add_argument("--category")
parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--ids", help="Comma-separated IDs")
parser.add_argument("--dry-run", action="store_true", parser.add_argument("--no-inference", action="store_true",
help="Complex queries use medium model for inference (no API cost)") help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)")
parser.add_argument("--save-audio", action="store_true", parser.add_argument("--save-audio", action="store_true",
help="Save synthesized WAV files to voice_audio/ directory") help="Save synthesized WAV files to voice_audio/ directory")
parser.add_argument("--skip-gpu-check", action="store_true") parser.add_argument("--skip-gpu-check", action="store_true")
args = parser.parse_args() args = parser.parse_args()
if not preflight_checks(skip_gpu_check=args.skip_gpu_check): if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference):
sys.exit(1) sys.exit(1)
queries = load_dataset() queries = load_dataset()
@@ -419,7 +418,7 @@ def main():
print("No queries match filters.") print("No queries match filters.")
sys.exit(1) sys.exit(1)
asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio)) asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio))
if __name__ == "__main__": if __name__ == "__main__":