voice benchmark: rename --dry-run → --no-inference, fix log extraction
- --no-inference applies to all tiers (not just complex)
- metadata key: dry_run → no_inference
- extract_tier_from_logs: forward iteration (not reversed), updated regex
- GPU check skipped when --no-inference
- Fix TypeError in misclassified print when actual=None

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,7 @@ Usage:
|
|||||||
python3 run_voice_benchmark.py [options]
|
python3 run_voice_benchmark.py [options]
|
||||||
python3 run_voice_benchmark.py --tier light|medium|complex
|
python3 run_voice_benchmark.py --tier light|medium|complex
|
||||||
python3 run_voice_benchmark.py --ids 1,2,3
|
python3 run_voice_benchmark.py --ids 1,2,3
|
||||||
python3 run_voice_benchmark.py --dry-run # complex queries use medium model
|
python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers
|
||||||
|
|
||||||
IMPORTANT: Always check GPU is free before running. Done automatically.
|
IMPORTANT: Always check GPU is free before running. Done automatically.
|
||||||
|
|
||||||
@@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str:
|
|||||||
|
|
||||||
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
||||||
before_lines = set(logs_before.splitlines())
|
before_lines = set(logs_before.splitlines())
|
||||||
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
|
||||||
for line in reversed(new_lines):
|
for line in new_lines:
|
||||||
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
|
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
|
||||||
if m:
|
if m:
|
||||||
return m.group(1).split()[0]
|
return m.group(1).split()[0]
|
||||||
return None
|
return None
|
||||||
@@ -222,14 +222,14 @@ async def post_to_adolf(
|
|||||||
client: httpx.AsyncClient,
|
client: httpx.AsyncClient,
|
||||||
query_id: int,
|
query_id: int,
|
||||||
text: str,
|
text: str,
|
||||||
dry_run: bool = False,
|
no_inference: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
payload = {
|
payload = {
|
||||||
"text": text,
|
"text": text,
|
||||||
"session_id": f"voice-bench-{query_id}",
|
"session_id": f"voice-bench-{query_id}",
|
||||||
"channel": "cli",
|
"channel": "cli",
|
||||||
"user_id": "benchmark",
|
"user_id": "benchmark",
|
||||||
"metadata": {"dry_run": dry_run, "benchmark": True, "voice": True},
|
"metadata": {"no_inference": no_inference, "benchmark": True, "voice": True},
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
||||||
@@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids):
|
|||||||
|
|
||||||
# ── Main run ───────────────────────────────────────────────────────────────────
|
# ── Main run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None:
|
async def run(queries: list[dict], no_inference: bool = False, save_audio: bool = False) -> None:
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
# Check Adolf
|
# Check Adolf
|
||||||
try:
|
try:
|
||||||
@@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
|
|||||||
total = len(queries)
|
total = len(queries)
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
dry_label = " [DRY-RUN]" if dry_run else ""
|
dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
|
||||||
print(f"Voice benchmark: {total} queries{dry_label}\n")
|
print(f"Voice benchmark: {total} queries{dry_label}\n")
|
||||||
print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}")
|
print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}")
|
||||||
print("─" * 100)
|
print("─" * 100)
|
||||||
@@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
|
|||||||
wer_count += 1
|
wer_count += 1
|
||||||
|
|
||||||
# Step 3: Send to Adolf
|
# Step 3: Send to Adolf
|
||||||
send_dry = dry_run and expected == "complex"
|
|
||||||
logs_before = get_log_tail(60)
|
logs_before = get_log_tail(60)
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry)
|
ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference)
|
||||||
if not ok_post:
|
if not ok_post:
|
||||||
print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}")
|
print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}")
|
||||||
results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript})
|
results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript})
|
||||||
@@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
|
|||||||
"original": original,
|
"original": original,
|
||||||
"transcript": transcript,
|
"transcript": transcript,
|
||||||
"elapsed": round(elapsed, 1),
|
"elapsed": round(elapsed, 1),
|
||||||
"dry_run": send_dry,
|
"no_inference": no_inference,
|
||||||
})
|
})
|
||||||
|
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
@@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
|
|||||||
if wrong:
|
if wrong:
|
||||||
print(f"\nMisclassified after voice ({len(wrong)}):")
|
print(f"\nMisclassified after voice ({len(wrong)}):")
|
||||||
for r in wrong:
|
for r in wrong:
|
||||||
print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}")
|
print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}")
|
||||||
|
|
||||||
high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3]
|
high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3]
|
||||||
if high_wer:
|
if high_wer:
|
||||||
@@ -402,14 +401,14 @@ def main():
|
|||||||
parser.add_argument("--tier", choices=["light", "medium", "complex"])
|
parser.add_argument("--tier", choices=["light", "medium", "complex"])
|
||||||
parser.add_argument("--category")
|
parser.add_argument("--category")
|
||||||
parser.add_argument("--ids", help="Comma-separated IDs")
|
parser.add_argument("--ids", help="Comma-separated IDs")
|
||||||
parser.add_argument("--dry-run", action="store_true",
|
parser.add_argument("--no-inference", action="store_true",
|
||||||
help="Complex queries use medium model for inference (no API cost)")
|
help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)")
|
||||||
parser.add_argument("--save-audio", action="store_true",
|
parser.add_argument("--save-audio", action="store_true",
|
||||||
help="Save synthesized WAV files to voice_audio/ directory")
|
help="Save synthesized WAV files to voice_audio/ directory")
|
||||||
parser.add_argument("--skip-gpu-check", action="store_true")
|
parser.add_argument("--skip-gpu-check", action="store_true")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
|
if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference):
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
queries = load_dataset()
|
queries = load_dataset()
|
||||||
@@ -419,7 +418,7 @@ def main():
|
|||||||
print("No queries match filters.")
|
print("No queries match filters.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio))
|
asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user