diff --git a/benchmarks/run_voice_benchmark.py b/benchmarks/run_voice_benchmark.py index 8382ab6..525f84d 100644 --- a/benchmarks/run_voice_benchmark.py +++ b/benchmarks/run_voice_benchmark.py @@ -12,7 +12,7 @@ Usage: python3 run_voice_benchmark.py [options] python3 run_voice_benchmark.py --tier light|medium|complex python3 run_voice_benchmark.py --ids 1,2,3 - python3 run_voice_benchmark.py --dry-run # complex queries use medium model + python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers IMPORTANT: Always check GPU is free before running. Done automatically. @@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str: def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: before_lines = set(logs_before.splitlines()) - new_lines = [l for l in logs_after.splitlines() if l not in before_lines] - for line in reversed(new_lines): - m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) + new_lines = [line for line in logs_after.splitlines() if line not in before_lines] + for line in reversed(new_lines): + m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line) if m: return m.group(1).split()[0] return None @@ -222,14 +222,14 @@ async def post_to_adolf( client: httpx.AsyncClient, query_id: int, text: str, - dry_run: bool = False, + no_inference: bool = False, ) -> bool: payload = { "text": text, "session_id": f"voice-bench-{query_id}", "channel": "cli", "user_id": "benchmark", - "metadata": {"dry_run": dry_run, "benchmark": True, "voice": True}, + "metadata": {"no_inference": no_inference, "benchmark": True, "voice": True}, } try: r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) @@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids): # ── Main run ─────────────────────────────────────────────────────────────────── -async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None: +async def run(queries: list[dict], no_inference: bool = False, save_audio: 
bool = False) -> None: async with httpx.AsyncClient() as client: # Check Adolf try: @@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal total = len(queries) results = [] - dry_label = " [DRY-RUN]" if dry_run else "" + dry_label = " [NO-INFERENCE: routing only]" if no_inference else "" print(f"Voice benchmark: {total} queries{dry_label}\n") print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}") print("─" * 100) @@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal wer_count += 1 # Step 3: Send to Adolf - send_dry = dry_run and expected == "complex" logs_before = get_log_tail(60) t0 = time.monotonic() - ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry) + ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference) if not ok_post: print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}") results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript}) @@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal "original": original, "transcript": transcript, "elapsed": round(elapsed, 1), - "dry_run": send_dry, + "no_inference": no_inference, }) await asyncio.sleep(0.5) @@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal if wrong: print(f"\nMisclassified after voice ({len(wrong)}):") for r in wrong: - print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}") + print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}") high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3] if high_wer: @@ -402,14 +401,14 @@ def main(): parser.add_argument("--tier", choices=["light", "medium", "complex"]) 
parser.add_argument("--category") parser.add_argument("--ids", help="Comma-separated IDs") - parser.add_argument("--dry-run", action="store_true", - help="Complex queries use medium model for inference (no API cost)") + parser.add_argument("--no-inference", action="store_true", + help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)") parser.add_argument("--save-audio", action="store_true", help="Save synthesized WAV files to voice_audio/ directory") parser.add_argument("--skip-gpu-check", action="store_true") args = parser.parse_args() - if not preflight_checks(skip_gpu_check=args.skip_gpu_check): + if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference): sys.exit(1) queries = load_dataset() @@ -419,7 +418,7 @@ def main(): print("No queries match filters.") sys.exit(1) - asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio)) + asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio)) if __name__ == "__main__":