Skip _reply_semaphore in no_inference mode

No GPU inference happens in this mode, so serializing requests through _reply_semaphore is not needed. Without this change, routing benchmark queries that time out keep holding the semaphore and cascade-block all subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:40:07 +00:00
parent 4d37ac65b2
commit 3fb90ae083
1 changed files with 2 additions and 2 deletions
--- a/agent.py
+++ b/agent.py
@@ -2,7 +2,7 @@ import asyncio
 import json as _json_module
 import os
 import time
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, nullcontext
 from pathlib import Path

 from fastapi import FastAPI, BackgroundTasks, Request
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
    no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
    Caller is responsible for scheduling _store_memory after consuming all chunks.
    """
-    async with _reply_semaphore:
+    async with (nullcontext() if no_inference else _reply_semaphore):
        t0 = time.monotonic()
        clean_message = message
        print(f"[agent] running: {clean_message[:80]!r}", flush=True)