Skip _reply_semaphore in no_inference mode

No GPU inference happens in this mode, so requests do not need to be
serialized through _reply_semaphore. Without this change, timed-out routing
benchmark queries keep holding the semaphore and cascade-block all
subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 07:40:07 +00:00
parent 4d37ac65b2
commit 3fb90ae083

View File

@@ -2,7 +2,7 @@ import asyncio
import json as _json_module
import os
import time
from contextlib import asynccontextmanager
from contextlib import asynccontextmanager, nullcontext
from pathlib import Path
from fastapi import FastAPI, BackgroundTasks, Request
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
Caller is responsible for scheduling _store_memory after consuming all chunks.
"""
async with _reply_semaphore:
async with (nullcontext() if no_inference else _reply_semaphore):
t0 = time.monotonic()
clean_message = message
print(f"[agent] running: {clean_message[:80]!r}", flush=True)