Skip _reply_semaphore in no_inference mode

No GPU inference happens in this mode, so serialization is not needed.
Without this, timed-out routing benchmark queries hold the semaphore
and cascade-block all subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 07:40:07 +00:00
parent 4d37ac65b2
commit 3fb90ae083

View File

@@ -2,7 +2,7 @@ import asyncio
 import json as _json_module
 import os
 import time
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, nullcontext
 from pathlib import Path
 from fastapi import FastAPI, BackgroundTasks, Request
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
     no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
     Caller is responsible for scheduling _store_memory after consuming all chunks.
     """
-    async with _reply_semaphore:
+    async with (nullcontext() if no_inference else _reply_semaphore):
         t0 = time.monotonic()
         clean_message = message
         print(f"[agent] running: {clean_message[:80]!r}", flush=True)