Skip _reply_semaphore in no_inference mode
No GPU inference happens in this mode, so serializing replies is not needed. Without this change, timed-out routing-benchmark queries hold the semaphore and cascade-block all subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
agent.py
4
agent.py
@@ -2,7 +2,7 @@ import asyncio
|
||||
import json as _json_module
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from contextlib import asynccontextmanager, nullcontext
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, BackgroundTasks, Request
|
||||
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
|
||||
no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
|
||||
Caller is responsible for scheduling _store_memory after consuming all chunks.
|
||||
"""
|
||||
async with _reply_semaphore:
|
||||
async with (nullcontext() if no_inference else _reply_semaphore):
|
||||
t0 = time.monotonic()
|
||||
clean_message = message
|
||||
print(f"[agent] running: {clean_message[:80]!r}", flush=True)
|
||||
|
||||
Reference in New Issue
Block a user