Skip _reply_semaphore in no_inference mode
No GPU inference happens in this mode, so serialization is not needed. Without this, timed-out routing benchmark queries hold the semaphore and cascade-block all subsequent queries. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
agent.py
4
agent.py
@@ -2,7 +2,7 @@ import asyncio
 import json as _json_module
 import os
 import time
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, nullcontext
 from pathlib import Path

 from fastapi import FastAPI, BackgroundTasks, Request
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
     no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately

     Caller is responsible for scheduling _store_memory after consuming all chunks.
     """
-    async with _reply_semaphore:
+    async with (nullcontext() if no_inference else _reply_semaphore):
         t0 = time.monotonic()
         clean_message = message
         print(f"[agent] running: {clean_message[:80]!r}", flush=True)
Reference in New Issue
Block a user