From 3fb90ae0838ab588c97a17bffd5f12f2cdbd7433 Mon Sep 17 00:00:00 2001 From: alvis Date: Tue, 24 Mar 2026 07:40:07 +0000 Subject: [PATCH] Skip _reply_semaphore in no_inference mode No GPU inference happens in this mode, so serialization is not needed. Without this, timed-out routing benchmark queries hold the semaphore and cascade-block all subsequent queries. Co-Authored-By: Claude Sonnet 4.6 --- agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent.py b/agent.py index 7c650a4..54018aa 100644 --- a/agent.py +++ b/agent.py @@ -2,7 +2,7 @@ import asyncio import json as _json_module import os import time -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, nullcontext from pathlib import Path from fastapi import FastAPI, BackgroundTasks, Request @@ -440,7 +440,7 @@ async def _run_agent_pipeline( no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately Caller is responsible for scheduling _store_memory after consuming all chunks. """ - async with _reply_semaphore: + async with (nullcontext() if no_inference else _reply_semaphore): t0 = time.monotonic() clean_message = message print(f"[agent] running: {clean_message[:80]!r}", flush=True)