From 3fb90ae0838ab588c97a17bffd5f12f2cdbd7433 Mon Sep 17 00:00:00 2001
From: alvis <allogn@gmail.com>
Date: Tue, 24 Mar 2026 07:40:07 +0000
Subject: [PATCH] Skip _reply_semaphore in no_inference mode

No GPU inference happens in no_inference mode, so requests there do not
need to be serialized through _reply_semaphore. Previously, a routing
benchmark query that timed out kept holding the semaphore, which
cascade-blocked all subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 agent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agent.py b/agent.py
index 7c650a4..54018aa 100644
--- a/agent.py
+++ b/agent.py
@@ -2,7 +2,7 @@ import asyncio
 import json as _json_module
 import os
 import time
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, nullcontext
 from pathlib import Path
 
 from fastapi import FastAPI, BackgroundTasks, Request
@@ -440,7 +440,7 @@ async def _run_agent_pipeline(
     no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
     Caller is responsible for scheduling _store_memory after consuming all chunks.
     """
-    async with _reply_semaphore:
+    async with (nullcontext() if no_inference else _reply_semaphore):
         t0 = time.monotonic()
         clean_message = message
         print(f"[agent] running: {clean_message[:80]!r}", flush=True)