Rename dry_run → no_inference, extend to all tiers in agent.py
When no_inference=True, the routing decision is still captured but all LLM inference is skipped — the pipeline immediately yields a constant "I don't know". This also disables the fast-tool short-circuit, so the routing path always runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
21
agent.py
21
agent.py
@@ -431,13 +431,13 @@ async def _run_agent_pipeline(
|
|||||||
history: list[dict],
|
history: list[dict],
|
||||||
session_id: str,
|
session_id: str,
|
||||||
tier_override: str | None = None,
|
tier_override: str | None = None,
|
||||||
dry_run: bool = False,
|
no_inference: bool = False,
|
||||||
tier_capture: list | None = None,
|
tier_capture: list | None = None,
|
||||||
) -> AsyncGenerator[str, None]:
|
) -> AsyncGenerator[str, None]:
|
||||||
"""Core pipeline: pre-flight → routing → inference. Yields text chunks.
|
"""Core pipeline: pre-flight → routing → inference. Yields text chunks.
|
||||||
|
|
||||||
tier_override: "light" | "medium" | "complex" | None (auto-route)
|
tier_override: "light" | "medium" | "complex" | None (auto-route)
|
||||||
dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost)
|
no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
|
||||||
Caller is responsible for scheduling _store_memory after consuming all chunks.
|
Caller is responsible for scheduling _store_memory after consuming all chunks.
|
||||||
"""
|
"""
|
||||||
async with _reply_semaphore:
|
async with _reply_semaphore:
|
||||||
@@ -471,7 +471,7 @@ async def _run_agent_pipeline(
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Short-circuit: fast tool already has the answer
|
# Short-circuit: fast tool already has the answer
|
||||||
if fast_context and tier_override is None and not url_context:
|
if fast_context and tier_override is None and not url_context and not no_inference:
|
||||||
tier = "fast"
|
tier = "fast"
|
||||||
final_text = fast_context
|
final_text = fast_context
|
||||||
llm_elapsed = time.monotonic() - t0
|
llm_elapsed = time.monotonic() - t0
|
||||||
@@ -494,17 +494,14 @@ async def _run_agent_pipeline(
|
|||||||
light_reply = None
|
light_reply = None
|
||||||
print("[agent] URL in message → upgraded light→medium", flush=True)
|
print("[agent] URL in message → upgraded light→medium", flush=True)
|
||||||
|
|
||||||
# Dry-run: log as complex but infer with medium (no remote API call)
|
|
||||||
effective_tier = tier
|
|
||||||
if dry_run and tier == "complex":
|
|
||||||
effective_tier = "medium"
|
|
||||||
print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True)
|
|
||||||
else:
|
|
||||||
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
|
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
|
||||||
tier = effective_tier
|
|
||||||
if tier_capture is not None:
|
if tier_capture is not None:
|
||||||
tier_capture.append(tier)
|
tier_capture.append(tier)
|
||||||
|
|
||||||
|
if no_inference:
|
||||||
|
yield "I don't know"
|
||||||
|
return
|
||||||
|
|
||||||
if tier == "light":
|
if tier == "light":
|
||||||
final_text = light_reply
|
final_text = light_reply
|
||||||
llm_elapsed = time.monotonic() - t0
|
llm_elapsed = time.monotonic() - t0
|
||||||
@@ -594,7 +591,7 @@ async def run_agent_task(
|
|||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
meta = metadata or {}
|
meta = metadata or {}
|
||||||
dry_run = bool(meta.get("dry_run", False))
|
no_inference = bool(meta.get("no_inference", False))
|
||||||
is_benchmark = bool(meta.get("benchmark", False))
|
is_benchmark = bool(meta.get("benchmark", False))
|
||||||
|
|
||||||
history = _conversation_buffers.get(session_id, [])
|
history = _conversation_buffers.get(session_id, [])
|
||||||
@@ -602,7 +599,7 @@ async def run_agent_task(
|
|||||||
actual_tier = "unknown"
|
actual_tier = "unknown"
|
||||||
tier_capture: list = []
|
tier_capture: list = []
|
||||||
|
|
||||||
async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run, tier_capture=tier_capture):
|
async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture):
|
||||||
await _push_stream_chunk(session_id, chunk)
|
await _push_stream_chunk(session_id, chunk)
|
||||||
if final_text is None:
|
if final_text is None:
|
||||||
final_text = chunk
|
final_text = chunk
|
||||||
|
|||||||
Reference in New Issue
Block a user