diff --git a/agent.py b/agent.py index b532c58..fed4af7 100644 --- a/agent.py +++ b/agent.py @@ -431,13 +431,13 @@ async def _run_agent_pipeline( history: list[dict], session_id: str, tier_override: str | None = None, - dry_run: bool = False, + no_inference: bool = False, tier_capture: list | None = None, ) -> AsyncGenerator[str, None]: """Core pipeline: pre-flight → routing → inference. Yields text chunks. tier_override: "light" | "medium" | "complex" | None (auto-route) - dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost) + no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately Caller is responsible for scheduling _store_memory after consuming all chunks. """ async with _reply_semaphore: @@ -471,7 +471,7 @@ async def _run_agent_pipeline( try: # Short-circuit: fast tool already has the answer - if fast_context and tier_override is None and not url_context: + if fast_context and tier_override is None and not url_context and not no_inference: tier = "fast" final_text = fast_context llm_elapsed = time.monotonic() - t0 @@ -494,17 +494,14 @@ async def _run_agent_pipeline( light_reply = None print("[agent] URL in message → upgraded light→medium", flush=True) - # Dry-run: log as complex but infer with medium (no remote API call) - effective_tier = tier - if dry_run and tier == "complex": - effective_tier = "medium" - print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True) - else: - print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True) - tier = effective_tier + print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True) if tier_capture is not None: tier_capture.append(tier) + if no_inference: + yield "I don't know" + return + if tier == "light": final_text = light_reply llm_elapsed = time.monotonic() - t0 @@ -594,7 +591,7 @@ async def run_agent_task( t0 = time.monotonic() meta = metadata or {} - dry_run = bool(meta.get("dry_run", False)) + no_inference = bool(meta.get("no_inference", False)) is_benchmark = bool(meta.get("benchmark", False)) history = _conversation_buffers.get(session_id, []) @@ -602,7 +599,7 @@ async def run_agent_task( actual_tier = "unknown" tier_capture: list = [] - async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run, tier_capture=tier_capture): + async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture): await _push_stream_chunk(session_id, chunk) if final_text is None: final_text = chunk