Introduce FastTools: pre-flight classifier + context enrichment

New fast_tools.py module:
- FastTool base class (matches + run interface)
- RealTimeSearchTool: SearXNG search for weather/news/prices/scores
- FastToolRunner: classifier that checks all tools, runs matching
  ones concurrently and returns combined context

Router accepts FastToolRunner; any_matches() forces medium tier
before LLM classification (replaces _MEDIUM_FORCE_PATTERNS regex).

agent.py: _REALTIME_RE and _searxng_search_async removed; pre-flight
gather now includes fast_tool_runner.run_matching() alongside URL
fetch and memory retrieval.

To add a new fast tool: subclass FastTool, add to the list in agent.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alvis
2026-03-13 05:18:44 +00:00
parent 436299f7e2
commit f5fc2e9bfb
4 changed files with 145 additions and 72 deletions

View File

@@ -12,16 +12,6 @@ import httpx as _httpx
_URL_RE = _re.compile(r'https?://[^\s<>"\']+')
# Queries that need live data — trigger pre-search enrichment for medium tier
_REALTIME_RE = _re.compile(
r"\b(weather|forecast|temperature|rain(ing)?|snow(ing)?|humidity|wind speed"
r"|today.?s news|breaking news|latest news|news today|current events"
r"|bitcoin price|crypto price|stock price|exchange rate"
r"|right now|currently|at the moment|live score|score now|score today"
r"|open now|hours today|is .+ open)\b",
_re.IGNORECASE,
)
def _extract_urls(text: str) -> list[str]:
    """Return every http(s) URL found in *text*, in order of appearance."""
    return [match.group(0) for match in _URL_RE.finditer(text)]
@@ -34,6 +24,7 @@ from langchain_core.tools import Tool
from vram_manager import VRAMManager
from router import Router
from agent_factory import build_medium_agent, build_complex_agent
from fast_tools import FastToolRunner, RealTimeSearchTool
import channels
# Bifrost gateway — all LLM inference goes through here
@@ -98,29 +89,6 @@ async def _fetch_urls_from_message(message: str) -> str:
return "User's message contains URLs. Fetched content:\n\n" + "\n\n".join(parts)
async def _searxng_search_async(query: str) -> str:
    """Run a SearXNG search and return top result snippets as text for prompt injection.

    Kept short (snippets only) so medium model context stays within streaming timeout.

    Returns "" when the search succeeds but has no results, and an
    "[search error: ...]" marker string on any request/parse failure —
    callers check for that prefix instead of catching exceptions.
    """
    try:
        # One-shot client per call; the 15 s cap bounds pre-flight enrichment time.
        async with _httpx.AsyncClient(timeout=15) as client:
            r = await client.get(
                f"{SEARXNG_URL}/search",
                params={"q": query, "format": "json"},
            )
            r.raise_for_status()
            # Keep only the top 4 hits to limit injected prompt size.
            items = r.json().get("results", [])[:4]
    except Exception as e:
        # Best-effort: a search failure must not break the reply pipeline,
        # so it is reported in-band as a marker string.
        return f"[search error: {e}]"
    if not items:
        return ""
    lines = [f"Web search results for: {query}\n"]
    for i, item in enumerate(items, 1):
        title = item.get("title", "")
        url = item.get("url", "")
        snippet = item.get("content", "")[:400]  # cap each snippet at 400 chars
        lines.append(f"[{i}] {title}\nURL: {url}\n{snippet}\n")
    return "\n".join(lines)
# /no_think at the start of the system prompt disables qwen3 chain-of-thought.
# create_deep_agent prepends our system_prompt before BASE_AGENT_PROMPT, so
@@ -151,6 +119,11 @@ mcp_client = None
# MCP memory tool handles; None until bound — presumably during lifespan
# startup alongside the MCP connections, TODO confirm in lifespan().
_memory_add_tool = None
_memory_search_tool = None

# Fast tools run before the LLM — classifier + context enricher.
# To add a new fast tool, append another FastTool instance to this list.
_fast_tool_runner = FastToolRunner([
    RealTimeSearchTool(searxng_url=SEARXNG_URL),
])

# GPU mutex: one LLM inference at a time
_reply_semaphore = asyncio.Semaphore(1)
@@ -188,7 +161,7 @@ async def lifespan(app: FastAPI):
)
vram_manager = VRAMManager(base_url=OLLAMA_BASE_URL)
router = Router(model=router_model)
router = Router(model=router_model, fast_tool_runner=_fast_tool_runner)
mcp_connections = {
"openmemory": {"transport": "sse", "url": f"{OPENMEMORY_URL}/sse"},
@@ -413,33 +386,24 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
history = _conversation_buffers.get(session_id, [])
print(f"[agent] running: {clean_message[:80]!r}", flush=True)
# Fetch URL content, memories, and (for real-time queries) web search — all IO-bound
is_realtime = bool(_REALTIME_RE.search(clean_message))
if is_realtime:
url_context, memories, search_context = await asyncio.gather(
_fetch_urls_from_message(clean_message),
_retrieve_memories(clean_message, session_id),
_searxng_search_async(clean_message),
)
if search_context and not search_context.startswith("[search error"):
print(f"[agent] pre-search: {len(search_context)} chars for real-time query", flush=True)
else:
search_context = ""
else:
url_context, memories = await asyncio.gather(
_fetch_urls_from_message(clean_message),
_retrieve_memories(clean_message, session_id),
)
search_context = ""
# Fetch URL content, memories, and fast-tool context concurrently — all IO-bound
url_context, memories, fast_context = await asyncio.gather(
_fetch_urls_from_message(clean_message),
_retrieve_memories(clean_message, session_id),
_fast_tool_runner.run_matching(clean_message),
)
if url_context:
print(f"[agent] crawl4ai: {len(url_context)} chars fetched from message URLs", flush=True)
if fast_context:
names = _fast_tool_runner.matching_names(clean_message)
print(f"[agent] fast_tools={names}: {len(fast_context)} chars injected", flush=True)
# Build enriched history: memories + url_context + search_context for ALL tiers
# Build enriched history: memories + url_context + fast_context for ALL tiers
enriched_history = list(history)
if url_context:
enriched_history = [{"role": "system", "content": url_context}] + enriched_history
if search_context:
enriched_history = [{"role": "system", "content": search_context}] + enriched_history
if fast_context:
enriched_history = [{"role": "system", "content": fast_context}] + enriched_history
if memories:
enriched_history = [{"role": "system", "content": memories}] + enriched_history
@@ -467,8 +431,8 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
system_prompt = system_prompt + "\n\n" + memories
if url_context:
system_prompt = system_prompt + "\n\n" + url_context
if search_context:
system_prompt = system_prompt + "\n\nLive web search results (use these to answer):\n\n" + search_context
if fast_context:
system_prompt = system_prompt + "\n\nLive web search results (use these to answer):\n\n" + fast_context
# Stream tokens directly — filter out qwen3 <think> blocks
in_think = False