Introduce FastTools: pre-flight classifier + context enrichment
New fast_tools.py module: - FastTool base class (matches + run interface) - RealTimeSearchTool: SearXNG search for weather/news/prices/scores - FastToolRunner: classifier that checks all tools, runs matching ones concurrently and returns combined context Router accepts FastToolRunner; any_matches() forces medium tier before LLM classification (replaces _MEDIUM_FORCE_PATTERNS regex). agent.py: _REALTIME_RE and _searxng_search_async removed; pre-flight gather now includes fast_tool_runner.run_matching() alongside URL fetch and memory retrieval. To add a new fast tool: subclass FastTool, add to the list in agent.py. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,6 @@ WORKDIR /app
|
|||||||
RUN pip install --no-cache-dir deepagents langchain-openai langgraph \
|
RUN pip install --no-cache-dir deepagents langchain-openai langgraph \
|
||||||
fastapi uvicorn langchain-mcp-adapters langchain-community httpx
|
fastapi uvicorn langchain-mcp-adapters langchain-community httpx
|
||||||
|
|
||||||
COPY agent.py channels.py vram_manager.py router.py agent_factory.py hello_world.py .
|
COPY agent.py channels.py vram_manager.py router.py agent_factory.py fast_tools.py hello_world.py .
|
||||||
|
|
||||||
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|||||||
72
agent.py
72
agent.py
@@ -12,16 +12,6 @@ import httpx as _httpx
|
|||||||
|
|
||||||
_URL_RE = _re.compile(r'https?://[^\s<>"\']+')
|
_URL_RE = _re.compile(r'https?://[^\s<>"\']+')
|
||||||
|
|
||||||
# Queries that need live data — trigger pre-search enrichment for medium tier
|
|
||||||
_REALTIME_RE = _re.compile(
|
|
||||||
r"\b(weather|forecast|temperature|rain(ing)?|snow(ing)?|humidity|wind speed"
|
|
||||||
r"|today.?s news|breaking news|latest news|news today|current events"
|
|
||||||
r"|bitcoin price|crypto price|stock price|exchange rate"
|
|
||||||
r"|right now|currently|at the moment|live score|score now|score today"
|
|
||||||
r"|open now|hours today|is .+ open)\b",
|
|
||||||
_re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_urls(text: str) -> list[str]:
|
def _extract_urls(text: str) -> list[str]:
|
||||||
return _URL_RE.findall(text)
|
return _URL_RE.findall(text)
|
||||||
@@ -34,6 +24,7 @@ from langchain_core.tools import Tool
|
|||||||
from vram_manager import VRAMManager
|
from vram_manager import VRAMManager
|
||||||
from router import Router
|
from router import Router
|
||||||
from agent_factory import build_medium_agent, build_complex_agent
|
from agent_factory import build_medium_agent, build_complex_agent
|
||||||
|
from fast_tools import FastToolRunner, RealTimeSearchTool
|
||||||
import channels
|
import channels
|
||||||
|
|
||||||
# Bifrost gateway — all LLM inference goes through here
|
# Bifrost gateway — all LLM inference goes through here
|
||||||
@@ -98,29 +89,6 @@ async def _fetch_urls_from_message(message: str) -> str:
|
|||||||
return "User's message contains URLs. Fetched content:\n\n" + "\n\n".join(parts)
|
return "User's message contains URLs. Fetched content:\n\n" + "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
async def _searxng_search_async(query: str) -> str:
|
|
||||||
"""Run a SearXNG search and return top result snippets as text for prompt injection.
|
|
||||||
Kept short (snippets only) so medium model context stays within streaming timeout."""
|
|
||||||
try:
|
|
||||||
async with _httpx.AsyncClient(timeout=15) as client:
|
|
||||||
r = await client.get(
|
|
||||||
f"{SEARXNG_URL}/search",
|
|
||||||
params={"q": query, "format": "json"},
|
|
||||||
)
|
|
||||||
r.raise_for_status()
|
|
||||||
items = r.json().get("results", [])[:4]
|
|
||||||
except Exception as e:
|
|
||||||
return f"[search error: {e}]"
|
|
||||||
if not items:
|
|
||||||
return ""
|
|
||||||
lines = [f"Web search results for: {query}\n"]
|
|
||||||
for i, item in enumerate(items, 1):
|
|
||||||
title = item.get("title", "")
|
|
||||||
url = item.get("url", "")
|
|
||||||
snippet = item.get("content", "")[:400]
|
|
||||||
lines.append(f"[{i}] {title}\nURL: {url}\n{snippet}\n")
|
|
||||||
return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
# /no_think at the start of the system prompt disables qwen3 chain-of-thought.
|
# /no_think at the start of the system prompt disables qwen3 chain-of-thought.
|
||||||
# create_deep_agent prepends our system_prompt before BASE_AGENT_PROMPT, so
|
# create_deep_agent prepends our system_prompt before BASE_AGENT_PROMPT, so
|
||||||
@@ -151,6 +119,11 @@ mcp_client = None
|
|||||||
_memory_add_tool = None
|
_memory_add_tool = None
|
||||||
_memory_search_tool = None
|
_memory_search_tool = None
|
||||||
|
|
||||||
|
# Fast tools run before the LLM — classifier + context enricher
|
||||||
|
_fast_tool_runner = FastToolRunner([
|
||||||
|
RealTimeSearchTool(searxng_url=SEARXNG_URL),
|
||||||
|
])
|
||||||
|
|
||||||
# GPU mutex: one LLM inference at a time
|
# GPU mutex: one LLM inference at a time
|
||||||
_reply_semaphore = asyncio.Semaphore(1)
|
_reply_semaphore = asyncio.Semaphore(1)
|
||||||
|
|
||||||
@@ -188,7 +161,7 @@ async def lifespan(app: FastAPI):
|
|||||||
)
|
)
|
||||||
|
|
||||||
vram_manager = VRAMManager(base_url=OLLAMA_BASE_URL)
|
vram_manager = VRAMManager(base_url=OLLAMA_BASE_URL)
|
||||||
router = Router(model=router_model)
|
router = Router(model=router_model, fast_tool_runner=_fast_tool_runner)
|
||||||
|
|
||||||
mcp_connections = {
|
mcp_connections = {
|
||||||
"openmemory": {"transport": "sse", "url": f"{OPENMEMORY_URL}/sse"},
|
"openmemory": {"transport": "sse", "url": f"{OPENMEMORY_URL}/sse"},
|
||||||
@@ -413,33 +386,24 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
|||||||
history = _conversation_buffers.get(session_id, [])
|
history = _conversation_buffers.get(session_id, [])
|
||||||
print(f"[agent] running: {clean_message[:80]!r}", flush=True)
|
print(f"[agent] running: {clean_message[:80]!r}", flush=True)
|
||||||
|
|
||||||
# Fetch URL content, memories, and (for real-time queries) web search — all IO-bound
|
# Fetch URL content, memories, and fast-tool context concurrently — all IO-bound
|
||||||
is_realtime = bool(_REALTIME_RE.search(clean_message))
|
url_context, memories, fast_context = await asyncio.gather(
|
||||||
if is_realtime:
|
|
||||||
url_context, memories, search_context = await asyncio.gather(
|
|
||||||
_fetch_urls_from_message(clean_message),
|
_fetch_urls_from_message(clean_message),
|
||||||
_retrieve_memories(clean_message, session_id),
|
_retrieve_memories(clean_message, session_id),
|
||||||
_searxng_search_async(clean_message),
|
_fast_tool_runner.run_matching(clean_message),
|
||||||
)
|
)
|
||||||
if search_context and not search_context.startswith("[search error"):
|
|
||||||
print(f"[agent] pre-search: {len(search_context)} chars for real-time query", flush=True)
|
|
||||||
else:
|
|
||||||
search_context = ""
|
|
||||||
else:
|
|
||||||
url_context, memories = await asyncio.gather(
|
|
||||||
_fetch_urls_from_message(clean_message),
|
|
||||||
_retrieve_memories(clean_message, session_id),
|
|
||||||
)
|
|
||||||
search_context = ""
|
|
||||||
if url_context:
|
if url_context:
|
||||||
print(f"[agent] crawl4ai: {len(url_context)} chars fetched from message URLs", flush=True)
|
print(f"[agent] crawl4ai: {len(url_context)} chars fetched from message URLs", flush=True)
|
||||||
|
if fast_context:
|
||||||
|
names = _fast_tool_runner.matching_names(clean_message)
|
||||||
|
print(f"[agent] fast_tools={names}: {len(fast_context)} chars injected", flush=True)
|
||||||
|
|
||||||
# Build enriched history: memories + url_context + search_context for ALL tiers
|
# Build enriched history: memories + url_context + fast_context for ALL tiers
|
||||||
enriched_history = list(history)
|
enriched_history = list(history)
|
||||||
if url_context:
|
if url_context:
|
||||||
enriched_history = [{"role": "system", "content": url_context}] + enriched_history
|
enriched_history = [{"role": "system", "content": url_context}] + enriched_history
|
||||||
if search_context:
|
if fast_context:
|
||||||
enriched_history = [{"role": "system", "content": search_context}] + enriched_history
|
enriched_history = [{"role": "system", "content": fast_context}] + enriched_history
|
||||||
if memories:
|
if memories:
|
||||||
enriched_history = [{"role": "system", "content": memories}] + enriched_history
|
enriched_history = [{"role": "system", "content": memories}] + enriched_history
|
||||||
|
|
||||||
@@ -467,8 +431,8 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
|||||||
system_prompt = system_prompt + "\n\n" + memories
|
system_prompt = system_prompt + "\n\n" + memories
|
||||||
if url_context:
|
if url_context:
|
||||||
system_prompt = system_prompt + "\n\n" + url_context
|
system_prompt = system_prompt + "\n\n" + url_context
|
||||||
if search_context:
|
if fast_context:
|
||||||
system_prompt = system_prompt + "\n\nLive web search results (use these to answer):\n\n" + search_context
|
system_prompt = system_prompt + "\n\nLive web search results (use these to answer):\n\n" + fast_context
|
||||||
|
|
||||||
# Stream tokens directly — filter out qwen3 <think> blocks
|
# Stream tokens directly — filter out qwen3 <think> blocks
|
||||||
in_think = False
|
in_think = False
|
||||||
|
|||||||
116
fast_tools.py
Normal file
116
fast_tools.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""
|
||||||
|
Fast Tools — pre-flight tools invoked by a classifier before the main LLM call.
|
||||||
|
|
||||||
|
Each FastTool has:
|
||||||
|
- matches(message) → bool : regex classifier that determines if this tool applies
|
||||||
|
- run(message) → str : async fetch that returns enrichment context
|
||||||
|
|
||||||
|
FastToolRunner holds a list of FastTools. The Router uses any_matches() to force
|
||||||
|
the tier to medium before LLM classification. run_agent_task() calls run_matching()
|
||||||
|
to build extra context that is injected into the system prompt.
|
||||||
|
|
||||||
|
To add a new fast tool:
|
||||||
|
1. Subclass FastTool, implement name/matches/run
|
||||||
|
2. Add an instance to the list passed to FastToolRunner in agent.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
class FastTool(ABC):
|
||||||
|
"""Base class for all pre-flight fast tools."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def name(self) -> str: ...
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def matches(self, message: str) -> bool: ...
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def run(self, message: str) -> str: ...
|
||||||
|
|
||||||
|
|
||||||
|
class RealTimeSearchTool(FastTool):
|
||||||
|
"""
|
||||||
|
Injects live SearXNG search snippets for queries that require real-time data:
|
||||||
|
weather, news, prices, scores, business hours.
|
||||||
|
|
||||||
|
Matched queries are also forced to medium tier by the Router so the richer
|
||||||
|
model handles the injected context.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_PATTERN = re.compile(
|
||||||
|
r"\b(weather|forecast|temperature|rain(ing)?|snow(ing)?|humidity|wind\s*speed"
|
||||||
|
r"|today.?s news|breaking news|latest news|news today|current events"
|
||||||
|
r"|bitcoin price|crypto price|stock price|exchange rate"
|
||||||
|
r"|right now|currently|at the moment|live score|score now|score today"
|
||||||
|
r"|open now|hours today|is .+ open)\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, searxng_url: str):
|
||||||
|
self._searxng_url = searxng_url
|
||||||
|
|
||||||
|
@property
|
||||||
|
def name(self) -> str:
|
||||||
|
return "real_time_search"
|
||||||
|
|
||||||
|
def matches(self, message: str) -> bool:
|
||||||
|
return bool(self._PATTERN.search(message))
|
||||||
|
|
||||||
|
async def run(self, message: str) -> str:
|
||||||
|
"""Search SearXNG and return top snippets as a context block."""
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=15) as client:
|
||||||
|
r = await client.get(
|
||||||
|
f"{self._searxng_url}/search",
|
||||||
|
params={"q": message, "format": "json"},
|
||||||
|
)
|
||||||
|
r.raise_for_status()
|
||||||
|
items = r.json().get("results", [])[:4]
|
||||||
|
except Exception as e:
|
||||||
|
return f"[real_time_search error: {e}]"
|
||||||
|
if not items:
|
||||||
|
return ""
|
||||||
|
lines = [f"Live web search results for: {message}\n"]
|
||||||
|
for i, item in enumerate(items, 1):
|
||||||
|
title = item.get("title", "")
|
||||||
|
url = item.get("url", "")
|
||||||
|
snippet = item.get("content", "")[:400]
|
||||||
|
lines.append(f"[{i}] {title}\nURL: {url}\n{snippet}\n")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
class FastToolRunner:
|
||||||
|
"""
|
||||||
|
Classifier + executor for fast tools.
|
||||||
|
|
||||||
|
Used in two places:
|
||||||
|
- Router.route(): any_matches() forces medium tier before LLM classification
|
||||||
|
- run_agent_task(): run_matching() builds enrichment context in the pre-flight gather
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, tools: list[FastTool]):
|
||||||
|
self._tools = tools
|
||||||
|
|
||||||
|
def any_matches(self, message: str) -> bool:
|
||||||
|
"""True if any fast tool applies to this message."""
|
||||||
|
return any(t.matches(message) for t in self._tools)
|
||||||
|
|
||||||
|
def matching_names(self, message: str) -> list[str]:
|
||||||
|
"""Names of tools that match this message (for logging)."""
|
||||||
|
return [t.name for t in self._tools if t.matches(message)]
|
||||||
|
|
||||||
|
async def run_matching(self, message: str) -> str:
|
||||||
|
"""Run all matching tools concurrently and return combined context."""
|
||||||
|
matching = [t for t in self._tools if t.matches(message)]
|
||||||
|
if not matching:
|
||||||
|
return ""
|
||||||
|
results = await asyncio.gather(*[t.run(message) for t in matching])
|
||||||
|
parts = [r for r in results if r and not r.startswith("[")]
|
||||||
|
return "\n\n".join(parts)
|
||||||
21
router.py
21
router.py
@@ -1,6 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from langchain_core.messages import SystemMessage, HumanMessage
|
from langchain_core.messages import SystemMessage, HumanMessage
|
||||||
|
from fast_tools import FastToolRunner
|
||||||
|
|
||||||
# ── Regex pre-classifier ──────────────────────────────────────────────────────
|
# ── Regex pre-classifier ──────────────────────────────────────────────────────
|
||||||
# Catches obvious light-tier patterns before calling the LLM.
|
# Catches obvious light-tier patterns before calling the LLM.
|
||||||
@@ -23,16 +24,6 @@ _LIGHT_PATTERNS = re.compile(
|
|||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Queries that require live data — never answer from static knowledge
|
|
||||||
_MEDIUM_FORCE_PATTERNS = re.compile(
|
|
||||||
r"\b(weather|forecast|temperature|rain(ing)?|snow(ing)?|humidity|wind speed"
|
|
||||||
r"|today.?s news|breaking news|latest news|news today|current events"
|
|
||||||
r"|bitcoin price|crypto price|stock price|exchange rate|usd|eur|btc"
|
|
||||||
r"|right now|currently|at the moment|live score|score now|score today"
|
|
||||||
r"|open now|hours today|is .+ open)\b",
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
# ── LLM classification prompt ─────────────────────────────────────────────────
|
# ── LLM classification prompt ─────────────────────────────────────────────────
|
||||||
CLASSIFY_PROMPT = """Classify the message. Output ONLY one word: light, medium, or complex.
|
CLASSIFY_PROMPT = """Classify the message. Output ONLY one word: light, medium, or complex.
|
||||||
|
|
||||||
@@ -83,8 +74,9 @@ def _parse_tier(text: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class Router:
|
class Router:
|
||||||
def __init__(self, model):
|
def __init__(self, model, fast_tool_runner: FastToolRunner | None = None):
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self._fast_tool_runner = fast_tool_runner
|
||||||
|
|
||||||
async def route(
|
async def route(
|
||||||
self,
|
self,
|
||||||
@@ -100,9 +92,10 @@ class Router:
|
|||||||
if force_complex:
|
if force_complex:
|
||||||
return "complex", None
|
return "complex", None
|
||||||
|
|
||||||
# Step 0a: force medium for real-time / live-data queries
|
# Step 0a: force medium if any fast tool matches (live-data queries)
|
||||||
if _MEDIUM_FORCE_PATTERNS.search(message.strip()):
|
if self._fast_tool_runner and self._fast_tool_runner.any_matches(message.strip()):
|
||||||
print(f"[router] regex→medium (real-time query)", flush=True)
|
names = self._fast_tool_runner.matching_names(message.strip())
|
||||||
|
print(f"[router] fast_tool_match={names} → medium", flush=True)
|
||||||
return "medium", None
|
return "medium", None
|
||||||
|
|
||||||
# Step 0b: regex pre-classification for obvious light patterns
|
# Step 0b: regex pre-classification for obvious light patterns
|
||||||
|
|||||||
Reference in New Issue
Block a user