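# Source listing metadata (from the repository file viewer):
#   file: adolf/agent.py
#   last modified: 2026-03-05 11:22:34 +00:00
#   size: 387 lines, 15 KiB
#   language: Python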

import asyncio
import os
import time
from contextlib import asynccontextmanager
from fastapi import FastAPI, BackgroundTasks, Request
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
import re as _re
import httpx as _httpx
from langchain_ollama import ChatOllama
from langchain_mcp_adapters.client import MultiServerMCPClient
from langchain_community.utilities import SearxSearchWrapper
from langchain_core.tools import Tool
from vram_manager import VRAMManager
from router import Router
from agent_factory import build_medium_agent, build_complex_agent
import channels
# ── configuration ──────────────────────────────────────────────────────────────
# Service endpoints and model names. Each value is read from the environment
# variable named in its os.getenv() call and falls back to the default shown.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
ROUTER_MODEL = os.getenv("DEEPAGENTS_ROUTER_MODEL", "qwen2.5:0.5b")
MEDIUM_MODEL = os.getenv("DEEPAGENTS_MODEL", "qwen3:4b")
COMPLEX_MODEL = os.getenv("DEEPAGENTS_COMPLEX_MODEL", "qwen3:8b")
SEARXNG_URL = os.getenv("SEARXNG_URL", "http://host.docker.internal:11437")
OPENMEMORY_URL = os.getenv("OPENMEMORY_URL", "http://openmemory:8765")
CRAWL4AI_URL = os.getenv("CRAWL4AI_URL", "http://crawl4ai:11235")
# Number of user/assistant exchanges kept per session; the buffer is trimmed to
# twice this many entries (one user + one assistant entry per turn).
MAX_HISTORY_TURNS = 5
# session_id -> list of {"role": ..., "content": ...} dicts, updated by run_agent_task().
_conversation_buffers: dict[str, list] = {}
# System prompt for the medium tier: passed to build_medium_agent() and also
# prepended to the message list on each medium-tier invocation.
MEDIUM_SYSTEM_PROMPT = (
    "You are a helpful AI assistant. "
    "Use web_search for questions about current events or facts you don't know. "
    "Reply concisely."
)
# System prompt for the complex tier. It is passed through str.format() at its
# use sites, so any literal braces added here would need to be doubled.
COMPLEX_SYSTEM_PROMPT = (
    "You are a deep research assistant. "
    "web_search automatically fetches full page content from top results — use it 6+ times with different queries. "
    "Also call fetch_url on any specific URL you want to read in full.\n\n"
    "Run searches in English AND Russian/Latvian. "
    "After getting results, run follow-up searches based on new facts found.\n\n"
    "Write a structured markdown report with sections: "
    "Overview, Education, Career, Publications, Online Presence, Interesting Findings.\n"
    "Every fact must link to the real URL it came from: [fact](url). "
    "NEVER invent URLs. End with: **Sources checked: N**"
)
# Runtime singletons: assigned by lifespan() at startup and reset to None at
# shutdown. The endpoints treat `medium_agent is None` as "not ready".
medium_agent = None
complex_agent = None
router: Router = None
vram_manager: VRAMManager = None
mcp_client = None
# GPU mutex: one LLM inference at a time
# (acquired by run_agent_task() around routing and agent invocation).
_reply_semaphore = asyncio.Semaphore(1)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialise models, tools and agents at startup; clear the globals at shutdown.

    Startup steps, in order:
      1. register the default channel adapters;
      2. create the router / medium / complex ChatOllama models;
      3. create the VRAM manager and the router;
      4. connect to the OpenMemory MCP server (retrying while it is not ready)
         and collect its tools, excluding the memory tools;
      5. add the ``web_search`` and ``fetch_url`` tools;
      6. build the medium and complex agents.

    Raises:
        Exception: whatever ``mcp_client.get_tools()`` raised on the final
            retry attempt; startup is aborted in that case.
    """
    global medium_agent, complex_agent, router, vram_manager, mcp_client

    # Register channel adapters
    channels.register_defaults()

    # Three model instances
    router_model = ChatOllama(
        model=ROUTER_MODEL, base_url=OLLAMA_BASE_URL, think=False, num_ctx=4096,
        temperature=0,
    )
    medium_model = ChatOllama(
        model=MEDIUM_MODEL, base_url=OLLAMA_BASE_URL, think=False, num_ctx=8192
    )
    complex_model = ChatOllama(
        model=COMPLEX_MODEL, base_url=OLLAMA_BASE_URL, think=True, num_ctx=16384
    )

    vram_manager = VRAMManager(base_url=OLLAMA_BASE_URL)
    router = Router(model=router_model)

    mcp_connections = {
        "openmemory": {"transport": "sse", "url": f"{OPENMEMORY_URL}/sse"},
    }
    mcp_client = MultiServerMCPClient(mcp_connections)

    # The MCP server may still be starting; poll it before giving up.
    max_attempts = 12
    retry_delay = 5  # seconds
    for attempt in range(max_attempts):
        try:
            mcp_tools = await mcp_client.get_tools()
            break
        except Exception as e:
            if attempt == max_attempts - 1:
                raise
            print(
                f"[agent] MCP not ready (attempt {attempt + 1}/{max_attempts}): {e}. "
                f"Retrying in {retry_delay}s..."
            )
            await asyncio.sleep(retry_delay)

    # The memory tools are deliberately not exposed to the agents.
    hidden_tools = ("add_memory", "search_memory", "get_all_memories")
    agent_tools = [t for t in mcp_tools if t.name not in hidden_tools]

    # NOTE(review): `searx` is not referenced below (web_search talks to SearXNG
    # directly via httpx). Kept so construction-time behaviour is unchanged.
    searx = SearxSearchWrapper(searx_host=SEARXNG_URL)

    def _crawl4ai_fetch(url: str) -> str:
        """Fetch a URL via Crawl4AI (JS-rendered, bot-bypass) and return clean markdown.

        Returns at most 5000 characters, "" when the crawl reports no success,
        or a "[fetch error: ...]" marker string when the request itself fails.
        """
        try:
            r = _httpx.post(f"{CRAWL4AI_URL}/crawl", json={"urls": [url]}, timeout=60)
            r.raise_for_status()
            results = r.json().get("results", [])
            if not results or not results[0].get("success"):
                return ""
            md_obj = results[0].get("markdown") or {}
            md = md_obj.get("raw_markdown") if isinstance(md_obj, dict) else str(md_obj)
            return (md or "")[:5000]
        except Exception as e:
            return f"[fetch error: {e}]"

    def _search_and_read(query: str) -> str:
        """Search the web and automatically fetch full content of top results.

        Returns snippets for up to 5 results plus full page content (up to 3000
        characters each) from the top 2 URLs, or a "[search error: ...]" marker
        string when the search request fails.
        """
        # Get structured results from SearXNG
        try:
            r = _httpx.get(
                f"{SEARXNG_URL}/search",
                params={"q": query, "format": "json"},
                timeout=15,
            )
            # Surface HTTP errors explicitly instead of relying on a JSON
            # decode failure of the error page.
            r.raise_for_status()
            items = (r.json().get("results") or [])[:5]
        except Exception as e:
            return f"[search error: {e}]"

        if not items:
            return "No results found."

        out = [f"Search: {query}\n"]
        for i, item in enumerate(items, 1):
            # `or ""` also covers fields that are present but null; slicing
            # None here would raise outside the try block above.
            url = item.get("url") or ""
            title = item.get("title") or ""
            snippet = (item.get("content") or "")[:300]
            out.append(f"\n[{i}] {title}\nURL: {url}\nSnippet: {snippet}")

        # Auto-fetch top 2 URLs for full content
        out.append("\n\n--- Full page content ---")
        for item in items[:2]:
            url = item.get("url") or ""
            if not url:
                continue
            content = _crawl4ai_fetch(url)
            # Skip empty pages and the error marker produced by _crawl4ai_fetch.
            if content and not content.startswith("[fetch error"):
                out.append(f"\n### {url}\n{content[:3000]}")
        return "\n".join(out)

    agent_tools.append(Tool(
        name="web_search",
        func=_search_and_read,
        description=(
            "Search the web and read full content of top results. "
            "Returns search snippets AND full page text from the top URLs. "
            "Use multiple different queries to research a topic thoroughly."
        ),
    ))

    def _fetch_url(url: str) -> str:
        """Fetch and read the full text content of a URL."""
        content = _crawl4ai_fetch(url)
        return content if content else "[fetch_url: empty or blocked]"

    agent_tools.append(Tool(
        name="fetch_url",
        func=_fetch_url,
        description=(
            "Fetch and read the full text content of a specific URL. "
            "Use for URLs not covered by web_search. Input: a single URL string."
        ),
    ))

    medium_agent = build_medium_agent(
        model=medium_model,
        agent_tools=agent_tools,
        system_prompt=MEDIUM_SYSTEM_PROMPT,
    )
    complex_agent = build_complex_agent(
        model=complex_model,
        agent_tools=agent_tools,
        # Re-inserts a literal "{user_id}" for any such placeholder in the
        # prompt, so it can still be filled per request later.
        system_prompt=COMPLEX_SYSTEM_PROMPT.format(user_id="{user_id}"),
    )

    print(
        f"[agent] three-tier: router={ROUTER_MODEL} | medium={MEDIUM_MODEL} | complex={COMPLEX_MODEL}",
        flush=True,
    )
    print(f"[agent] agent tools: {[t.name for t in agent_tools]}", flush=True)

    try:
        yield
    finally:
        # Reset the globals even when shutdown is triggered by an exception, so
        # the endpoints report "not ready" afterwards.
        medium_agent = None
        complex_agent = None
        router = None
        vram_manager = None
        mcp_client = None
# ASGI application; startup and shutdown work is done by lifespan().
app = FastAPI(lifespan=lifespan)
# ── request models ─────────────────────────────────────────────────────────────
class InboundMessage(BaseModel):
    """Request body of the unified ``POST /message`` endpoint."""

    # Raw user text; forwarded to run_agent_task().
    text: str
    # Conversation key, e.g. "tg-346967270", "cli-alvis"
    session_id: str
    # Originating channel: "telegram" | "cli"
    channel: str
    # Human identity; defaults to session_id if empty.
    # NOTE(review): no visible code reads this field or applies that default — confirm.
    user_id: str = ""
    # Free-form extra data.
    # NOTE(review): not read by the visible endpoint code — confirm intended use.
    metadata: dict = {}
class ChatRequest(BaseModel):
    """Legacy model — kept for test_pipeline.py compatibility."""

    # User text; forwarded to run_agent_task() by the /chat endpoint.
    message: str
    # Chat identifier; the /chat endpoint turns it into the session id "tg-<chat_id>".
    chat_id: str
# ── helpers ────────────────────────────────────────────────────────────────────
def _extract_final_text(result) -> str | None:
msgs = result.get("messages", [])
for m in reversed(msgs):
if type(m).__name__ == "AIMessage" and getattr(m, "content", ""):
return m.content
if isinstance(result, dict) and result.get("output"):
return result["output"]
return None
def _log_messages(result):
    """Print a short trace of every message and tool call in an agent result.

    For each message, the class name and the first 150 characters of its
    content are printed (when the content is non-empty), followed by one line
    per tool call. This is a best-effort logging helper: a result without a
    usable ``"messages"`` list, or messages whose ``tool_calls`` is missing or
    ``None``, are skipped silently so logging cannot break reply handling.

    Args:
        result: Agent invocation result; expected to be a mapping with a
            ``"messages"`` list.
    """
    if not hasattr(result, "get"):
        return

    for msg in result.get("messages") or []:
        role = type(msg).__name__
        content = getattr(msg, "content", "")
        if content:
            print(f"[agent] {role}: {str(content)[:150]}", flush=True)
        # `or []` covers messages that carry `tool_calls = None`.
        for call in getattr(msg, "tool_calls", None) or []:
            # NOTE(review): there is no separator between role and tool name in
            # this line; kept byte-for-byte in case log consumers rely on it.
            print(f"[agent] {role}{call['name']}({call['args']})", flush=True)
# ── core task ──────────────────────────────────────────────────────────────────
# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references, so an unreferenced task may be garbage-collected before it finishes.
_background_tasks: set = set()


def _build_messages(system_prompt: str, history: list, user_text: str) -> list:
    """Assemble the system + history + current-user message list for an agent call."""
    return [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": user_text},
    ]


def _schedule_exit_complex_mode() -> None:
    """Start vram_manager.exit_complex_mode() in the background and keep it referenced."""
    task = asyncio.create_task(vram_manager.exit_complex_mode())
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)


async def run_agent_task(message: str, session_id: str, channel: str = "telegram"):
    """Route one inbound message to the right tier, deliver the reply, update history.

    Flow: strip an optional "/think " prefix (which forces the complex tier),
    acquire the GPU semaphore, ask the router for a tier, run the light /
    medium / complex path, deliver the reply through ``channels.deliver`` and
    append the exchange to the session's conversation buffer.

    Args:
        message: Raw user text, optionally prefixed with "/think ".
        session_id: Conversation key used for history and reply delivery.
        channel: Name of the originating channel, passed to ``channels.deliver``.

    Errors raised while running a tier are logged and swallowed; in that case
    no reply is delivered and the history is left untouched.
    """
    print(f"[agent] queued: {message[:80]!r} chat={session_id}", flush=True)

    force_complex = False
    clean_message = message
    if message.startswith("/think "):
        force_complex = True
        clean_message = message[len("/think "):]
        print("[agent] /think prefix → force_complex=True", flush=True)

    # NOTE(review): delivery and the history update below run while the
    # semaphore is held, so replies and buffer writes stay serialized —
    # confirm this matches the intended scope of the lock.
    async with _reply_semaphore:
        t0 = time.monotonic()
        history = _conversation_buffers.get(session_id, [])
        print(f"[agent] running: {clean_message[:80]!r}", flush=True)

        tier, light_reply = await router.route(clean_message, history, force_complex)
        print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)

        final_text = None
        try:
            if tier == "light":
                # The router already produced the answer; no agent call needed.
                final_text = light_reply
                llm_elapsed = time.monotonic() - t0
                print("[agent] light path: answered by router", flush=True)

            elif tier == "medium":
                result = await medium_agent.ainvoke({
                    "messages": _build_messages(MEDIUM_SYSTEM_PROMPT, history, clean_message),
                })
                llm_elapsed = time.monotonic() - t0
                _log_messages(result)
                final_text = _extract_final_text(result)

            else:  # complex
                ok = await vram_manager.enter_complex_mode()
                if not ok:
                    print("[agent] complex→medium fallback (eviction timeout)", flush=True)
                    tier = "medium"
                    result = await medium_agent.ainvoke({
                        "messages": _build_messages(MEDIUM_SYSTEM_PROMPT, history, clean_message),
                    })
                else:
                    try:
                        system_prompt = COMPLEX_SYSTEM_PROMPT.format(user_id=session_id)
                        result = await complex_agent.ainvoke({
                            "messages": _build_messages(system_prompt, history, clean_message),
                        })
                    finally:
                        # Leave complex mode even when the agent call fails;
                        # otherwise the VRAM manager would stay in complex mode.
                        _schedule_exit_complex_mode()
                llm_elapsed = time.monotonic() - t0
                _log_messages(result)
                final_text = _extract_final_text(result)
        except Exception as e:
            import traceback
            llm_elapsed = time.monotonic() - t0
            print(f"[agent] error after {llm_elapsed:.1f}s for chat {session_id}: {e}", flush=True)
            traceback.print_exc()

        # Deliver reply through the originating channel
        if final_text:
            t1 = time.monotonic()
            await channels.deliver(session_id, channel, final_text)
            send_elapsed = time.monotonic() - t1
            print(
                f"[agent] replied in {time.monotonic() - t0:.1f}s "
                f"(llm={llm_elapsed:.1f}s, send={send_elapsed:.1f}s) tier={tier}",
                flush=True,
            )
            print(f"[agent] reply_text: {final_text}", flush=True)
        else:
            print("[agent] warning: no text reply from agent", flush=True)

        # Update conversation buffer (only for exchanges that produced a reply)
        if final_text:
            buf = _conversation_buffers.get(session_id, [])
            buf.append({"role": "user", "content": clean_message})
            buf.append({"role": "assistant", "content": final_text})
            # Keep only the newest MAX_HISTORY_TURNS user/assistant pairs.
            _conversation_buffers[session_id] = buf[-(MAX_HISTORY_TURNS * 2):]
# ── endpoints ──────────────────────────────────────────────────────────────────
@app.post("/message")
async def message(request: InboundMessage, background_tasks: BackgroundTasks):
    """Unified inbound endpoint for all channels.

    Queues run_agent_task() as a background task and answers 202 immediately;
    answers 503 while the medium agent has not been built yet.
    """
    if medium_agent is not None:
        background_tasks.add_task(
            run_agent_task,
            request.text,
            request.session_id,
            request.channel,
        )
        return JSONResponse(status_code=202, content={"status": "accepted"})
    return JSONResponse(status_code=503, content={"error": "Agent not ready"})
@app.post("/chat")
async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
    """Legacy endpoint — maps chat_id to tg-<chat_id> session for backward compatibility.

    Always uses the "telegram" channel. Answers 202 once the task is queued,
    or 503 while the medium agent has not been built yet.
    """
    if medium_agent is not None:
        legacy_session = f"tg-{request.chat_id}"
        background_tasks.add_task(run_agent_task, request.message, legacy_session, "telegram")
        return JSONResponse(status_code=202, content={"status": "accepted"})
    return JSONResponse(status_code=503, content={"error": "Agent not ready"})
@app.get("/reply/{session_id}")
async def reply_stream(session_id: str):
    """
    SSE endpoint — streams the reply for a session once available, then closes.
    Used by CLI client and wiki_research.py instead of log polling.

    Emits exactly one event: the reply text with newlines escaped as the two
    characters ``\\n`` and carriage returns removed, or ``[timeout]`` when no
    reply arrives within 900 seconds.
    """
    # Reuse the session's pending-reply queue, creating it on first use.
    q = channels.pending_replies.setdefault(session_id, asyncio.Queue())

    async def event_generator():
        try:
            text = await asyncio.wait_for(q.get(), timeout=900)
        except asyncio.TimeoutError:
            yield "data: [timeout]\n\n"
            return
        # Escape newlines so entire reply fits in one SSE data line.
        # Done outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError before Python 3.12.
        payload = text.replace("\n", "\\n").replace("\r", "")
        yield f"data: {payload}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
@app.get("/health")
async def health():
    """Liveness probe; also reports whether the medium agent has been built."""
    agent_ready = medium_agent is not None
    return {"status": "ok", "agent_ready": agent_ready}