- Add Bifrost (maximhq/bifrost) as LLM gateway: all inference routes through bifrost:8080/v1 with retry logic and observability; VRAMManager keeps direct Ollama access for VRAM flush/prewarm operations - Switch medium model from qwen3:4b to qwen2.5:1.5b (direct call, no tools) via _DirectModel wrapper; complex keeps create_deep_agent with qwen3:8b - Implement out-of-agent memory pipeline: _retrieve_memories pre-fetches relevant context (injected into all tiers), _store_memory runs as background task after each reply writing to openmemory/Qdrant - Add tests/unit/ with 133 tests covering router, channels, vram_manager, agent helpers; move integration test to tests/integration/ - Add bifrost-config.json with GPU Ollama (qwen2.5:0.5b/1.5b, qwen3:4b/8b, gemma3:4b) and CPU Ollama providers - Integration test 28/29 pass (only grammy fails — no TELEGRAM_BOT_TOKEN) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
157 lines
6.2 KiB
Python
import json
|
|
import os
|
|
from mcp.server.fastmcp import FastMCP
|
|
from mem0 import Memory
|
|
|
|
# Extraction LLM — GPU Ollama instance.
# NOTE(review): an earlier comment said qwen3:4b, but the default below is
# qwen2.5:1.5b — confirm which model the medium agent actually shares.
# Runs after the reply when the GPU is idle; a spin-wait in agent.py
# prevents contention with the serving models.
OLLAMA_GPU_URL = os.getenv("OLLAMA_GPU_URL", "http://host.docker.internal:11436")
# Model mem0 uses for fact extraction and update/dedup decisions.
EXTRACTION_MODEL = os.getenv("OLLAMA_EXTRACTION_MODEL", "qwen2.5:1.5b")

# Embedding — CPU Ollama (nomic-embed-text, 137 MB RAM).
# Used for both search-time embedding (50-150 ms, acceptable) and
# store-time embedding, so no GPU is ever needed for this path.
OLLAMA_CPU_URL = os.getenv("OLLAMA_CPU_URL", "http://host.docker.internal:11435")

# Qdrant vector store connection; defaults reach the host from inside Docker.
QDRANT_HOST = os.getenv("QDRANT_HOST", "host.docker.internal")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
|
|
|
|
# Change 2: Custom extraction prompt — overrides mem0's default fact extractor.
# The leading /no_think directive suppresses thinking tokens (relevant for
# qwen3-family models; presumably harmless for qwen2.5 — TODO confirm) so the
# output is clean JSON. The few-shot examples pin the exact output schema
# {"facts": [...]} for the small extraction model.
EXTRACTION_PROMPT = """/no_think
You are a memory extraction assistant. Extract factual statements from a conversation that are worth remembering long-term.

Extract facts from BOTH user AND assistant messages, including:
- User details, preferences, and personal information
- User's plans, goals, and intentions
- The assistant's name or persona (if set by the user or stated by the assistant)
- Any commitments or agreements made
- Key facts stated as true

Return ONLY valid JSON in this exact format:
{"facts": ["fact 1", "fact 2"]}

If there are no facts worth storing, return: {"facts": []}

IMPORTANT rules:
- Extract the EXACT concrete values mentioned. Never say "not known" or "unspecified".
- If the user states their name, job, pet, city, allergy, or preference — store the exact value.
- A single message may contain multiple facts — extract ALL of them.
- Do NOT extract vague summaries. Extract specific facts with real values.

Examples:

Input: "User: I live in Berlin\nAssistant: Got it, you're in Berlin!"
Output: {"facts": ["User lives in Berlin"]}

Input: "User: My name is Alice and I live in Tokyo\nAssistant: Nice to meet you Alice!"
Output: {"facts": ["User's name is Alice", "User lives in Tokyo"]}

Input: "User: I work as a software engineer at a startup\nAssistant: Cool!"
Output: {"facts": ["User works as a software engineer at a startup"]}

Input: "User: I have a cat named Whiskers\nAssistant: Whiskers is a cute name!"
Output: {"facts": ["User has a cat named Whiskers"]}

Input: "User: I'm allergic to nuts\nAssistant: I'll remember that."
Output: {"facts": ["User is allergic to nuts"]}

Input: "User: remember that your name is Adolf\nAssistant: My name is Adolf!"
Output: {"facts": ["Assistant's name is Adolf"]}

Input: "User: what time is it?\nAssistant: I don't have access to real-time data."
Output: {"facts": []}

Input: "User: I prefer dark mode\nAssistant: Noted, I'll keep that in mind."
Output: {"facts": ["User prefers dark mode"]}

Now extract facts from this conversation:"""
|
|
|
|
# Update/dedup decision prompt — overrides mem0's default.
# qwen2.5:1.5b struggles with the default multi-step reasoning; this version is
# more explicit: list existing, list new, decide ADD/UPDATE/DELETE/NONE per
# item. The three examples cover the three outcomes mem0 must distinguish:
# genuinely-new fact (ADD), replacement (UPDATE), and semantic duplicate (NONE).
UPDATE_PROMPT = """/no_think
You manage a memory store. Given EXISTING memories and NEW facts:
- For each EXISTING memory: output NONE (no change) or UPDATE (if a new fact replaces it) or DELETE.
- For each NEW fact: output ADD if it is not already covered by existing memories. Output NONE if it is already covered.
- IMPORTANT: You MUST include ALL new facts in your output — either as ADD or NONE.
- Output ONLY valid JSON, no explanation.

Example A — new fact is genuinely new:
Existing: [{"id": "0", "text": "User lives in Berlin"}]
New facts: ["User is allergic to nuts"]
Output: {"memory": [{"id": "0", "text": "User lives in Berlin", "event": "NONE"}, {"id": "1", "text": "User is allergic to nuts", "event": "ADD"}]}

Example B — new fact updates an existing one:
Existing: [{"id": "0", "text": "User lives in Berlin"}]
New facts: ["User lives in Paris"]
Output: {"memory": [{"id": "0", "text": "User lives in Paris", "event": "UPDATE", "old_memory": "User lives in Berlin"}]}

Example C — new fact already covered:
Existing: [{"id": "0", "text": "User is allergic to nuts"}]
New facts: ["User has a nut allergy"]
Output: {"memory": [{"id": "0", "text": "User is allergic to nuts", "event": "NONE"}]}"""
|
|
|
|
# mem0 configuration: small GPU LLM for extraction, CPU embedder, Qdrant store.
config = {
    "llm": {
        "provider": "ollama",
        "config": {
            "model": EXTRACTION_MODEL,
            "ollama_base_url": OLLAMA_GPU_URL,
            "temperature": 0.1,  # near-deterministic → consistent JSON output
        },
    },
    "embedder": {
        "provider": "ollama",
        "config": {
            "model": "nomic-embed-text",
            "ollama_base_url": OLLAMA_CPU_URL,  # CPU: 50-150ms per query, no GPU needed
        },
    },
    "vector_store": {
        "provider": "qdrant",
        "config": {
            "collection_name": "adolf_memories",
            # NOTE(review): 768 matches nomic-embed-text's output dimension —
            # must change in lockstep if the embedder model changes.
            "embedding_model_dims": 768,
            "host": QDRANT_HOST,
            "port": QDRANT_PORT,
        },
    },
    # Replace mem0's built-in prompts with the small-model-friendly versions above.
    "custom_fact_extraction_prompt": EXTRACTION_PROMPT,
    "custom_update_memory_prompt": UPDATE_PROMPT,
}
|
|
|
|
# Shared mem0 instance; connects to Ollama and Qdrant at import time,
# so importing this module requires those services to be reachable.
memory = Memory.from_config(config)

# MCP server exposing the memory tools; served over SSE on port 8765 (see __main__).
mcp = FastMCP("openmemory", host="0.0.0.0", port=8765)
|
|
|
|
|
|
@mcp.tool()
def add_memory(text: str, user_id: str = "default") -> str:
    """Store *text* as a long-term memory for *user_id*.

    Returns mem0's add result serialized as a JSON string (default=str
    stringifies non-JSON-native values) rather than a Python repr.
    """
    return json.dumps(memory.add(text, user_id=user_id), default=str)
|
|
|
|
|
|
@mcp.tool()
def search_memory(query: str, user_id: str = "default") -> str:
    """Semantically search *user_id*'s memories for *query*; returns JSON.

    Searches with a loose threshold (0.3), then keeps only hits scoring
    >= 0.5 so low-relevance noise never reaches the agent's context.
    """
    found = memory.search(query, user_id=user_id, limit=10, threshold=0.3)
    if isinstance(found, dict) and "results" in found:
        strong_hits = []
        for hit in found["results"]:
            if hit.get("score", 0) >= 0.5:
                strong_hits.append(hit)
        found["results"] = strong_hits
    return json.dumps(found, default=str)
|
|
|
|
|
|
@mcp.tool()
def get_all_memories(user_id: str = "default", limit: int = 50) -> str:
    """Return up to *limit* stored memories for *user_id* as a JSON string.

    The cap keeps the response from flooding the caller's context window.
    """
    return json.dumps(memory.get_all(user_id=user_id, limit=limit), default=str)
|
|
|
|
|
|
if __name__ == "__main__":
    # Serve the MCP tools over SSE; host/port were set on the FastMCP instance.
    mcp.run(transport="sse")
|