# Changelog (from commit message):
# - Add Bifrost (maximhq/bifrost) as LLM gateway: all inference routes through
#   bifrost:8080/v1 with retry logic and observability; VRAMManager keeps direct
#   Ollama access for VRAM flush/prewarm operations
# - Switch medium model from qwen3:4b to qwen2.5:1.5b (direct call, no tools)
#   via _DirectModel wrapper; complex keeps create_deep_agent with qwen3:8b
# - Implement out-of-agent memory pipeline: _retrieve_memories pre-fetches
#   relevant context (injected into all tiers), _store_memory runs as a
#   background task after each reply writing to openmemory/Qdrant
# - Add tests/unit/ with 133 tests covering router, channels, vram_manager,
#   agent helpers; move integration test to tests/integration/
# - Add bifrost-config.json with GPU Ollama (qwen2.5:0.5b/1.5b, qwen3:4b/8b,
#   gemma3:4b) and CPU Ollama providers
# - Integration test 28/29 pass (only grammy fails — no TELEGRAM_BOT_TOKEN)
services:
|
|
bifrost:
|
|
image: maximhq/bifrost
|
|
container_name: bifrost
|
|
ports:
|
|
- "8080:8080"
|
|
volumes:
|
|
- ./bifrost-config.json:/app/data/config.json:ro
|
|
environment:
|
|
- APP_DIR=/app/data
|
|
- APP_PORT=8080
|
|
- LOG_LEVEL=info
|
|
extra_hosts:
|
|
- "host.docker.internal:host-gateway"
|
|
restart: unless-stopped
|
|
|
|
deepagents:
|
|
build: .
|
|
container_name: deepagents
|
|
ports:
|
|
- "8000:8000"
|
|
environment:
|
|
- PYTHONUNBUFFERED=1
|
|
# Bifrost gateway — all LLM inference goes through here
|
|
- BIFROST_URL=http://bifrost:8080/v1
|
|
# Direct Ollama GPU URL — used only by VRAMManager for flush/prewarm
|
|
- OLLAMA_BASE_URL=http://host.docker.internal:11436
|
|
- DEEPAGENTS_MODEL=qwen2.5:1.5b
|
|
- DEEPAGENTS_COMPLEX_MODEL=qwen3:8b
|
|
- DEEPAGENTS_ROUTER_MODEL=qwen2.5:1.5b
|
|
- SEARXNG_URL=http://host.docker.internal:11437
|
|
- GRAMMY_URL=http://grammy:3001
|
|
- CRAWL4AI_URL=http://crawl4ai:11235
|
|
extra_hosts:
|
|
- "host.docker.internal:host-gateway"
|
|
depends_on:
|
|
- openmemory
|
|
- grammy
|
|
- crawl4ai
|
|
- bifrost
|
|
restart: unless-stopped
|
|
|
|
openmemory:
|
|
build: ./openmemory
|
|
container_name: openmemory
|
|
ports:
|
|
- "8765:8765"
|
|
environment:
|
|
# Extraction LLM runs on GPU — qwen2.5:1.5b for speed (~3s)
|
|
- OLLAMA_GPU_URL=http://host.docker.internal:11436
|
|
- OLLAMA_EXTRACTION_MODEL=qwen2.5:1.5b
|
|
# Embedding (nomic-embed-text) runs on CPU — fast enough for search (50-150ms)
|
|
- OLLAMA_CPU_URL=http://host.docker.internal:11435
|
|
extra_hosts:
|
|
- "host.docker.internal:host-gateway"
|
|
restart: unless-stopped
|
|
|
|
grammy:
|
|
build: ./grammy
|
|
container_name: grammy
|
|
ports:
|
|
- "3001:3001"
|
|
environment:
|
|
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
|
|
- DEEPAGENTS_URL=http://deepagents:8000
|
|
restart: unless-stopped
|
|
|
|
crawl4ai:
|
|
image: unclecode/crawl4ai:latest
|
|
container_name: crawl4ai
|
|
ports:
|
|
- "11235:11235"
|
|
environment:
|
|
- CRAWL4AI_LOG_LEVEL=WARNING
|
|
shm_size: "1g"
|
|
restart: unless-stopped
|