---
# Local AI stack: GPU ollama, CPU-only ollama, Open WebUI frontend,
# SearXNG metasearch, and Qdrant vector store.
services:
  ollama:
    image: ollama/ollama
    container_name: ollama
    ports:
      - "11436:11434"
    volumes:
      - /mnt/ssd/ai/ollama:/root/.ollama
      # NOTE(review): this host path is also mounted into open-webui below;
      # ollama itself does not use /app/backend/data — confirm this mount is
      # intentional and not a copy-paste leftover.
      - /mnt/ssd/ai/open-webui:/app/backend/data
    restart: always
    environment:
      # Allow qwen3:8b + qwen2.5:1.5b to coexist in VRAM (~6.7-7.7 GB on 8 GB GPU)
      - OLLAMA_MAX_LOADED_MODELS=2
      # One GPU inference at a time — prevents compute contention between models
      - OLLAMA_NUM_PARALLEL=1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # CPU-only ollama instance with its own model store (no GPU reservation).
  ollama-cpu:
    image: ollama/ollama
    container_name: ollama-cpu
    ports:
      - "11435:11434"
    volumes:
      - /mnt/ssd/ai/ollama-cpu:/root/.ollama
    restart: always

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    ports:
      - "3125:8080"
    volumes:
      - /mnt/ssd/ai/open-webui:/app/backend/data
    restart: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # SECURITY: the API key was previously committed inline in this file.
      # It is now read from the shell environment or an .env file next to this
      # compose file; the leaked key must be rotated at the provider.
      # The :? form makes `docker compose up` fail fast if it is unset.
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?set ANTHROPIC_API_KEY in .env}

  searxng:
    image: docker.io/searxng/searxng:latest
    container_name: searxng
    volumes:
      - /mnt/ssd/ai/searxng/config/:/etc/searxng/
      - /mnt/ssd/ai/searxng/data/:/var/cache/searxng/
    restart: always
    ports:
      - "11437:8080"

  qdrant:
    image: qdrant/qdrant
    container_name: qdrant
    ports:
      - "6333:6333"
      - "6334:6334"
    restart: always
    volumes:
      # :z relabels the volume for SELinux shared access
      - /mnt/ssd/dbs/qdrant:/qdrant/storage:z