---
# Local AI stack: GPU ollama, CPU-only ollama, Open WebUI frontend,
# SearXNG metasearch, and Qdrant vector store.
services:
  ollama:
    image: ollama/ollama
    container_name: ollama
    ports:
      - "11436:11434"
    volumes:
      - /mnt/ssd/ai/ollama:/root/.ollama
      # NOTE(review): this host path is also mounted into open-webui below;
      # ollama itself does not use /app/backend/data — confirm this mount is
      # intentional and not a copy-paste leftover.
      - /mnt/ssd/ai/open-webui:/app/backend/data
    restart: always
    environment:
      # Allow qwen3:8b + qwen2.5:1.5b to coexist in VRAM (~6.7-7.7 GB on 8 GB GPU)
      - OLLAMA_MAX_LOADED_MODELS=2
      # One GPU inference at a time — prevents compute contention between models
      - OLLAMA_NUM_PARALLEL=1
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # CPU-only ollama instance with its own model store (no GPU reservation).
  ollama-cpu:
    image: ollama/ollama
    container_name: ollama-cpu
    ports:
      - "11435:11434"
    volumes:
      - /mnt/ssd/ai/ollama-cpu:/root/.ollama
    restart: always

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    ports:
      - "3125:8080"
    volumes:
      - /mnt/ssd/ai/open-webui:/app/backend/data
    restart: always
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # SECURITY: the API key was previously committed inline in this file.
      # It is now read from the shell environment or an .env file next to this
      # compose file; the leaked key must be rotated at the provider.
      # The :? form makes `docker compose up` fail fast if it is unset.
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:?set ANTHROPIC_API_KEY in .env}

  searxng:
    image: docker.io/searxng/searxng:latest
    container_name: searxng
    volumes:
      - /mnt/ssd/ai/searxng/config/:/etc/searxng/
      - /mnt/ssd/ai/searxng/data/:/var/cache/searxng/
    restart: always
    ports:
      - "11437:8080"

  qdrant:
    image: qdrant/qdrant
    container_name: qdrant
    ports:
      - "6333:6333"
      - "6334:6334"
    restart: always
    volumes:
      # :z relabels the volume for SELinux shared access
      - /mnt/ssd/dbs/qdrant:/qdrant/storage:z