# oO/infra/docker/docker-compose.yml

# Compose project name.
name: oo

services:
  # ── core profile ──────────────────────────────────────────────────────────

  api:
    # Backend API container; started by the `core` and `full` profiles.
    profiles:
      - core
      - full
    build:
      # Dockerfile path is given relative to the build context two directories up.
      dockerfile: infra/docker/Dockerfile.api
      context: ../..
    env_file: '../../.env.local'
    environment:
      NODE_ENV: 'production'
      # Compose-network addresses of the ml-serving and mlflow services.
      ML_SERVING_URL: 'http://ml-serving:8000'
      MLFLOW_URL: 'http://mlflow:5000'
      # Interpolated by Compose; falls back to an empty string when unset.
      # NOTE(review): `environment` entries take precedence over `env_file`,
      # so an unset variable here may blank a token defined in .env.local —
      # confirm how this file is invoked.
      INTERNAL_API_TOKEN: '${INTERNAL_API_TOKEN:-}'
    volumes:
      # Host data directory mounted at the identical path inside the container.
      - '/mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo'
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3078:3078'
    healthcheck:
      # Exits 0 when GET /health on port 3078 answers with an OK status,
      # 1 on a non-OK status or a failed request.
      test:
        - CMD
        - node
        - '-e'
        - "fetch('http://localhost:3078/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"
      retries: 5
      timeout: 5s
      interval: 10s

  web:
    # Web front-end container; started by the `core` and `full` profiles.
    profiles:
      - core
      - full
    build:
      dockerfile: infra/docker/Dockerfile.web
      context: ../..
    # Only start once the api service reports healthy.
    depends_on:
      api:
        condition: service_healthy
    env_file: '../../.env.local'
    environment:
      NODE_ENV: 'production'
      # Listen on all interfaces inside the container, on port 3079.
      HOSTNAME: '0.0.0.0'
      PORT: '3079'
      # Left empty: Caddy routes /api/* directly to the API in prod.
      NEXT_PUBLIC_API_URL: ''
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3079:3079'

  admin:
    # Admin front-end container; started by the `core` and `full` profiles.
    profiles:
      - core
      - full
    build:
      dockerfile: infra/docker/Dockerfile.admin
      context: ../..
    # Only start once the api service reports healthy.
    depends_on:
      api:
        condition: service_healthy
    env_file: '../../.env.local'
    environment:
      NODE_ENV: 'production'
      # Listen on all interfaces inside the container, on port 3080.
      HOSTNAME: '0.0.0.0'
      PORT: '3080'
      # Public URLs are empty or path-only (no host component).
      NEXT_PUBLIC_API_URL: ''
      NEXT_PUBLIC_MLFLOW_URL: '/mlflow'
      # Compose-network address of the api service.
      INTERNAL_API_URL: 'http://api:3078'
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3080:3080'

  # ── full profile ──────────────────────────────────────────────────────────

  ml-serving:
    # Model-serving container; started by the `full` profile only.
    profiles:
      - full
    build:
      dockerfile: infra/docker/Dockerfile.ml
      context: ../..
    env_file: '../../.env.local'
    environment:
      # Each URL can be overridden through Compose interpolation. The defaults
      # target LiteLLM/Ollama via host.docker.internal and MLflow via its
      # Compose service name.
      LITELLM_URL: '${LITELLM_URL:-http://host.docker.internal:4000}'
      OLLAMA_URL: '${OLLAMA_URL:-http://host.docker.internal:11434}'
      MLFLOW_TRACKING_URI: '${MLFLOW_TRACKING_URI:-http://mlflow:5000}'
    extra_hosts:
      # Maps host.docker.internal to the Docker host gateway.
      - 'host.docker.internal:host-gateway'
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:8000:8000'
    healthcheck:
      # Exits 0 when GET /health on port 8000 returns HTTP 200 within 3 seconds.
      test:
        - CMD
        - python
        - '-c'
        - "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health',timeout=3).status==200 else 1)"
      retries: 5
      timeout: 5s
      interval: 10s

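  # ── ai profile — Ollama + LiteLLM for local dev ──────────────────────────
  # Start: docker compose --profile ai up
  # Use when the Agap shared Ollama/LiteLLM services are not available locally.
  # To point ml-serving at these containers instead of Agap, set LITELLM_URL
  # and OLLAMA_URL in .env.local:
  #   - ml-serving running on the host:
  #       LITELLM_URL=http://localhost:4000  OLLAMA_URL=http://localhost:11434
  #   - ml-serving running in Compose (localhost there is the container itself):
  #       LITELLM_URL=http://litellm:4000    OLLAMA_URL=http://ollama:11434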

  ollama:
    # Local Ollama model server; started by the `ai` profile only.
    image: ollama/ollama:latest
    profiles: [ai]
    volumes:
      # Named volume mounted at /root/.ollama.
      - ollama-models:/root/.ollama
    ports:
      # Published on the host loopback interface only.
      - "127.0.0.1:11434:11434"
    healthcheck:
      # The ollama/ollama image does not ship curl or wget, so a curl-based
      # probe can never succeed and litellm (which waits for service_healthy)
      # would never start. Probe with the bundled CLI instead: `ollama list`
      # exits non-zero when the server is not reachable.
      test: ["CMD", "ollama", "list"]
      interval: 15s
      timeout: 5s
      retries: 10

  litellm:
    # LiteLLM proxy in front of the ollama service; started by the `ai` profile only.
    image: ghcr.io/berriai/litellm:main-latest
    profiles: [ai]
    environment:
      # Falls back to a local development key when not provided.
      LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-sk-local-dev}
    # NOTE(review): --model is passed twice. Confirm the LiteLLM CLI accepts a
    # repeated --model flag; if only the last one wins, move both models into a
    # config file with a model_list.
    command: >
      --model ollama/qwen2.5:1.5b
      --model ollama/nomic-embed-text
      --api_base http://ollama:11434
      --port 4000
    ports:
      # Published on the host loopback interface only.
      - "127.0.0.1:4000:4000"
    depends_on:
      ollama:
        condition: service_healthy
    healthcheck:
      # /health requires the master key and runs real model calls, so an
      # unauthenticated probe of it fails whenever LITELLM_MASTER_KEY is set
      # (always the case here). /health/liveliness is the unauthenticated
      # liveness endpoint. python3 is used because curl is not guaranteed to
      # exist in the image.
      test: ["CMD", "python3", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:4000/health/liveliness',timeout=3).status==200 else 1)"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ── mlops profile — MLflow ────────────────────────────────────────────────
  # Start: docker compose --profile mlops up
  # MLflow UI:  http://localhost:5000/mlflow  or  https://o.alogins.net/mlflow
  # (The mlflow service itself is defined further down, after the nats service.)

  # ── events profile — NATS JetStream ─────────────────────────────────────
  # Start: docker compose --profile events up
  # NATS monitoring: http://localhost:8222
  # Enable in the API by setting NATS_URL=nats://nats:4222 in .env.local

  nats:
    # NATS server; started by the `events` and `full` profiles.
    profiles:
      - events
      - full
    image: 'nats:2.10-alpine'
    # Server flags: JetStream on, storage directory /data, monitoring on 8222.
    command:
      - '-js'
      - '-sd'
      - '/data'
      - '-m'
      - '8222'
    volumes:
      # Host directory backing the /data storage directory.
      - '/mnt/ssd/dbs/oo/nats:/data'
    ports:
      # Both ports are published on the host loopback interface only.
      - '127.0.0.1:4222:4222'   # client connections
      - '127.0.0.1:8222:8222'   # HTTP monitoring
    healthcheck:
      # Probes /healthz on the monitoring port without downloading the body.
      test:
        - CMD
        - wget
        - '--spider'
        - '-q'
        - 'http://localhost:8222/healthz'
      retries: 5
      timeout: 5s
      interval: 10s

  mlflow:
    # MLflow tracking server; started by the `mlops` profile only.
    # NOTE(review): api and ml-serving (both in the `full` profile) point at
    # http://mlflow:5000, but this service is not part of `full` — confirm
    # that is intended.
    profiles:
      - mlops
    image: 'ghcr.io/mlflow/mlflow:v3.11.1'
    # Same arguments as a single command string, written as an argv list.
    command:
      - 'mlflow'
      - 'server'
      # SQLite backend store and artifact directory, both under /mlflow.
      - '--backend-store-uri'
      - 'sqlite:////mlflow/mlflow.db'
      - '--artifacts-destination'
      - '/mlflow/artifacts'
      - '--serve-artifacts'
      - '--default-artifact-root'
      - 'mlflow-artifacts:/'
      - '--host'
      - '0.0.0.0'
      - '--port'
      - '5000'
      # Everything is served under the /mlflow path prefix.
      - '--static-prefix'
      - '/mlflow'
      - '--allowed-hosts'
      - 'o.alogins.net,localhost,localhost:5000,mlflow,mlflow:5000'
      - '--cors-allowed-origins'
      - 'https://o.alogins.net'
    volumes:
      # Host directory backing /mlflow (database file and artifacts).
      - '/mnt/ssd/dbs/oo/mlflow:/mlflow'
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:5000:5000'
    healthcheck:
      # Exits 0 when GET /mlflow/health on port 5000 returns HTTP 200 within 3 seconds.
      test:
        - CMD
        - python
        - '-c'
        - "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:5000/mlflow/health',timeout=3).status==200 else 1)"
      retries: 5
      timeout: 5s
      interval: 10s

volumes:
  # Named volume mounted at /root/.ollama by the ollama service; no options set.
  ollama-models: {}