# oO/infra/docker/docker-compose.yml

name: oo

services:
  # ── core profile ──────────────────────────────────────────────────────────

  # API container. Image is built from Dockerfile.api with the build context
  # two directories above this compose file.
  api:
    profiles:
      - core
      - full
    build:
      dockerfile: infra/docker/Dockerfile.api
      context: ../..
    env_file: ../../.env.local
    environment:
      NODE_ENV: production
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3001:3001'
    volumes:
      # Bind mount: identical absolute path on the host and in the container.
      - /mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo
    healthcheck:
      # `web` and `admin` wait on this probe via `condition: service_healthy`.
      test:
        - CMD
        - wget
        - '--spider'
        - '-q'
        - http://localhost:3001/health
      retries: 5
      interval: 10s
      timeout: 5s

  # Web app container; started only once `api` reports healthy.
  web:
    profiles:
      - core
      - full
    build:
      dockerfile: infra/docker/Dockerfile.web
      context: ../..
    depends_on:
      api:
        condition: service_healthy
    env_file: ../../.env.local
    environment:
      NODE_ENV: production
      HOSTNAME: '0.0.0.0'
      PORT: '3079'
      # Intentionally empty: Caddy routes /api/* directly to the API in prod.
      NEXT_PUBLIC_API_URL: ''
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3079:3079'

  # Admin app container; started only once `api` reports healthy.
  admin:
    profiles:
      - core
      - full
    build:
      dockerfile: infra/docker/Dockerfile.admin
      context: ../..
    depends_on:
      api:
        condition: service_healthy
    env_file: ../../.env.local
    environment:
      NODE_ENV: production
      HOSTNAME: '0.0.0.0'
      PORT: '3080'
      NEXT_PUBLIC_API_URL: ''
      # Reaches the `api` service by its Compose service name.
      INTERNAL_API_URL: 'http://api:3001'
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:3080:3080'

  # ── full profile ──────────────────────────────────────────────────────────

  # ML serving container, part of the `full` profile only.
  ml-serving:
    profiles:
      - full
    build:
      dockerfile: infra/docker/Dockerfile.ml
      context: ../..
    env_file: ../../.env.local
    environment:
      # Both URLs can be overridden from the Compose environment; the defaults
      # use the `ollama` and `litellm` service names from the `ai` profile.
      OLLAMA_URL: ${OLLAMA_URL:-http://ollama:11434}
      LITELLM_URL: ${LITELLM_URL:-http://litellm:4000}
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:8000:8000'
    healthcheck:
      test:
        - CMD
        - wget
        - '--spider'
        - '-q'
        - http://localhost:8000/health
      retries: 5
      interval: 10s
      timeout: 5s

  # ── mlops profile — MLflow + Airflow ──────────────────────────────────────
  # Start: docker compose --profile mlops up
  # MLflow UI:  http://localhost:5000       or https://o.alogins.net/mlflow  (admin / password — change via basic_auth.ini)
  # Airflow UI: http://localhost:8080/airflow  or https://o.alogins.net/airflow  (admin / AIRFLOW_ADMIN_PASSWORD)
  # Caddy routes /mlflow* and /airflow* inside the o.alogins.net block

  # PostgreSQL instance holding the Airflow metadata database.
  airflow-db:
    profiles:
      - mlops
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: airflow
      # Falls back to `airflow` when AIRFLOW_DB_PASSWORD is unset or empty.
      POSTGRES_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow}
      POSTGRES_DB: airflow
    volumes:
      # Database files live on the host so they survive container re-creation.
      - /mnt/ssd/dbs/oo/airflow-db:/var/lib/postgresql/data
    healthcheck:
      # `airflow-init` waits on this probe before running migrations.
      test:
        - CMD-SHELL
        - pg_isready -U airflow
      retries: 5
      interval: 10s
      timeout: 5s

  # One-shot bootstrap job: migrates the Airflow metadata DB, then creates the
  # `admin` UI user. `airflow-webserver` and `airflow-scheduler` only start
  # after this container has exited successfully.
  airflow-init:
    image: apache/airflow:2.9.3
    profiles: [mlops]
    entrypoint: /bin/bash
    command:
      - -c
      - |
        # Stop at the first failing command so a broken migration is reported
        # as a failed init instead of being masked by the next exit status.
        set -e
        airflow db migrate
        airflow users create \
          --username admin \
          --firstname Admin \
          --lastname User \
          --role Admin \
          --email admin@oo.local \
          --password "$${AIRFLOW_ADMIN_PASSWORD:-admin}"
    environment:
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
      AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
      # The script above expands AIRFLOW_ADMIN_PASSWORD inside the container
      # (the doubled dollar sign escapes Compose interpolation), so the value
      # has to be forwarded here. Without it the variable is never set in the
      # container and the password is always the `admin` fallback.
      AIRFLOW_ADMIN_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-admin}
    depends_on:
      # Migrations need a database that already accepts connections.
      airflow-db:
        condition: service_healthy
    # Run once per `up`; never restart after it exits.
    restart: "no"

  # Airflow web UI. Starts after `airflow-init` has completed successfully and
  # is published on the host loopback interface only.
  airflow-webserver:
    image: apache/airflow:2.9.3
    profiles: [mlops]
    command: webserver
    environment:
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
      # An empty default is passed through when AIRFLOW_FERNET_KEY is unset.
      AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
      AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
    volumes:
      # DAG definitions are mounted read-only from the repository.
      - ../../ml/pipelines:/opt/airflow/dags:ro
    ports:
      - "127.0.0.1:8080:8080"
    depends_on:
      airflow-init:
        condition: service_completed_successfully
    healthcheck:
      # Airflow mounts the whole web app, /health included, under the path
      # component of AIRFLOW__WEBSERVER__BASE_URL (default: /airflow). A probe
      # on the bare /health would get a 404. Keep this path in sync if
      # AIRFLOW_BASE_URL is overridden with a different path prefix.
      test: ["CMD", "curl", "--fail", "http://localhost:8080/airflow/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Grace period before failed probes count against `retries`.
      start_period: 60s

  # Airflow scheduler. With LocalExecutor configured, tasks are executed by
  # this container. Starts after `airflow-init` has completed successfully.
  airflow-scheduler:
    image: apache/airflow:2.9.3
    profiles: [mlops]
    command: scheduler
    environment:
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
      AIRFLOW__CORE__EXECUTOR: LocalExecutor
      # Must be identical to the webserver's key: Airflow uses it to sign the
      # requests the webserver sends to fetch task logs from the component
      # that ran the task. Without it each container falls back to its own
      # key and log retrieval from the UI is rejected.
      AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
      # An empty default is passed through when AIRFLOW_FERNET_KEY is unset.
      AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
    volumes:
      # Same read-only DAG mount as the webserver.
      - ../../ml/pipelines:/opt/airflow/dags:ro
    depends_on:
      airflow-init:
        condition: service_completed_successfully

  # ── ai profile — Ollama + LiteLLM ────────────────────────────────────────
  # Start: docker compose --profile ai up
  # LiteLLM proxy: http://localhost:4000  (master key from LITELLM_MASTER_KEY)
  # Ollama API:    http://localhost:11434
  # In prod both are shared Agap services; set LITELLM_URL + OLLAMA_URL in .env.local

  # Local Ollama server. Model data is kept on the host, and the API is
  # published on the host loopback interface only.
  ollama:
    # NOTE(review): `latest` is a moving tag; consider pinning a version.
    image: ollama/ollama:latest
    profiles: [ai]
    volumes:
      - /mnt/ssd/dbs/oo/ollama:/root/.ollama
    ports:
      - "127.0.0.1:11434:11434"
    healthcheck:
      # The ollama/ollama image does not ship curl, so a curl-based probe can
      # never succeed and `litellm` (which waits for `service_healthy`) would
      # never start. `ollama list` talks to the local server and exits
      # non-zero when it is not reachable.
      test: ["CMD", "ollama", "list"]
      interval: 15s
      timeout: 5s
      retries: 5

  # LiteLLM proxy in front of Ollama. Started only once `ollama` is healthy and
  # published on the host loopback interface only.
  litellm:
    # NOTE(review): `main-latest` is a moving tag; consider pinning a version.
    image: ghcr.io/berriai/litellm:main-latest
    profiles: [ai]
    command: ["--config", "/app/litellm_config.yaml", "--port", "4000"]
    environment:
      # Development fallback key; set LITELLM_MASTER_KEY for anything real.
      LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-sk-oo-dev}
      OLLAMA_URL: ${OLLAMA_URL:-http://ollama:11434}
    volumes:
      - ../../infra/litellm/litellm_config.yaml:/app/litellm_config.yaml:ro
    ports:
      - "127.0.0.1:4000:4000"
    depends_on:
      ollama:
        condition: service_healthy
    healthcheck:
      # /health requires the master key and calls every configured model, so
      # an unauthenticated probe against it fails. /health/liveliness is the
      # unauthenticated liveness endpoint. The image's own Python is used so
      # the probe does not depend on curl being installed; urlopen raises
      # (non-zero exit) on connection errors and HTTP error statuses.
      test:
        - CMD
        - python3
        - -c
        - "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveliness', timeout=4)"
      interval: 15s
      timeout: 5s
      retries: 5

  # MLflow tracking server (part of the `mlops` profile). Uses a SQLite backend
  # store and local artifact root under /mlflow, with the basic-auth app.
  mlflow:
    profiles:
      - mlops
    image: ghcr.io/mlflow/mlflow:2.14.3
    # Same argv as the single-string form, written one argument per item.
    command:
      - mlflow
      - server
      - '--backend-store-uri'
      - 'sqlite:////mlflow/mlflow.db'
      - '--default-artifact-root'
      - /mlflow/artifacts
      - '--host'
      - '0.0.0.0'
      - '--port'
      - '5000'
      - '--app-name'
      - basic-auth
      - '--static-prefix'
      - /mlflow
    environment:
      MLFLOW_AUTH_CONFIG_PATH: /mlflow/basic_auth.ini
    volumes:
      # Persistent MLflow data directory on the host.
      - /mnt/ssd/dbs/oo/mlflow:/mlflow
      # Auth configuration from the repository, mounted read-only.
      - ../../infra/mlflow/basic_auth.ini:/mlflow/basic_auth.ini:ro
    ports:
      # Published on the host loopback interface only.
      - '127.0.0.1:5000:5000'
    healthcheck:
      # NOTE(review): with --static-prefix /mlflow the health route may be
      # served under /mlflow/health (and may require auth), and curl may not
      # be present in this image — confirm this probe actually passes.
      test:
        - CMD
        - curl
        - '--fail'
        - http://localhost:5000/health
      retries: 5
      interval: 10s
      timeout: 5s