Files
oO/infra/docker/docker-compose.yml
alvis e40dfdcbb0
Some checks failed
buf-check / Lint & breaking-change check (push) Has been cancelled
chore(infra): wire MLflow/Airflow env vars, fix healthcheck, add .dockerignore
- docker-compose: pass ML_SERVING_URL, MLFLOW_URL, AIRFLOW_URL + creds to api service
- docker-compose: pass NEXT_PUBLIC_MLFLOW_URL/AIRFLOW_URL to admin service
- docker-compose: replace wget healthcheck with node fetch (wget not in node image)
- docker-compose: enable Airflow basic_auth API backend; add MLflow pip dep for DAGs
- Dockerfiles: tighten layer caching, add .dockerignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-26 12:08:43 +00:00

226 lines
7.8 KiB
YAML

name: oo
services:
# ── core profile ──────────────────────────────────────────────────────────
api:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.api
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
ML_SERVING_URL: "http://ml-serving:8000"
MLFLOW_URL: "http://mlflow:5000"
AIRFLOW_URL: "http://airflow-webserver:8080"
AIRFLOW_API_USER: "admin"
AIRFLOW_API_PASSWORD: "${AIRFLOW_ADMIN_PASSWORD:-admin}"
INTERNAL_API_TOKEN: "${INTERNAL_API_TOKEN:-}"
volumes:
- /mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo
ports:
- "127.0.0.1:3078:3078"
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://localhost:3078/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
interval: 10s
timeout: 5s
retries: 5
web:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.web
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
PORT: "3079"
HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: "" # Caddy routes /api/* directly to the API in prod
ports:
- "127.0.0.1:3079:3079"
depends_on:
api:
condition: service_healthy
admin:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.admin
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
PORT: "3080"
HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: ""
NEXT_PUBLIC_MLFLOW_URL: "/mlflow"
NEXT_PUBLIC_AIRFLOW_URL: "/airflow"
INTERNAL_API_URL: "http://api:3078"
ports:
- "127.0.0.1:3080:3080"
depends_on:
api:
condition: service_healthy
# ── full profile ──────────────────────────────────────────────────────────
ml-serving:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.ml
profiles: [full]
env_file: ../../.env.local
environment:
LITELLM_URL: ${LITELLM_URL:-http://host.docker.internal:4000}
OLLAMA_URL: ${OLLAMA_URL:-http://host.docker.internal:11434}
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- "127.0.0.1:8000:8000"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5
# ── mlops profile — MLflow + Airflow ──────────────────────────────────────
# Start: docker compose --profile mlops up
# MLflow UI: http://localhost:5000 or https://o.alogins.net/mlflow (admin / password — change via basic_auth.ini)
# Airflow UI: http://localhost:8080/airflow or https://o.alogins.net/airflow (admin / AIRFLOW_ADMIN_PASSWORD)
# Caddy routes /mlflow* and /airflow* inside the o.alogins.net block
airflow-db:
image: postgres:16-alpine
profiles: [mlops]
environment:
POSTGRES_DB: airflow
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow}
volumes:
- /mnt/ssd/dbs/oo/airflow-db:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U airflow"]
interval: 10s
timeout: 5s
retries: 5
airflow-init:
image: apache/airflow:2.9.3
profiles: [mlops]
entrypoint: /bin/bash
command:
- -c
- |
airflow db migrate
airflow users create \
--username admin \
--firstname Admin \
--lastname User \
--role Admin \
--email admin@oo.local \
--password "$${AIRFLOW_ADMIN_PASSWORD:-admin}"
environment:
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
depends_on:
airflow-db:
condition: service_healthy
restart: "no"
airflow-webserver:
image: apache/airflow:2.9.3
profiles: [mlops]
command: webserver
environment:
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
AIRFLOW__API__AUTH_BACKENDS: "airflow.api.auth.backend.basic_auth"
_PIP_ADDITIONAL_REQUIREMENTS: "mlflow==2.14.3 httpx"
MLFLOW_TRACKING_URI: "http://mlflow:5000/mlflow"
MLFLOW_TRACKING_USERNAME: "admin"
MLFLOW_TRACKING_PASSWORD: "${MLFLOW_ADMIN_PASSWORD:-password}"
volumes:
- ../../ml/pipelines:/opt/airflow/dags:ro
- ../../ml:/opt/airflow/ml:ro
ports:
- "127.0.0.1:8080:8080"
depends_on:
airflow-init:
condition: service_completed_successfully
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
airflow-scheduler:
image: apache/airflow:2.9.3
profiles: [mlops]
command: scheduler
environment:
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
_PIP_ADDITIONAL_REQUIREMENTS: "mlflow==2.14.3 httpx"
MLFLOW_TRACKING_URI: "http://mlflow:5000/mlflow"
MLFLOW_TRACKING_USERNAME: "admin"
MLFLOW_TRACKING_PASSWORD: "${MLFLOW_ADMIN_PASSWORD:-password}"
volumes:
- ../../ml/pipelines:/opt/airflow/dags:ro
- ../../ml:/opt/airflow/ml:ro
depends_on:
airflow-init:
condition: service_completed_successfully
# ── events profile — NATS JetStream ─────────────────────────────────────
# Start: docker compose --profile events up
# NATS monitoring: http://localhost:8222
# Enable in the API by setting NATS_URL=nats://nats:4222 in .env.local
nats:
image: nats:2.10-alpine
profiles: [events, full]
command: ["-js", "-sd", "/data", "-m", "8222"]
volumes:
- /mnt/ssd/dbs/oo/nats:/data
ports:
- "127.0.0.1:4222:4222" # client connections
- "127.0.0.1:8222:8222" # HTTP monitoring
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8222/healthz"]
interval: 10s
timeout: 5s
retries: 5
mlflow:
image: ghcr.io/mlflow/mlflow:v2.14.3
profiles: [mlops]
command: >
mlflow server
--backend-store-uri sqlite:////mlflow/mlflow.db
--default-artifact-root /mlflow/artifacts
--host 0.0.0.0
--port 5000
--app-name basic-auth
--static-prefix /mlflow
environment:
MLFLOW_AUTH_CONFIG_PATH: /mlflow/basic_auth.ini
volumes:
- /mnt/ssd/dbs/oo/mlflow:/mlflow
- ../../infra/mlflow/basic_auth.ini:/mlflow/basic_auth.ini:ro
ports:
- "127.0.0.1:5000:5000"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:5000/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5