Files
oO/infra/docker/docker-compose.yml
alvis f8d66aa01f chore: remove Airflow completely from the stack
Drop all four Airflow containers (db, init, webserver, scheduler) from the
mlops compose profile, leaving MLflow as the sole mlops service. Remove
AIRFLOW_* env vars, config fields, health-check entries, DAG trigger code
in admin/bench routes, the airflow_dag_run_id schema column, Airflow nav
links and DAG-run links in the admin UI, the two Airflow DAG files
(bench_dag.py, sim_dag.py), and all related docs/ADR references.
Simulations now run exclusively via the subprocess path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-03 16:38:46 +00:00

129 lines
4.2 KiB
YAML

name: oo
services:
# ── core profile ──────────────────────────────────────────────────────────
api:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.api
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
ML_SERVING_URL: "http://ml-serving:8000"
MLFLOW_URL: "http://mlflow:5000"
INTERNAL_API_TOKEN: "${INTERNAL_API_TOKEN:-}"
volumes:
- /mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo
ports:
- "127.0.0.1:3078:3078"
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://localhost:3078/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
interval: 10s
timeout: 5s
retries: 5
web:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.web
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
PORT: "3079"
HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: "" # Caddy routes /api/* directly to the API in prod
ports:
- "127.0.0.1:3079:3079"
depends_on:
api:
condition: service_healthy
admin:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.admin
profiles: [core, full]
env_file: ../../.env.local
environment:
NODE_ENV: production
PORT: "3080"
HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: ""
NEXT_PUBLIC_MLFLOW_URL: "/mlflow"
INTERNAL_API_URL: "http://api:3078"
ports:
- "127.0.0.1:3080:3080"
depends_on:
api:
condition: service_healthy
# ── full profile ──────────────────────────────────────────────────────────
ml-serving:
build:
context: ../..
dockerfile: infra/docker/Dockerfile.ml
profiles: [full]
env_file: ../../.env.local
environment:
LITELLM_URL: ${LITELLM_URL:-http://host.docker.internal:4000}
OLLAMA_URL: ${OLLAMA_URL:-http://host.docker.internal:11434}
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- "127.0.0.1:8000:8000"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5
# ── mlops profile — MLflow ────────────────────────────────────────────────
# Start: docker compose --profile mlops up
# MLflow UI: http://localhost:5000 or https://o.alogins.net/mlflow
# ── events profile — NATS JetStream ─────────────────────────────────────
# Start: docker compose --profile events up
# NATS monitoring: http://localhost:8222
# Enable in the API by setting NATS_URL=nats://nats:4222 in .env.local
nats:
image: nats:2.10-alpine
profiles: [events, full]
command: ["-js", "-sd", "/data", "-m", "8222"]
volumes:
- /mnt/ssd/dbs/oo/nats:/data
ports:
- "127.0.0.1:4222:4222" # client connections
- "127.0.0.1:8222:8222" # HTTP monitoring
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8222/healthz"]
interval: 10s
timeout: 5s
retries: 5
mlflow:
image: ghcr.io/mlflow/mlflow:v3.11.1
profiles: [mlops]
command: >
mlflow server
--backend-store-uri sqlite:////mlflow/mlflow.db
--default-artifact-root /mlflow/artifacts
--host 0.0.0.0
--port 5000
--static-prefix /mlflow
--allowed-hosts o.alogins.net,localhost
--cors-allowed-origins https://o.alogins.net
volumes:
- /mnt/ssd/dbs/oo/mlflow:/mlflow
ports:
- "127.0.0.1:5000:5000"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:5000/mlflow/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5