Files
oO/infra/docker/docker-compose.yml
alvis c43dbaf23d feat(serving): add MLflow tracing to ml-serving for all agent calls
Logs one MLflow run per /recommend (params, token metrics, latency,
full prompt + tip as artifacts) and per /agents/{id}/compute and
/infer call (signals snapshot, inferred prefs, latency).

Tracing is a no-op when MLFLOW_TRACKING_URI is unset; ml-serving
starts and serves tips correctly without MLflow configured.

Refs #118 (M4: remove from production / move off critical path).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 10:30:24 +00:00

130 lines
4.3 KiB
YAML

# Compose project name.
name: oo
# Services below are grouped by profile (core, full, events, mlops); a group is
# started with `docker compose --profile <name> up`.
services:
# ── core profile ──────────────────────────────────────────────────────────
api:
  # Backend API container; started by both the `core` and `full` profiles.
  build:
    context: ../..
    dockerfile: infra/docker/Dockerfile.api
  profiles: [core, full]
  env_file: ../../.env.local
  environment:
    NODE_ENV: production
    # In-network URLs, addressed by Compose service name.
    # NOTE(review): `ml-serving` is only in the `full` profile and `mlflow`
    # only in `mlops`, so these hosts resolve only when those profiles are
    # also enabled — confirm the API tolerates them being absent.
    ML_SERVING_URL: "http://ml-serving:8000"
    MLFLOW_URL: "http://mlflow:5000"
    # Interpolated by Compose; falls back to an empty string when unset.
    INTERNAL_API_TOKEN: "${INTERNAL_API_TOKEN:-}"
  volumes:
    - /mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:3078:3078"
  healthcheck:
    # Probes /health from inside the container and exits 0 only when the
    # response is OK (2xx). The IPv4 loopback literal is used instead of
    # `localhost` so the probe does not depend on whether `localhost` resolves
    # to ::1 or 127.0.0.1 inside the container.
    test:
      - "CMD"
      - "node"
      - "-e"
      - "fetch('http://127.0.0.1:3078/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"
    interval: 10s
    timeout: 5s
    retries: 5
web:
  # Web front-end container; part of the `core` and `full` profiles.
  profiles:
    - core
    - full
  build:
    dockerfile: infra/docker/Dockerfile.web
    context: ../..
  # Started only after the `api` service reports healthy.
  depends_on:
    api:
      condition: service_healthy
  env_file: ../../.env.local
  environment:
    NODE_ENV: production
    HOSTNAME: "0.0.0.0"
    PORT: "3079"
    NEXT_PUBLIC_API_URL: ""  # Caddy routes /api/* directly to the API in prod
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:3079:3079"
admin:
  # Admin front-end container; part of the `core` and `full` profiles.
  profiles:
    - core
    - full
  build:
    dockerfile: infra/docker/Dockerfile.admin
    context: ../..
  # Started only after the `api` service reports healthy.
  depends_on:
    api:
      condition: service_healthy
  env_file: ../../.env.local
  environment:
    NODE_ENV: production
    HOSTNAME: "0.0.0.0"
    PORT: "3080"
    NEXT_PUBLIC_API_URL: ""
    NEXT_PUBLIC_MLFLOW_URL: "/mlflow"
    # In-network address of the API, by Compose service name.
    INTERNAL_API_URL: "http://api:3078"
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:3080:3080"
# ── full profile ──────────────────────────────────────────────────────────
ml-serving:
  # ML serving container; started only by the `full` profile.
  build:
    context: ../..
    dockerfile: infra/docker/Dockerfile.ml
  profiles: [full]
  env_file: ../../.env.local
  environment:
    # Interpolated values are quoted, matching the `api` service, so the
    # `${...}` expressions are always read as plain strings.
    # Both default to endpoints on the Docker host unless overridden.
    LITELLM_URL: "${LITELLM_URL:-http://host.docker.internal:4000}"
    OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}"
    # Empty by default; MLflow tracing is a no-op when this is unset.
    MLFLOW_TRACKING_URI: "${MLFLOW_TRACKING_URI:-}"
  extra_hosts:
    # Makes host.docker.internal resolve to the host gateway.
    - "host.docker.internal:host-gateway"
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:8000:8000"
  healthcheck:
    # Exits 0 only when GET /health returns HTTP 200 within 3 seconds.
    test:
      - "CMD"
      - "python"
      - "-c"
      - "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health',timeout=3).status==200 else 1)"
    interval: 10s
    timeout: 5s
    retries: 5
# ── mlops profile — MLflow (the `mlflow` service is defined after `nats`) ──
# Start: docker compose --profile mlops up
# MLflow UI: http://localhost:5000/mlflow (served under --static-prefix) or https://o.alogins.net/mlflow
# ── events profile — NATS JetStream ─────────────────────────────────────
# Start: docker compose --profile events up
# NATS monitoring: http://localhost:8222
# Enable in the API by setting NATS_URL=nats://nats:4222 in .env.local
nats:
  # NATS server container; part of the `events` and `full` profiles.
  image: nats:2.10-alpine
  profiles:
    - events
    - full
  # Flags: -js, -sd /data (backed by the volume below), -m 8222 (the
  # monitoring port probed by the healthcheck).
  command:
    - "-js"
    - "-sd"
    - "/data"
    - "-m"
    - "8222"
  volumes:
    - /mnt/ssd/dbs/oo/nats:/data
  ports:
    - "127.0.0.1:4222:4222"  # client connections
    - "127.0.0.1:8222:8222"  # HTTP monitoring
  healthcheck:
    # Quiet spider request against the monitoring endpoint's /healthz.
    test:
      - "CMD"
      - "wget"
      - "--spider"
      - "-q"
      - "http://localhost:8222/healthz"
    interval: 10s
    timeout: 5s
    retries: 5
mlflow:
  # MLflow tracking server; started only by the `mlops` profile.
  image: ghcr.io/mlflow/mlflow:v3.11.1
  profiles:
    - mlops
  # SQLite backend store and artifact root both live under /mlflow (the
  # mounted volume); the server is exposed under the /mlflow path prefix.
  # NOTE(review): only o.alogins.net and localhost are allow-listed as hosts.
  # Confirm that in-network clients using http://mlflow:5000 (e.g. the api
  # service's MLFLOW_URL) pass the server's host validation.
  command: >
    mlflow server
    --backend-store-uri sqlite:////mlflow/mlflow.db
    --default-artifact-root /mlflow/artifacts
    --host 0.0.0.0
    --port 5000
    --static-prefix /mlflow
    --allowed-hosts o.alogins.net,localhost
    --cors-allowed-origins https://o.alogins.net
  volumes:
    - /mnt/ssd/dbs/oo/mlflow:/mlflow
  ports:
    # Published on the host's loopback interface only.
    - "127.0.0.1:5000:5000"
  healthcheck:
    # Exits 0 only when GET /mlflow/health returns HTTP 200 within 3 seconds.
    test:
      - "CMD"
      - "python"
      - "-c"
      - "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:5000/mlflow/health',timeout=3).status==200 else 1)"
    interval: 10s
    timeout: 5s
    retries: 5