feat: MLOps external services, AI stack planning, admin MLOps hub

Infrastructure:
- Add `mlops` compose profile: MLflow (basic-auth, /mlflow path) + Airflow (LocalExecutor, /airflow path) + airflow-db
- infra/mlflow/basic_auth.ini for MLflow auth config
- Caddy routes /mlflow* and /airflow* inside existing o.alogins.net block (see agap_git)
- Dockerfile.admin: NEXT_PUBLIC_MLFLOW_URL / NEXT_PUBLIC_AIRFLOW_URL build args (default /mlflow, /airflow)

Admin panel:
- /admin/models: replace MLflow iframe with external link cards
- /admin/experiments: replace LinUCB stats with MLOps hub (links to MLflow experiments/models + Airflow DAGs/datasets)
- AdminShell: external nav links for MLflow ↗ and Airflow ↗ under MLOps section

Docs & planning:
- README: new AI stack section (Ollama/LiteLLM/OpenWebUI three-tier, tip generation pipeline, model aliases)
- README: Phase 2 expanded with AI infra issues (#86-#93) and granular pipeline breakdown
- README: Phase 4 expanded with LLM MLOps items (#94-#97)
- CLAUDE.md: AI stack section, updated current phase (M1 shipped / M2 in progress), compose profiles, updated What NOT to do
- docs/architecture/overview.md: AI stack section, updated decision flow diagram for Phase 2 LLM pipeline
- ADR-0006: updated to reflect external services (path-based, not embedded)
- Gitea issues #86-#97 created (M2: AI infra + pipeline; M4: LLM MLOps)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 08:20:44 +00:00
parent faf44c18fc
commit 85367aeaa0
25 changed files with 695 additions and 222 deletions

View File

@@ -0,0 +1,32 @@
# Multi-stage build for the admin app (@oo/admin): base -> deps -> builder -> runner.
# Stage "base": Node 22 on Alpine with pnpm installed globally; parent image of
# the deps and builder stages.
FROM node:22-alpine AS base
RUN npm install -g pnpm
# Stage "deps": install workspace dependencies. Only the package manifests and
# lockfile are copied in before the install, so files under apps/ and packages/
# other than package.json are not inputs to this stage.
FROM base AS deps
WORKDIR /app
# The trailing * makes pnpm-lock.yaml optional for this COPY.
# NOTE(review): `--frozen-lockfile` below is expected to fail when no lockfile
# is present, so the glob only defers that error - confirm this is intended.
COPY package.json pnpm-workspace.yaml pnpm-lock.yaml* ./
COPY packages/shared-types/package.json ./packages/shared-types/
COPY apps/admin/package.json ./apps/admin/
RUN pnpm install --frozen-lockfile
# Stage "builder": compile @oo/shared-types first, then the admin app.
FROM base AS builder
WORKDIR /app
# Reuse the node_modules trees produced by the deps stage (workspace root plus
# one per workspace package).
COPY --from=deps /app/node_modules ./node_modules
COPY --from=deps /app/packages/shared-types/node_modules ./packages/shared-types/node_modules
COPY --from=deps /app/apps/admin/node_modules ./apps/admin/node_modules
COPY tsconfig.base.json ./
COPY packages/shared-types ./packages/shared-types
COPY apps/admin ./apps/admin
RUN pnpm --filter @oo/shared-types build
# Build args for the MLflow / Airflow link targets; they default to the
# /mlflow and /airflow paths when not supplied.
ARG NEXT_PUBLIC_MLFLOW_URL=/mlflow
ARG NEXT_PUBLIC_AIRFLOW_URL=/airflow
# Re-exported as environment variables so the admin build step below can read
# them; telemetry is disabled for the build as well.
ENV NEXT_TELEMETRY_DISABLED=1 \
NEXT_PUBLIC_MLFLOW_URL=$NEXT_PUBLIC_MLFLOW_URL \
NEXT_PUBLIC_AIRFLOW_URL=$NEXT_PUBLIC_AIRFLOW_URL
RUN pnpm --filter @oo/admin build
# Stage "runner": fresh node:22-alpine image (not derived from "base", so no
# pnpm) that receives only the build output.
FROM node:22-alpine AS runner
ENV NODE_ENV=production NEXT_TELEMETRY_DISABLED=1 PORT=3080
WORKDIR /app
# Copy the standalone output and the static assets out of the builder stage;
# nothing else from the build is carried over.
COPY --from=builder /app/apps/admin/.next/standalone ./
COPY --from=builder /app/apps/admin/.next/static ./apps/admin/.next/static
# Start the server entry point that was copied in with the standalone output.
CMD ["node", "apps/admin/server.js"]

View File

@@ -22,7 +22,7 @@ RUN pnpm --filter @oo/api build
FROM node:22-alpine AS runner
WORKDIR /app
RUN npm install -g pnpm
COPY package.json pnpm-workspace.yaml ./
COPY package.json pnpm-workspace.yaml pnpm-lock.yaml* ./
COPY packages/shared-types/package.json ./packages/shared-types/
COPY services/api/package.json ./services/api/
RUN pnpm install --prod --frozen-lockfile

View File

@@ -10,15 +10,13 @@ services:
profiles: [core, full]
env_file: ../../.env.local
environment:
DATABASE_PATH: /data/oo.db
PORT: "3001"
NODE_ENV: production
volumes:
- api-data:/data
- /mnt/ssd/dbs/oo:/mnt/ssd/dbs/oo
ports:
- "3001:3001"
- "127.0.0.1:3078:3078"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3001/health"]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3078/health"]
interval: 10s
timeout: 5s
retries: 5
@@ -30,9 +28,30 @@ services:
profiles: [core, full]
env_file: ../../.env.local
environment:
NEXT_PUBLIC_API_URL: "" # rewrites proxy to /api, no cross-origin needed in prod
NODE_ENV: production
PORT: "3079"
HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: "" # Caddy routes /api/* directly to the API in prod
ports:
- "3000:3000"
- "127.0.0.1:3079:3079"
depends_on:
api:
condition: service_healthy
# Admin panel service, built from infra/docker/Dockerfile.admin with the
# repository root as build context. Started with the core and full profiles.
admin:
  profiles:
    - core
    - full
  build:
    context: ../..
    dockerfile: infra/docker/Dockerfile.admin
  env_file: ../../.env.local
  # Wait until the api service reports healthy before starting.
  depends_on:
    api:
      condition: service_healthy
  # Published on the loopback interface only.
  ports:
    - "127.0.0.1:3080:3080"
  environment:
    NODE_ENV: production
    HOSTNAME: "0.0.0.0"
    PORT: "3080"
    # Address of the api service on the compose network.
    INTERNAL_API_URL: "http://api:3078"
    NEXT_PUBLIC_API_URL: ""
@@ -45,12 +64,117 @@ services:
dockerfile: infra/docker/Dockerfile.ml
profiles: [full]
ports:
- "8000:8000"
- "127.0.0.1:8000:8000"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8000/health"]
interval: 10s
timeout: 5s
retries: 5
# Named volume declarations for the stack.
# NOTE(review): the api service hunk shows both an `api-data:/data` mount and a
# /mnt/ssd bind mount - confirm api-data is still referenced after this change.
# NOTE(review): the mlops service definitions follow this key directly in this
# view; confirm that in the real file they are nested under `services:` and
# not under `volumes:`.
volumes:
api-data:
# ── mlops profile — MLflow + Airflow ──────────────────────────────────────
# Start: docker compose --profile mlops up
# MLflow UI: http://localhost:5000/mlflow or https://o.alogins.net/mlflow (admin / password — change via basic_auth.ini)
# Airflow UI: http://localhost:8080/airflow or https://o.alogins.net/airflow (admin / AIRFLOW_ADMIN_PASSWORD)
# Caddy routes /mlflow* and /airflow* inside the o.alogins.net block
# PostgreSQL instance that the Airflow services connect to
# (host `airflow-db`, database `airflow`). Part of the mlops profile.
airflow-db:
  profiles:
    - mlops
  image: postgres:16-alpine
  environment:
    POSTGRES_USER: airflow
    POSTGRES_DB: airflow
    # Falls back to "airflow" when AIRFLOW_DB_PASSWORD is unset or empty.
    POSTGRES_PASSWORD: ${AIRFLOW_DB_PASSWORD:-airflow}
  volumes:
    # Database files live on a host path so they outlive the container.
    - /mnt/ssd/dbs/oo/airflow-db:/var/lib/postgresql/data
  healthcheck:
    # Healthy once Postgres accepts connections for the airflow user.
    test:
      - CMD-SHELL
      - pg_isready -U airflow
    interval: 10s
    timeout: 5s
    retries: 5
# One-shot bootstrap job for the mlops profile: migrates the Airflow metadata
# database, creates the `admin` UI user, then exits (restart: "no").
# The webserver and scheduler wait for it via `service_completed_successfully`.
airflow-init:
  image: apache/airflow:2.9.3
  profiles: [mlops]
  entrypoint: /bin/bash
  command:
    - -c
    - |
      # Abort on the first failing command so a broken migration fails the job
      # instead of being masked by the exit status of the last command.
      set -e
      airflow db migrate
      airflow users create \
        --username admin \
        --firstname Admin \
        --lastname User \
        --role Admin \
        --email admin@oo.local \
        --password "$${AIRFLOW_ADMIN_PASSWORD:-admin}"
  environment:
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
    AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
    # The `$$` in the script above defers expansion to the shell inside the
    # container, so the variable has to be forwarded here; without it the
    # script always falls back to the default password "admin".
    AIRFLOW_ADMIN_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-admin}
  depends_on:
    airflow-db:
      condition: service_healthy
  restart: "no"
# Airflow web UI for the mlops profile. Starts only after airflow-init has
# completed successfully; published on the loopback interface at port 8080.
airflow-webserver:
  image: apache/airflow:2.9.3
  profiles: [mlops]
  command: webserver
  environment:
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
    AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
    # The path component of this URL (/airflow by default) is the prefix the
    # webserver serves every route under.
    AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW_BASE_URL:-https://o.alogins.net/airflow}
  volumes:
    # DAG definitions are mounted read-only from the repository.
    - ../../ml/pipelines:/opt/airflow/dags:ro
  ports:
    - "127.0.0.1:8080:8080"
  depends_on:
    airflow-init:
      condition: service_completed_successfully
  healthcheck:
    # The health endpoint lives under the base-URL path prefix, so a bare
    # /health returns 404 with the default configuration. Keep this path in
    # sync with the path component of AIRFLOW_BASE_URL if that is overridden.
    test: ["CMD", "curl", "--fail", "http://localhost:8080/airflow/health"]
    interval: 30s
    timeout: 10s
    retries: 5
    start_period: 60s
# Airflow scheduler for the mlops profile. With LocalExecutor, tasks run in
# this container. Starts only after airflow-init has completed successfully.
airflow-scheduler:
  image: apache/airflow:2.9.3
  profiles: [mlops]
  command: scheduler
  environment:
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${AIRFLOW_DB_PASSWORD:-airflow}@airflow-db/airflow
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY:-}
    # Must match the webserver's value: the webserver signs its requests for
    # task logs with this key, and no log volume is shared between the two
    # containers, so a mismatch leaves task logs unreadable in the UI.
    AIRFLOW__WEBSERVER__SECRET_KEY: ${AIRFLOW_SECRET_KEY:-change-me-in-prod}
  volumes:
    # DAG definitions are mounted read-only from the repository.
    - ../../ml/pipelines:/opt/airflow/dags:ro
  depends_on:
    airflow-init:
      condition: service_completed_successfully
# MLflow tracking server for the mlops profile, run with the basic-auth app
# and with the UI served under the /mlflow prefix. Published on the loopback
# interface at port 5000.
mlflow:
  image: ghcr.io/mlflow/mlflow:2.14.3
  profiles: [mlops]
  # Folded scalar: the lines below are joined into one command line, so they
  # must all stay at the same indentation.
  command: >
    mlflow server
    --backend-store-uri sqlite:////mlflow/mlflow.db
    --default-artifact-root /mlflow/artifacts
    --host 0.0.0.0
    --port 5000
    --app-name basic-auth
    --static-prefix /mlflow
  environment:
    MLFLOW_AUTH_CONFIG_PATH: /mlflow/basic_auth.ini
  volumes:
    # Backend store database and artifacts live on the host under this path.
    - /mnt/ssd/dbs/oo/mlflow:/mlflow
    # Auth configuration from the repository, mounted read-only over the data dir.
    - ../../infra/mlflow/basic_auth.ini:/mlflow/basic_auth.ini:ro
  ports:
    - "127.0.0.1:5000:5000"
  healthcheck:
    # Probe with the image's own Python interpreter rather than curl: the
    # upstream MLflow image is a slim Python image and is not expected to ship
    # curl, which would make a curl-based check fail permanently.
    # urlopen raises (non-zero exit) on connection errors and HTTP >= 400.
    test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health', timeout=3)"]
    interval: 10s
    timeout: 5s
    retries: 5