feat(simulate): MLflow tracking, Airflow DAG integration, health checks for mlflow/airflow

- sim_runs schema: add judge_mode, n_policies, airflow_dag_run_id, mlflow_run_id columns
- admin health endpoint: add mlflow + airflow checks (Basic auth for Airflow API)
- admin nav: add Simulations page link; rename section label
- runner.py: optional MLflow experiment tracking; multi-policy support
- sim_dag.py: Airflow DAG for offline sim pipeline
- admin simulate page + API client methods for sim runs
- shared-types tsconfig: exclude test files from build

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 12:08:36 +00:00
parent e96ceb7ee1
commit bad1bb2cba
12 changed files with 818 additions and 107 deletions

View File

@@ -156,6 +156,10 @@ export function runMigrations() {
`ALTER TABLE tip_scores ADD COLUMN prompt_version TEXT`,
`ALTER TABLE tip_scores ADD COLUMN llm_model TEXT`,
`ALTER TABLE tip_scores ADD COLUMN tip_kind TEXT`,
`ALTER TABLE sim_runs ADD COLUMN airflow_dag_run_id TEXT`,
`ALTER TABLE sim_runs ADD COLUMN mlflow_run_id TEXT`,
`ALTER TABLE sim_runs ADD COLUMN judge_mode TEXT NOT NULL DEFAULT 'rule'`,
`ALTER TABLE sim_runs ADD COLUMN n_policies INTEGER NOT NULL DEFAULT 2`,
]) {
try { sqlite.exec(stmt); } catch { /* column already exists */ }
}

View File

@@ -112,9 +112,13 @@ export const simRuns = sqliteTable('sim_runs', {
tasksPerRound: integer('tasks_per_round').notNull().default(8),
useLlm: integer('use_llm', { mode: 'boolean' }).notNull().default(false),
status: text('status').notNull().default('pending'), // 'pending'|'running'|'done'|'failed'
judgeMode: text('judge_mode').notNull().default('rule'),
nPolicies: integer('n_policies').notNull().default(2),
summaryJson: text('summary_json'), // JSON: { [policy]: PolicySummary }
winner: text('winner'),
personaBreakdownJson: text('persona_breakdown_json'), // JSON: { [persona]: { [policy]: {reward,n} } }
airflowDagRunId: text('airflow_dag_run_id'),
mlflowRunId: text('mlflow_run_id'),
createdAt: text('created_at').notNull(),
finishedAt: text('finished_at'),
});

View File

@@ -15,7 +15,7 @@ import { integrationsRouter } from './routes/integrations.js';
import { recommenderRouter } from './routes/recommender.js';
import { userRouter } from './routes/user.js';
import { pushRouter } from './routes/push.js';
import { adminRouter } from './routes/admin.js';
import { adminRouter, adminInternalRouter } from './routes/admin.js';
import { mkdir } from 'fs/promises';
import { dirname } from 'path';
import { requireAuth } from './middleware/session.js';
@@ -65,6 +65,7 @@ app.use('/api', recommenderRouter);
app.use('/api/user', userRouter);
app.use('/api/push', pushRouter);
app.use('/api/admin', adminRouter);
app.use('/api/admin', adminInternalRouter);
app.use('/api/ml', requireAuth as any, requireAdmin as any, async (req: Request, res: Response) => {
const mlUrl = config.ML_SERVING_URL;

View File

@@ -4,7 +4,7 @@
* A real Express app + in-memory SQLite DB per test suite.
* Auth and admin middleware are mocked so we can focus on route logic.
*/
import { describe, it, expect, vi, beforeAll } from 'vitest';
import { describe, it, expect, vi, beforeAll, afterEach } from 'vitest';
import express from 'express';
import * as http from 'http';
import { makeTestDb } from '../../test/db.js';
@@ -385,16 +385,126 @@ describe('GET /api/admin/events', () => {
});
});
// ---------------------------------------------------------------------------
// Health endpoint — mock fetch so tests don't depend on running services.
// ---------------------------------------------------------------------------
describe('GET /api/admin/health', () => {
it('returns 200 with ok, services array, and checkedAt', async () => {
// Services the health endpoint probes over HTTP (via fetch); the tests below
// expect each of them to appear in the response.
const EXPECTED_HTTP_SERVICES = ['api', 'ml-serving', 'mlflow', 'airflow'] as const;
// Checks that are reported alongside the HTTP probes without going through fetch.
const EXPECTED_INTERNAL = ['sqlite', 'event-bus'] as const;
// Every value a service row's `status` field is allowed to take.
const VALID_STATUSES = new Set(['ok', 'degraded', 'down']);
// One entry of the `services` array in the health response.
type ServiceRow = { name: string; status: string; latencyMs: number };
// JSON body returned by GET /api/admin/health.
type HealthBody = { ok: boolean; services: ServiceRow[]; checkedAt: string };
/**
 * Stub the global `fetch` so the health route sees a controlled set of
 * reachable services instead of real network dependencies.
 *
 * @param upServices names of the services that should answer with HTTP 200;
 *                   any service not listed rejects, which simulates a refused
 *                   connection and is reported as 'down'.
 */
function mockFetch(upServices: Set<string>) {
  // The service is identified by the port in the requested URL (matches
  // defaults in config.ts); anything unrecognised is treated as the API itself.
  const serviceFor = (target: string): string => {
    if (target.includes(':8000')) return 'ml-serving';
    if (target.includes(':5000')) return 'mlflow';
    if (target.includes(':8080')) return 'airflow';
    return 'api';
  };

  vi.stubGlobal('fetch', async (url: string) => {
    const service = serviceFor(String(url));
    if (upServices.has(service)) {
      return { ok: true, json: async () => ({ ok: true, status: 'healthy' }) };
    }
    throw new Error(`ECONNREFUSED ${service}`);
  });
}
// Undo every vi.stubGlobal() call after each test so a stubbed fetch does not
// carry over into the next test.
afterEach(() => vi.unstubAllGlobals());
it('shape: 200, typed fields, all expected services present', async () => {
mockFetch(new Set(['api', 'ml-serving', 'mlflow', 'airflow']));
const { server, call } = await startServer(buildApp());
try {
const { status, body } = await call('GET', '/api/admin/health');
const b = body as { ok: boolean; services: { name: string; status: string }[]; checkedAt: string };
const b = body as HealthBody;
expect(status).toBe(200);
expect(typeof b.ok).toBe('boolean');
expect(Array.isArray(b.services)).toBe(true);
expect(typeof b.checkedAt).toBe('string');
expect(new Date(b.checkedAt).getTime()).toBeGreaterThan(0);
const names = b.services.map((s) => s.name);
for (const svc of [...EXPECTED_HTTP_SERVICES, ...EXPECTED_INTERNAL]) {
expect(names).toContain(svc);
}
for (const svc of b.services) {
expect(VALID_STATUSES).toContain(svc.status);
expect(typeof svc.latencyMs).toBe('number');
}
} finally {
server.close();
}
});
// Happy path: every HTTP service is stubbed as reachable, so each must be
// reported as 'ok' and the aggregate `ok` flag must be true.
it('ok=true when all HTTP services respond 200', async () => {
  mockFetch(new Set(['api', 'ml-serving', 'mlflow', 'airflow']));
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    EXPECTED_HTTP_SERVICES.forEach((name) => {
      const row = health.services.find((entry) => entry.name === name);
      expect(row?.status, `${name} should be ok`).toBe('ok');
    });
    expect(health.ok).toBe(true);
  } finally {
    server.close();
  }
});
// A single unreachable dependency (ml-serving) must be reported as 'down' and
// must flip the aggregate `ok` flag to false.
it('ml-serving=down and ok=false when ml-serving is unreachable', async () => {
  mockFetch(new Set(['api', 'mlflow', 'airflow'])); // ml-serving absent
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'ml-serving');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
// Same scenario as the ml-serving case, but with Airflow as the unreachable
// dependency.
it('airflow=down and ok=false when airflow is unreachable', async () => {
  mockFetch(new Set(['api', 'ml-serving', 'mlflow'])); // airflow absent
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'airflow');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
// Same scenario as the ml-serving case, but with MLflow as the unreachable
// dependency.
it('mlflow=down and ok=false when mlflow is unreachable', async () => {
  mockFetch(new Set(['api', 'ml-serving', 'airflow'])); // mlflow absent
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'mlflow');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
it('sqlite and event-bus are always present regardless of HTTP service status', async () => {
mockFetch(new Set()); // all HTTP services down
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');
const b = body as HealthBody;
expect(b.services.find((s) => s.name === 'sqlite')?.status).toBe('ok');
expect(b.services.find((s) => s.name === 'event-bus')?.status).toBe('ok');
} finally {
server.close();
}

View File

@@ -1,4 +1,4 @@
import { type Router as ExpressRouter, Router, Response } from 'express';
import { type Router as ExpressRouter, Router, Response, type Request } from 'express';
import { logger } from '../logger.js';
import { db, rawSqlite } from '../db/index.js';
import {
@@ -524,16 +524,24 @@ router.get('/data-quality', async (req: AuthenticatedRequest, res: Response) =>
// Fan-out to all subsystem /health endpoints.
// ---------------------------------------------------------------------------
router.get('/health', async (_req: AuthenticatedRequest, res: Response) => {
const checks: Array<{ name: string; url: string }> = [
{ name: 'api', url: `http://localhost:${process.env.PORT ?? 3001}/health` },
const airflowAuth = Buffer.from(`${config.AIRFLOW_API_USER}:${config.AIRFLOW_API_PASSWORD}`).toString('base64');
const checks: Array<{ name: string; url: string; headers?: Record<string, string> }> = [
{ name: 'api', url: `http://localhost:${config.PORT}/health` },
{ name: 'ml-serving', url: `${config.ML_SERVING_URL}/health` },
{ name: 'mlflow', url: `${config.MLFLOW_URL}/health` },
{ name: 'airflow', url: `${config.AIRFLOW_URL}/api/v1/health`,
headers: { Authorization: `Basic ${airflowAuth}` } },
];
const results = await Promise.allSettled(
checks.map(async ({ name, url }) => {
checks.map(async ({ name, url, headers }) => {
const t0 = Date.now();
try {
const r = await fetch(url, { signal: AbortSignal.timeout(3000) });
const r = await fetch(url, {
headers,
signal: AbortSignal.timeout(3000),
});
return { name, status: r.ok ? 'ok' : 'degraded', latencyMs: Date.now() - t0 };
} catch {
return { name, status: 'down', latencyMs: Date.now() - t0 };
@@ -549,15 +557,12 @@ router.get('/health', async (_req: AuthenticatedRequest, res: Response) => {
dbStatus = 'down';
}
// Event bus: always ok if process is alive
const eventBusStatus = 'ok';
const services = results.map((r) =>
r.status === 'fulfilled' ? r.value : { name: 'unknown', status: 'down', latencyMs: 0 },
);
services.push({ name: 'sqlite', status: dbStatus, latencyMs: 0 });
services.push({ name: 'event-bus', status: eventBusStatus, latencyMs: 0 });
services.push({ name: 'sqlite', status: dbStatus, latencyMs: 0 });
services.push({ name: 'event-bus', status: 'ok', latencyMs: 0 });
const allOk = services.every((s) => s.status === 'ok');
res.json({ ok: allOk, services, checkedAt: new Date().toISOString() });
@@ -700,22 +705,21 @@ router.delete('/saved-queries/:id', async (req: AuthenticatedRequest, res: Respo
// ---------------------------------------------------------------------------
// POST /api/admin/simulate/start
// Spawn ml/experiments/sim/runner.py in the background; return run_id.
// Trigger an Airflow DAG run (bandit_sim). Falls back to a local subprocess
// when AIRFLOW_URL is not reachable, so local dev still works.
// ---------------------------------------------------------------------------
router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response) => {
const {
nUsers = 5,
nRounds = 20,
tasksPerRound = 8,
useLlm = false,
judgeMode = 'rule',
policies = ['linucb-v1', 'egreedy-v1'],
} = req.body as {
nUsers?: number;
nRounds?: number;
tasksPerRound?: number;
useLlm?: boolean;
judgeMode?: 'rule' | 'llm' | 'claude-code';
judgeMode?: 'rule' | 'llm';
policies?: string[];
};
@@ -734,17 +738,69 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
nUsers,
nRounds,
tasksPerRound,
useLlm,
useLlm: judgeMode === 'llm',
judgeMode,
nPolicies: policies.length,
status: 'running',
createdAt: now,
});
// ── Try Airflow first ────────────────────────────────────────────────────
if (config.AIRFLOW_URL && config.INTERNAL_API_TOKEN) {
try {
const airflowAuth = Buffer.from(
`${config.AIRFLOW_API_USER}:${config.AIRFLOW_API_PASSWORD}`,
).toString('base64');
const dagRes = await fetch(
`${config.AIRFLOW_URL}/api/v1/dags/bandit_sim/dagRuns`,
{
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Basic ${airflowAuth}`,
},
body: JSON.stringify({
conf: {
sim_run_id: id,
n_users: nUsers,
n_rounds: nRounds,
tasks_per_round: tasksPerRound,
policies,
judge_mode: judgeMode,
ml_url: config.ML_SERVING_URL,
mlflow_url: config.MLFLOW_URL,
callback_url: `${config.API_BASE_URL}/api/admin/simulate/${id}/complete`,
internal_token: config.INTERNAL_API_TOKEN,
},
}),
signal: AbortSignal.timeout(5000),
},
);
if (dagRes.ok) {
const dagBody = await dagRes.json() as { dag_run_id: string };
await db
.update(simRuns)
.set({ airflowDagRunId: dagBody.dag_run_id })
.where(eq(simRuns.id, id));
res.json({ id, status: 'running', airflow_dag_run_id: dagBody.dag_run_id });
return;
}
logger.warn({ status: dagRes.status }, 'sim: Airflow trigger failed, falling back to subprocess');
} catch (err) {
logger.warn({ err }, 'sim: Airflow unreachable, falling back to subprocess');
}
}
// ── Subprocess fallback (local dev / Airflow not configured) ────────────
const runnerPath = resolve(__dirname, '../../../../ml/experiments/sim/runner.py');
const venvPython = resolve(__dirname, '../../../../ml/serving/.venv/bin/python');
const pythonBin = existsSync(venvPython) ? venvPython : 'python3';
const outPath = `/tmp/oo-sim-${id}.json`;
const args = [
const child = spawn(pythonBin, [
runnerPath,
'--n-users', String(nUsers),
'--n-rounds', String(nRounds),
@@ -752,32 +808,22 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
'--ml-url', config.ML_SERVING_URL,
'--policies', ...policies,
'--out', outPath,
'--judge', judgeMode === 'llm' ? 'llm' : judgeMode === 'claude-code' ? 'rule' : 'rule',
// claude-code mode isn't auto-runnable from the API (requires human in the loop)
// it falls back to rule judge when triggered from the panel
];
'--judge', judgeMode,
'--mlflow-url', config.MLFLOW_URL,
'--mlflow-experiment', 'bandit_simulation',
], { stdio: ['ignore', 'pipe', 'pipe'] });
const child = spawn(pythonBin, args, { stdio: ['ignore', 'pipe', 'pipe'] });
if (child.pid) _simProcesses.set(id, { pid: child.pid, startedAt: now });
if (child.pid) {
_simProcesses.set(id, { pid: child.pid, startedAt: now });
}
// Without this listener, a spawn failure (ENOENT when python3 is absent
// — e.g. in the alpine api container) would emit an unhandled 'error' event
// and crash the whole API process.
child.on('error', async (err) => {
logger.error({ err }, 'sim: spawn error');
_simProcesses.delete(id);
await db
.update(simRuns)
await db.update(simRuns)
.set({ status: 'failed', finishedAt: new Date().toISOString() })
.where(eq(simRuns.id, id));
});
// Capture stderr for debugging
const stderrLines: string[] = [];
child.stderr?.on('data', (d: Buffer) => stderrLines.push(d.toString()));
child.stderr?.on('data', (d: Buffer) => logger.debug({ stderr: d.toString() }, 'sim stderr'));
child.on('exit', async (code) => {
_simProcesses.delete(id);
@@ -786,8 +832,6 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
if (code === 0 && existsSync(outPath)) {
try {
const raw = JSON.parse(readFileSync(outPath, 'utf-8'));
// Bulk-insert sim events
const eventRows = (raw.events ?? []).map((ev: Record<string, unknown>) => ({
id: nanoid(),
runId: id,
@@ -805,21 +849,19 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
dayOfWeek: Number(ev.day_of_week),
createdAt: now,
}));
for (const row of eventRows) {
await db.insert(simEvents).values(row).catch(() => {});
}
await db.update(simRuns).set({
status: 'done',
summaryJson: JSON.stringify(raw.summary),
winner: raw.winner,
personaBreakdownJson: JSON.stringify(raw.persona_breakdown),
mlflowRunId: raw.mlflow_run_id ?? null,
finishedAt,
}).where(eq(simRuns.id, id));
try { unlinkSync(outPath); } catch { /* ignore */ }
} catch (e) {
} catch {
await db.update(simRuns).set({ status: 'failed', finishedAt }).where(eq(simRuns.id, id));
}
} else {
@@ -864,4 +906,68 @@ router.get('/simulate/:id', async (req: AuthenticatedRequest, res: Response) =>
res.json({ run: { ...run, isRunning }, events });
});
export { router as adminRouter };
// ---------------------------------------------------------------------------
// internalRouter — no session auth; only INTERNAL_API_TOKEN header check.
// Mounted separately in index.ts at /api/admin to avoid router.use() auth.
// ---------------------------------------------------------------------------
const internalRouter: ExpressRouter = Router();

/**
 * POST /simulate/:id/complete
 *
 * Completion callback for a simulation run. This is the URL passed as
 * `callback_url` in the Airflow DAG run conf by POST /simulate/start.
 *
 * Auth: the `x-internal-token` header must equal config.INTERNAL_API_TOKEN.
 * When no token is configured every request is rejected.
 *
 * Body: { summary, winner, persona_breakdown, events, mlflow_run_id? }.
 *
 * Responses:
 *   200 { ok: true }   — events stored (best effort) and run marked 'done'
 *   400 { error }      — no JSON object body was supplied
 *   401 { error }      — missing or wrong internal token
 *   500 { error }      — storing the results failed; the run is marked 'failed'
 */
internalRouter.post('/simulate/:id/complete', async (req: Request, res: Response) => {
  const token = req.headers['x-internal-token'];
  if (!config.INTERNAL_API_TOKEN || token !== config.INTERNAL_API_TOKEN) {
    res.status(401).json({ error: 'Unauthorized' });
    return;
  }

  // Destructuring an absent body would throw outside the try block below and
  // leave the request without a response, so reject it explicitly.
  if (req.body == null || typeof req.body !== 'object') {
    res.status(400).json({ error: 'Invalid payload' });
    return;
  }

  const { id } = req.params as { id: string };
  const { summary, winner, persona_breakdown, events: rawEvents, mlflow_run_id } =
    req.body as {
      summary: Record<string, unknown>;
      winner: string;
      persona_breakdown: Record<string, unknown>;
      events: Record<string, unknown>[];
      mlflow_run_id?: string;
    };

  // One timestamp is used for both the run's finishedAt and the events' createdAt.
  const finishedAt = new Date().toISOString();

  try {
    const eventRows = (rawEvents ?? []).map((ev) => ({
      id: nanoid(),
      runId: id,
      round: Number(ev['round']),
      userId: String(ev['user_id']),
      persona: String(ev['persona']),
      policy: String(ev['policy']),
      tipContent: String(ev['tip_content']),
      priority: Number(ev['priority']),
      isOverdue: Boolean(ev['is_overdue']),
      action: String(ev['action']),
      dwellMs: ev['dwell_ms'] != null ? Number(ev['dwell_ms']) : null,
      // Reward arrives as a float and is stored as an integer in thousandths.
      rewardMilli: Math.round(Number(ev['reward']) * 1000),
      hour: Number(ev['hour']),
      dayOfWeek: Number(ev['day_of_week']),
      createdAt: finishedAt,
    }));

    // Event storage stays best-effort (one bad row must not fail the run),
    // but failures are counted and logged instead of vanishing silently.
    let failedInserts = 0;
    for (const row of eventRows) {
      await db.insert(simEvents).values(row).catch((err: unknown) => {
        failedInserts += 1;
        // Log only the first error in detail to avoid flooding the log.
        if (failedInserts === 1) {
          logger.warn({ err, runId: id }, 'sim: event insert failed');
        }
      });
    }
    if (failedInserts > 0) {
      logger.warn(
        { runId: id, failedInserts, totalEvents: eventRows.length },
        'sim: some events were not stored',
      );
    }

    await db.update(simRuns).set({
      status: 'done',
      summaryJson: JSON.stringify(summary),
      winner,
      personaBreakdownJson: JSON.stringify(persona_breakdown),
      mlflowRunId: mlflow_run_id ?? null,
      finishedAt,
    }).where(eq(simRuns.id, id));

    res.json({ ok: true });
  } catch (err) {
    logger.error({ err }, 'sim: complete callback failed');
    // The database may be the thing that just failed, so guard this update:
    // an error thrown here would otherwise prevent the 500 from being sent.
    try {
      await db.update(simRuns).set({ status: 'failed', finishedAt }).where(eq(simRuns.id, id));
    } catch (updateErr) {
      logger.error({ err: updateErr, runId: id }, 'sim: could not mark run as failed');
    }
    res.status(500).json({ error: 'Failed to store results' });
  }
});
export { router as adminRouter, internalRouter as adminInternalRouter };