feat(simulate): MLflow tracking, Airflow DAG integration, health checks for mlflow/airflow

- sim_runs schema: add judge_mode, n_policies, airflow_dag_run_id, mlflow_run_id columns
- admin health endpoint: add mlflow + airflow checks (Basic auth for Airflow API)
- admin nav: add Simulations page link; rename section label
- runner.py: optional MLflow experiment tracking; multi-policy support
- sim_dag.py: Airflow DAG for offline sim pipeline
- admin simulate page + API client methods for sim runs
- shared-types tsconfig: exclude test files from build

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 12:08:36 +00:00
parent e96ceb7ee1
commit bad1bb2cba
12 changed files with 818 additions and 107 deletions

View File

@@ -4,7 +4,7 @@
* A real Express app + in-memory SQLite DB per test suite.
* Auth and admin middleware are mocked so we can focus on route logic.
*/
import { describe, it, expect, vi, beforeAll } from 'vitest';
import { describe, it, expect, vi, beforeAll, afterEach } from 'vitest';
import express from 'express';
import * as http from 'http';
import { makeTestDb } from '../../test/db.js';
@@ -385,16 +385,126 @@ describe('GET /api/admin/events', () => {
});
});
// ---------------------------------------------------------------------------
// Health endpoint — mock fetch so tests don't depend on running services.
// ---------------------------------------------------------------------------
describe('GET /api/admin/health', () => {
it('returns 200 with ok, services array, and checkedAt', async () => {
// Names the health payload must list for dependencies reached over HTTP.
const EXPECTED_HTTP_SERVICES = [
  'api',
  'ml-serving',
  'mlflow',
  'airflow',
] as const;
// Names the health payload must list even when every HTTP dependency is down.
const EXPECTED_INTERNAL = [
  'sqlite',
  'event-bus',
] as const;
// The only status strings a service row is allowed to carry.
const VALID_STATUSES = new Set(['ok', 'degraded', 'down']);
/** One entry of the `services` array in the health response. */
interface ServiceRow {
  name: string;
  status: string;
  latencyMs: number;
}
/** Body shape asserted for GET /api/admin/health. */
interface HealthBody {
  ok: boolean;
  services: ServiceRow[];
  checkedAt: string;
}
/**
 * Replaces the global `fetch` with a stub that decides per request whether
 * the target service is "up".
 *
 * The service is identified by a port substring in the requested URL; any URL
 * that matches none of the known ports is treated as the `api` service.
 *
 * @param upServices Names of services that should answer successfully. A
 *   request to any other service rejects with `ECONNREFUSED <name>`.
 */
function mockFetch(upServices: Set<string>) {
  // Checked in order, first match wins — same precedence as the port list
  // (:8000, then :5000, then :8080).
  const portToService: ReadonlyArray<readonly [string, string]> = [
    [':8000', 'ml-serving'],
    [':5000', 'mlflow'],
    [':8080', 'airflow'],
  ];
  vi.stubGlobal('fetch', async (url: string) => {
    const target = String(url);
    const hit = portToService.find(([port]) => target.includes(port));
    const name: string = hit ? hit[1] : 'api';
    // A service outside the "up" set behaves like a refused connection.
    if (!upServices.has(name)) throw new Error(`ECONNREFUSED ${name}`);
    return { ok: true, json: async () => ({ ok: true, status: 'healthy' }) };
  });
}
// Undo the global stubs after every test so one case's fetch mock cannot leak into the next.
afterEach(() => vi.unstubAllGlobals());
// Verifies the response contract: HTTP 200, correctly typed top-level fields,
// every expected service listed, and each row carrying a valid status/latency.
it('shape: 200, typed fields, all expected services present', async () => {
  // All HTTP dependencies are reported reachable; this test only checks shape.
  mockFetch(new Set(['api', 'ml-serving', 'mlflow', 'airflow']));
  const { server, call } = await startServer(buildApp());
  try {
    const { status, body } = await call('GET', '/api/admin/health');
    // Single declaration of `b`: the inline-typed duplicate was dropped because
    // redeclaring a `const` in the same scope is an error, and HealthBody
    // already describes the payload (including latencyMs).
    const b = body as HealthBody;
    expect(status).toBe(200);
    expect(typeof b.ok).toBe('boolean');
    expect(Array.isArray(b.services)).toBe(true);
    expect(typeof b.checkedAt).toBe('string');
    // An unparseable timestamp yields NaN, which fails the > 0 comparison.
    expect(new Date(b.checkedAt).getTime()).toBeGreaterThan(0);
    const names = b.services.map((s) => s.name);
    for (const svc of [...EXPECTED_HTTP_SERVICES, ...EXPECTED_INTERNAL]) {
      expect(names).toContain(svc);
    }
    for (const svc of b.services) {
      expect(VALID_STATUSES).toContain(svc.status);
      expect(typeof svc.latencyMs).toBe('number');
    }
  } finally {
    // Always release the listening socket, even when an assertion throws.
    server.close();
  }
});
// Happy path: with every HTTP dependency reachable, each one reports 'ok'
// and the aggregate flag is true.
it('ok=true when all HTTP services respond 200', async () => {
  const reachable = new Set(['api', 'ml-serving', 'mlflow', 'airflow']);
  mockFetch(reachable);
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    EXPECTED_HTTP_SERVICES.forEach((name) => {
      const row = health.services.find((entry) => entry.name === name);
      // The message names the offending service when this assertion fails.
      expect(row?.status, `${name} should be ok`).toBe('ok');
    });
    expect(health.ok).toBe(true);
  } finally {
    server.close();
  }
});
// ml-serving is left out of the reachable set, so its row must read 'down'
// and the aggregate flag must flip to false.
it('ml-serving=down and ok=false when ml-serving is unreachable', async () => {
  const reachable = new Set(['api', 'mlflow', 'airflow']);
  mockFetch(reachable);
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'ml-serving');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
// airflow is left out of the reachable set, so its row must read 'down'
// and the aggregate flag must flip to false.
it('airflow=down and ok=false when airflow is unreachable', async () => {
  const reachable = new Set(['api', 'ml-serving', 'mlflow']);
  mockFetch(reachable);
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'airflow');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
// mlflow is left out of the reachable set, so its row must read 'down'
// and the aggregate flag must flip to false.
it('mlflow=down and ok=false when mlflow is unreachable', async () => {
  const reachable = new Set(['api', 'ml-serving', 'airflow']);
  mockFetch(reachable);
  const { server, call } = await startServer(buildApp());
  try {
    const response = await call('GET', '/api/admin/health');
    const health = response.body as HealthBody;
    const row = health.services.find((entry) => entry.name === 'mlflow');
    expect(row?.status).toBe('down');
    expect(health.ok).toBe(false);
  } finally {
    server.close();
  }
});
it('sqlite and event-bus are always present regardless of HTTP service status', async () => {
mockFetch(new Set()); // all HTTP services down
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');
const b = body as HealthBody;
expect(b.services.find((s) => s.name === 'sqlite')?.status).toBe('ok');
expect(b.services.find((s) => s.name === 'event-bus')?.status).toBe('ok');
} finally {
server.close();
}