New DAG (`ml/pipelines/bench_dag.py`) with three linked tasks:
1. collect.py — generates candidates, logs to MLflow
2. export_for_judge — exports pending runs for Claude Code scoring
3. compare — generates leaderboard by (model, prompt) cell

Config via dag_run.conf supports all collect.py options (models, prompts, n_tips, n_scenarios, temperature, experiment name, max_model_b).

New admin API endpoints (`services/api/src/routes/bench.ts`):
- GET /api/bench/experiments — list tip-bench-* experiments
- POST /api/bench/run — trigger DAG with custom config
- GET /api/bench/runs/:experiment — list runs in experiment
- GET /api/bench/leaderboard/:experiment — leaderboard by (model, prompt)

All endpoints require admin auth. Human judge (Claude Code) scores are applied manually post-export; future enhancement: add webhook to DAG.

Admin UI can now trigger and monitor benchmarks from a dashboard panel.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
115 lines
3.8 KiB
TypeScript
115 lines
3.8 KiB
TypeScript
import 'dotenv/config';
|
|
import { logger } from './logger.js';
|
|
import express from 'express';
|
|
import { pinoHttp } from 'pino-http';
|
|
import cookieParser from 'cookie-parser';
|
|
import cors from 'cors';
|
|
import { tracingMiddleware } from './middleware/tracing.js';
|
|
import { config } from './config.js';
|
|
import { db, runMigrations } from './db/index.js';
|
|
import { tipScores, tipFeedback } from './db/schema.js';
|
|
import { lt } from 'drizzle-orm';
|
|
import { sessionMiddleware } from './middleware/session.js';
|
|
import { authRouter } from './routes/auth.js';
|
|
import { integrationsRouter } from './routes/integrations.js';
|
|
import { recommenderRouter } from './routes/recommender.js';
|
|
import { userRouter } from './routes/user.js';
|
|
import { pushRouter } from './routes/push.js';
|
|
import { adminRouter, adminInternalRouter } from './routes/admin.js';
|
|
import benchRouter from './routes/bench.js';
|
|
import { mkdir } from 'fs/promises';
|
|
import { dirname } from 'path';
|
|
import { requireAuth } from './middleware/session.js';
|
|
import { requireAdmin } from './middleware/admin.js';
|
|
import type { Request, Response } from 'express';
|
|
import { connectNats } from './events/nats.js';
|
|
import { startTodoistSyncScheduler } from './signals/scheduler.js';
|
|
import { bus } from './events/bus.js';
|
|
import { registerProfileSubscriptions } from './profile/subscriber.js';
|
|
|
|
// Create the parent directory of config.DATABASE_PATH (and any missing
// ancestors) first, then run the migrations.
const databaseDir = dirname(config.DATABASE_PATH);
await mkdir(databaseDir, { recursive: true });
runMigrations();
|
|
|
|
/** Logs a promise rejection that no handler picked up, at error level. */
function logUnhandledRejection(reason: unknown): void {
  logger.error({ err: reason }, 'unhandledRejection');
}

/**
 * Logs an exception that escaped every try/catch, at fatal level.
 *
 * NOTE(review): registering an 'uncaughtException' listener replaces Node's
 * default crash behaviour, and this handler only logs; it never exits the
 * process. Confirm that continuing to run is intended.
 */
function logUncaughtException(err: Error): void {
  logger.fatal({ err }, 'uncaughtException');
}

process.on('unhandledRejection', logUnhandledRejection);
process.on('uncaughtException', logUncaughtException);
|
|
|
|
const app = express();

// Cross-origin access is limited to the configured web origin, with credentials allowed.
const corsOptions = { origin: config.WEB_BASE_URL, credentials: true };
app.use(cors(corsOptions));

// Tracing is registered before the request logger, which reads req.traceId.
app.use(tracingMiddleware);

// Request logging: the trace id doubles as the request id, and /health is not auto-logged.
const httpLogger = pinoHttp({
  logger,
  genReqId: (request) => request.traceId,
  customProps: (request) => ({ traceId: request.traceId }),
  autoLogging: { ignore: (request) => request.url === '/health' },
});
app.use(httpLogger);

// Body parsing, cookie parsing and session handling, in that order.
app.use(express.json());
app.use(cookieParser());
app.use(sessionMiddleware);

// Health probe: always answers { ok: true }.
app.get('/health', (_req, res) => {
  res.json({ ok: true });
});

// Feature routers. Both admin routers are mounted on the same /api/admin prefix.
app.use('/api/auth', authRouter);
app.use('/api/integrations', integrationsRouter);
app.use('/api', recommenderRouter);
app.use('/api/user', userRouter);
app.use('/api/push', pushRouter);
app.use('/api/admin', adminRouter);
app.use('/api/admin', adminInternalRouter);

// Bench endpoints sit behind requireAuth and requireAdmin.
app.use('/api/bench', requireAuth as any, requireAdmin as any, benchRouter);
|
|
|
|
/**
 * Proxy for everything under /api/ml, guarded by requireAuth and requireAdmin.
 *
 * The request is forwarded to config.ML_SERVING_URL with the same method and
 * sub-path, a JSON-encoded copy of the parsed body, and the request's
 * traceparent header. The upstream status code and JSON payload are relayed
 * to the caller. Any failure (network error, 5 s timeout, non-JSON payload)
 * is reported as 502.
 */
app.use('/api/ml', requireAuth as any, requireAdmin as any, async (req: Request, res: Response) => {
  // NOTE(review): req.path excludes the query string, so query parameters are
  // not forwarded upstream. Confirm this is intended.
  const target = `${config.ML_SERVING_URL}${req.path}`;

  // fetch() throws a TypeError when a GET or HEAD request carries a body, so
  // HEAD must be excluded as well as GET.
  const hasBody = req.method !== 'GET' && req.method !== 'HEAD';

  try {
    const upstream = await fetch(target, {
      method: req.method,
      headers: { 'Content-Type': 'application/json', traceparent: req.traceparent },
      body: hasBody ? JSON.stringify(req.body) : undefined,
      signal: AbortSignal.timeout(5000),
    });

    // Read as text first: an empty body (HEAD, 204, ...) is a valid answer and
    // must not be turned into a 502 by a JSON parse error.
    const raw = await upstream.text();
    if (raw.length === 0) {
      res.status(upstream.status).end();
      return;
    }

    const data: unknown = JSON.parse(raw);
    res.status(upstream.status).json(data);
  } catch (e: unknown) {
    const detail = e instanceof Error ? e.message : String(e);
    res.status(502).json({ error: 'ml/serving unavailable', detail });
  }
});
|
|
|
|
/**
 * Deletes tipScores rows whose servedAt, and tipFeedback rows whose createdAt,
 * is older than 30 days.
 *
 * The two deletes are isolated from each other, so a failure on one table no
 * longer skips the purge of the other. Failures are logged and never thrown,
 * which keeps the function safe to call without awaiting.
 */
async function purgeExpiredData(): Promise<void> {
  const retentionMs = 30 * 24 * 60 * 60 * 1000;
  // The cutoff is compared against the stored values as an ISO-8601 string.
  const cutoff = new Date(Date.now() - retentionMs).toISOString();

  try {
    await db.delete(tipScores).where(lt(tipScores.servedAt, cutoff));
  } catch (err: unknown) {
    logger.error({ err, target: 'tipScores' }, 'retention cleanup failed');
  }

  try {
    await db.delete(tipFeedback).where(lt(tipFeedback.createdAt, cutoff));
  } catch (err: unknown) {
    logger.error({ err, target: 'tipFeedback' }, 'retention cleanup failed');
  }
}
|
|
|
|
// Purge once right away at startup, then again every 24 hours.
const PURGE_INTERVAL_MS = 24 * 60 * 60 * 1000;
void purgeExpiredData();
setInterval(purgeExpiredData, PURGE_INTERVAL_MS);
|
|
|
|
/** Logs the configured port once the HTTP server is accepting connections. */
function announceListening(): void {
  logger.info({ port: config.PORT }, 'oO API listening');
}

app.listen(config.PORT, announceListening);
|
|
|
|
// NATS is optional: connect only when a URL is configured.
const natsUrl = config.NATS_URL;
if (natsUrl) {
  connectNats(natsUrl);
}

// Start the Todoist sync scheduler with the configured interval.
startTodoistSyncScheduler(config.TODOIST_SYNC_INTERVAL_MS);

// Relevant signals invalidate profile features (#81 phase B.2). The TTL is
// kept as a fallback for clock drift and dropped events.
registerProfileSubscriptions(bus);
|