feat(airflow): integrate bench harness into bench_collect DAG
New DAG (`ml/pipelines/bench_dag.py`) with three linked tasks: 1. collect.py — generates candidates, logs to MLflow 2. export_for_judge — exports pending runs for Claude Code scoring 3. compare — generates leaderboard by (model, prompt) cell Config via dag_run.conf supports all collect.py options (models, prompts, n_tips, n_scenarios, temperature, experiment name, max_model_b). New admin API endpoints (`services/api/src/routes/bench.ts`): - GET /api/bench/experiments — list tip-bench-* experiments - POST /api/bench/run — trigger DAG with custom config - GET /api/bench/runs/:experiment — list runs in experiment - GET /api/bench/leaderboard/:experiment — leaderboard by (model, prompt) All endpoints require admin auth. Human judge (Claude Code) scores are applied manually post-export; future enhancement: add webhook to DAG. Admin UI can now trigger and monitor benchmarks from a dashboard panel. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import { recommenderRouter } from './routes/recommender.js';
|
||||
import { userRouter } from './routes/user.js';
|
||||
import { pushRouter } from './routes/push.js';
|
||||
import { adminRouter, adminInternalRouter } from './routes/admin.js';
|
||||
import benchRouter from './routes/bench.js';
|
||||
import { mkdir } from 'fs/promises';
|
||||
import { dirname } from 'path';
|
||||
import { requireAuth } from './middleware/session.js';
|
||||
@@ -66,6 +67,7 @@ app.use('/api/user', userRouter);
|
||||
app.use('/api/push', pushRouter);
|
||||
app.use('/api/admin', adminRouter);
|
||||
app.use('/api/admin', adminInternalRouter);
|
||||
// Mount the benchmark routes under /api/bench; each request goes through requireAuth and then requireAdmin before it reaches benchRouter.
app.use('/api/bench', requireAuth as any, requireAdmin as any, benchRouter);
|
||||
|
||||
app.use('/api/ml', requireAuth as any, requireAdmin as any, async (req: Request, res: Response) => {
|
||||
const mlUrl = config.ML_SERVING_URL;
|
||||
|
||||
234
services/api/src/routes/bench.ts
Normal file
234
services/api/src/routes/bench.ts
Normal file
@@ -0,0 +1,234 @@
|
||||
/**
|
||||
* Admin API endpoints for the tip-generation benchmark.
|
||||
*
|
||||
* Exposes:
|
||||
* GET /api/bench/experiments — list MLflow experiments whose name starts with "tip-bench"
|
||||
* POST /api/bench/run — trigger benchmark DAG
|
||||
* GET /api/bench/runs/:experiment — list runs in experiment
|
||||
* GET /api/bench/leaderboard/:experiment — leaderboard by (model, prompt)
|
||||
*/
|
||||
|
||||
import { Router, Request, Response } from "express";
|
||||
import httpx from "httpx";
|
||||
import * as process from "process";
|
||||
|
||||
// Express router that carries every benchmark endpoint declared in this module.
const router = Router();

// MLflow tracking server: base URL plus basic-auth credentials.
// Each setting is read from the environment; the literal after `||` is used
// when the variable is unset or empty.
const MLFLOW_URL = process.env["MLFLOW_URL"] || "http://mlflow:5000";
const MLFLOW_USER = process.env["MLFLOW_TRACKING_USERNAME"] || "admin";
const MLFLOW_PASS = process.env["MLFLOW_TRACKING_PASSWORD"] || "password";

// Airflow webserver: base URL plus basic-auth credentials, resolved the same way.
const AIRFLOW_URL = process.env["AIRFLOW_URL"] || "http://airflow-webserver:8080";
const AIRFLOW_USER = process.env["AIRFLOW_API_USER"] || "admin";
const AIRFLOW_PASS = process.env["AIRFLOW_API_PASSWORD"] || "admin";
|
||||
|
||||
/**
 * Performs one authenticated JSON request against the MLflow REST API.
 *
 * The request always carries a `Host: localhost` header, a JSON content type
 * and HTTP basic auth built from MLFLOW_USER / MLFLOW_PASS.
 * NOTE(review): the Host override is presumably required by the MLflow
 * deployment's host validation — confirm.
 *
 * @param path   Path (e.g. "/api/2.0/mlflow/runs/search") resolved against MLFLOW_URL.
 * @param method HTTP method; defaults to "GET".
 * @param body   Optional payload, serialized with JSON.stringify when truthy.
 * @returns The parsed JSON response body.
 * @throws Error `MLflow <status>: <statusText>` when the response is not 2xx.
 */
async function mlflowFetch(
  path: string,
  method: string = "GET",
  body?: object
): Promise<any> {
  const target = new URL(path, MLFLOW_URL).toString();
  const credentials = Buffer.from(`${MLFLOW_USER}:${MLFLOW_PASS}`).toString("base64");

  const reply = await fetch(target, {
    method,
    headers: {
      "Host": "localhost",
      "Content-Type": "application/json",
      "Authorization": `Basic ${credentials}`,
    },
    body: body ? JSON.stringify(body) : undefined,
  });

  if (reply.ok) {
    return reply.json();
  }
  throw new Error(`MLflow ${reply.status}: ${reply.statusText}`);
}
|
||||
|
||||
/**
 * GET /api/bench/experiments — list benchmark experiments.
 *
 * Calls MLflow's experiment search and responds with the experiments whose
 * name starts with "tip-bench", each reduced to `{ id, name, creation_time }`.
 * Any failure (network error, non-2xx from MLflow) is answered with HTTP 500
 * and the stringified error.
 *
 * NOTE(review): the search is issued without a `max_results` parameter; check
 * that the deployed MLflow version accepts that and returns all experiments.
 */
router.get("/experiments", async (req: Request, res: Response) => {
  try {
    const result = await mlflowFetch("/api/2.0/mlflow/experiments/search", "GET");

    // Tolerate a response that has no `experiments` field (treated as an
    // empty list) instead of throwing a TypeError; the run handlers guard
    // `runs` the same way.
    const experiments = (result.experiments ?? [])
      .filter((e: any) => e.name.startsWith("tip-bench"))
      .map((e: any) => ({
        id: e.experiment_id,
        name: e.name,
        creation_time: e.creation_time,
      }));

    res.json(experiments);
  } catch (err) {
    res.status(500).json({ error: String(err) });
  }
});
|
||||
|
||||
/**
 * POST /api/bench/run — trigger the `bench_collect` Airflow DAG.
 *
 * The JSON request body is forwarded as the DAG run's `conf`. The experiment
 * name defaults to "tip-bench-admin" when the body does not provide one, and
 * the resolved name is written into `conf` so the value reported in the
 * response is the same one the DAG receives.
 *
 * Responses:
 *   200 `{ status: "triggered", dag_run_id, experiment }` on success.
 *   400 when the body is present but is not a JSON object.
 *   500 with the stringified error when Airflow is unreachable or answers non-2xx.
 */
router.post("/run", async (req: Request, res: Response) => {
  try {
    const config = req.body || {};

    // `conf` is spread into a key/value object below; an array or scalar body
    // cannot be a meaningful run configuration, so reject it up front.
    if (typeof config !== "object" || Array.isArray(config)) {
      return res.status(400).json({ error: "Request body must be a JSON object" });
    }

    const experiment = config.experiment || "tip-bench-admin";

    const dagRunUrl = new URL("/api/v1/dags/bench_collect/dagRuns", AIRFLOW_URL);
    const auth = Buffer.from(`${AIRFLOW_USER}:${AIRFLOW_PASS}`).toString("base64");

    const response = await fetch(dagRunUrl.toString(), {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Basic ${auth}`,
      },
      body: JSON.stringify({
        // Include the resolved experiment so the defaulted name is actually
        // passed to the DAG rather than only echoed back to the caller.
        conf: { ...config, experiment },
        dag_run_id: `bench-${Date.now()}`,
      }),
    });

    if (!response.ok) {
      throw new Error(`Airflow ${response.status}: ${response.statusText}`);
    }

    const result = await response.json();
    res.json({
      status: "triggered",
      dag_run_id: result.dag_run_id,
      experiment,
    });
  } catch (err) {
    res.status(500).json({ error: String(err) });
  }
});
|
||||
|
||||
/**
 * GET /api/bench/runs/:experiment — list the runs of one experiment.
 *
 * Resolves the experiment name to its MLflow id, searches up to 1000 runs and
 * flattens each run's params and metrics into a single summary object.
 *
 * Responses:
 *   200 array of `{ run_id, status, model, prompt_version, scenario_id,
 *       composite, relevance, actionability, tone, latency_ms }`; a metric
 *       that was not logged is `null`.
 *   404 when no experiment has exactly the requested name.
 *   500 with the stringified error on any MLflow failure.
 */
router.get("/runs/:experiment", async (req: Request, res: Response) => {
  try {
    const { experiment } = req.params;

    // First, get the experiment ID. A response without an `experiments` field
    // is treated as "no experiments" so it yields 404 rather than a TypeError.
    const exps = await mlflowFetch("/api/2.0/mlflow/experiments/search", "GET");
    const exp = (exps.experiments ?? []).find((e: any) => e.name === experiment);
    if (!exp) {
      return res.status(404).json({ error: "Experiment not found" });
    }

    // Then, search runs.
    const result = await mlflowFetch("/api/2.0/mlflow/runs/search", "POST", {
      experiment_ids: [exp.experiment_id],
      max_results: 1000,
    });

    const runs = (result.runs || []).map((r: any) => {
      const params = Object.fromEntries(
        (r.data?.params || []).map((p: any) => [p.key, p.value])
      );
      const metrics = Object.fromEntries(
        (r.data?.metrics || []).map((m: any) => [m.key, m.value])
      );

      // `??` rather than `||`: a logged metric value of 0 is a real score and
      // must not be reported as "missing".
      return {
        run_id: r.info.run_id,
        status: r.info.status,
        model: params.model,
        prompt_version: params.prompt_version,
        scenario_id: params.scenario_id,
        composite: metrics.composite ?? null,
        relevance: metrics.relevance ?? null,
        actionability: metrics.actionability ?? null,
        tone: metrics.tone ?? null,
        latency_ms: metrics.latency_ms ?? null,
      };
    });

    res.json(runs);
  } catch (err) {
    res.status(500).json({ error: String(err) });
  }
});
|
||||
|
||||
/**
 * GET /api/bench/leaderboard/:experiment — leaderboard by (model, prompt).
 *
 * Resolves the experiment name to its MLflow id, searches up to 1000 runs,
 * keeps only runs whose status is "FINISHED" and groups them by the pair
 * (params.model, params.prompt_version). For every group it reports the run
 * count and the mean of the logged `composite` and `latency_ms` metrics
 * (`null` when no run in the group logged that metric).
 *
 * Rows are ordered by mean composite, highest first; groups without a
 * composite score come last. `winner` is the first row, or `null` when there
 * are no rows.
 *
 * Responses:
 *   200 `{ experiment, rows, winner }`.
 *   404 when no experiment has exactly the requested name.
 *   500 with the stringified error on any MLflow failure.
 */
router.get("/leaderboard/:experiment", async (req: Request, res: Response) => {
  try {
    const { experiment } = req.params;

    // Get the experiment ID. A response without an `experiments` field is
    // treated as "no experiments" so it yields 404 rather than a TypeError.
    const exps = await mlflowFetch("/api/2.0/mlflow/experiments/search", "GET");
    const exp = (exps.experiments ?? []).find((e: any) => e.name === experiment);
    if (!exp) {
      return res.status(404).json({ error: "Experiment not found" });
    }

    // Search runs.
    const result = await mlflowFetch("/api/2.0/mlflow/runs/search", "POST", {
      experiment_ids: [exp.experiment_id],
      max_results: 1000,
    });

    // Aggregate by (model, prompt). The model and prompt are stored on the
    // cell itself rather than re-parsed from the key, so names containing the
    // old "|" separator are reported intact.
    const cells = new Map<
      string,
      { model: string; prompt: string; n: number; composites: number[]; latencies: number[] }
    >();

    for (const r of result.runs || []) {
      // Skip unfinished runs before doing any per-run parsing work.
      if (r.info.status !== "FINISHED") continue;

      const params = Object.fromEntries(
        (r.data?.params || []).map((p: any) => [p.key, p.value])
      );
      const metrics = Object.fromEntries(
        (r.data?.metrics || []).map((m: any) => [m.key, m.value])
      );

      // String() keeps the previous output for a missing param ("undefined").
      const model = String(params.model);
      const prompt = String(params.prompt_version);
      // JSON-encoding the pair gives an unambiguous grouping key.
      const key = JSON.stringify([model, prompt]);

      let cell = cells.get(key);
      if (!cell) {
        cell = { model, prompt, n: 0, composites: [], latencies: [] };
        cells.set(key, cell);
      }

      cell.n++;
      if (metrics.composite !== undefined) {
        cell.composites.push(metrics.composite);
      }
      if (metrics.latency_ms !== undefined) {
        cell.latencies.push(metrics.latency_ms);
      }
    }

    // Arithmetic mean, or null for an empty sample.
    const mean = (values: number[]): number | null =>
      values.length > 0
        ? values.reduce((a, b) => a + b, 0) / values.length
        : null;

    // Build leaderboard rows (Map iteration preserves first-seen order).
    const rows = Array.from(cells.values(), (cell) => ({
      model: cell.model,
      prompt: cell.prompt,
      n: cell.n,
      composite: mean(cell.composites),
      latency_ms: mean(cell.latencies),
    }));

    // Highest composite first; unscored rows last. Nulls are compared
    // explicitly so the comparator never returns NaN (-Infinity - -Infinity).
    rows.sort((a, b) => {
      if (a.composite === null) return b.composite === null ? 0 : 1;
      if (b.composite === null) return -1;
      return b.composite - a.composite;
    });

    res.json({
      experiment,
      rows,
      winner: rows.length > 0 ? rows[0] : null,
    });
  } catch (err) {
    res.status(500).json({ error: String(err) });
  }
});
|
||||
|
||||
export default router;
|
||||
Reference in New Issue
Block a user