feat(airflow): integrate bench harness into bench_collect DAG
New DAG (`ml/pipelines/bench_dag.py`) with three linked tasks: 1. collect.py — generates candidates, logs to MLflow 2. export_for_judge — exports pending runs for Claude Code scoring 3. compare — generates leaderboard by (model, prompt) cell Config via dag_run.conf supports all collect.py options (models, prompts, n_tips, n_scenarios, temperature, experiment name, max_model_b). New admin API endpoints (`services/api/src/routes/bench.ts`): - GET /api/bench/experiments — list tip-bench-* experiments - POST /api/bench/run — trigger DAG with custom config - GET /api/bench/runs/:experiment — list runs in experiment - GET /api/bench/leaderboard/:experiment — leaderboard by (model, prompt) All endpoints require admin auth. Human judge (Claude Code) scores are applied manually post-export; future enhancement: add webhook to DAG. Admin UI can now trigger and monitor benchmarks from a dashboard panel. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
168
ml/pipelines/bench_dag.py
Normal file
168
ml/pipelines/bench_dag.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Airflow DAG: bench_collect
|
||||
|
||||
Runs the tip-generation benchmark (model × prompt evaluation). Triggered
|
||||
on-demand from the admin UI or manually, collects candidates per cell,
|
||||
exports for Claude Code judgment, and generates a leaderboard.
|
||||
|
||||
Mirrors the manual flow:
|
||||
|
||||
1. collect.py → generates candidates, logs to MLflow with judge_pending=true
|
||||
2. (human: judge_cli.py --export, Claude Code scores, judge_cli.py --apply)
|
||||
3. compare.py → leaderboard
|
||||
|
||||
For now, step 2 is manual. Future: add a webhook to trigger the human
|
||||
judge from the admin UI or set up an async task queue.
|
||||
|
||||
Optional conf keys (passed via dag_run.conf; every key has a default):
|
||||
models str — comma-separated model tags (default: "qwen2.5:0.5b,qwen2.5:1.5b,gemma3:1b,llama3.2:3b")
|
||||
prompts str — comma-separated prompt versions (default: "v1,v2-mentor,v3-few-shot")
|
||||
n_tips int — candidates to generate per scenario (default: 5)
|
||||
n_scenarios int — cap scenario count; 0 = all (default: 0)
|
||||
temperature float — LLM generation temperature (default: 0.7)
|
||||
experiment str — MLflow experiment name (default: "tip-bench-auto")
|
||||
max_model_b float — reject models larger than this (default: 4.0)
|
||||
ollama_url str — Ollama endpoint (env OLLAMA_URL or http://localhost:11434)
|
||||
mlflow_url str — MLflow tracking URI (env MLFLOW_TRACKING_URI or http://localhost:5000)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.operators.python import PythonOperator
|
||||
|
||||
|
||||
def _collect(**context: object) -> dict:
|
||||
"""Run collect.py with the provided config."""
|
||||
conf: dict = context["dag_run"].conf or {}
|
||||
|
||||
models = str(conf.get("models", "qwen2.5:0.5b,qwen2.5:1.5b,gemma3:1b,llama3.2:3b"))
|
||||
prompts = str(conf.get("prompts", "v1,v2-mentor,v3-few-shot"))
|
||||
n_tips = int(conf.get("n_tips", 5))
|
||||
n_scenarios = int(conf.get("n_scenarios", 0))
|
||||
temperature = float(conf.get("temperature", 0.7))
|
||||
experiment = str(conf.get("experiment", "tip-bench-auto"))
|
||||
max_model_b = float(conf.get("max_model_b", 4.0))
|
||||
ollama_url = str(conf.get("ollama_url", os.environ.get("OLLAMA_URL", "http://localhost:11434")))
|
||||
mlflow_url = str(conf.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")))
|
||||
|
||||
sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
|
||||
from collect import main as collect_main # type: ignore
|
||||
|
||||
# Build args for collect.py
|
||||
args = [
|
||||
"--models", models,
|
||||
"--prompts", prompts,
|
||||
"--experiment", experiment,
|
||||
"--n-tips", str(n_tips),
|
||||
"--temperature", str(temperature),
|
||||
"--max-model-b", str(max_model_b),
|
||||
"--ollama-url", ollama_url,
|
||||
"--mlflow-url", mlflow_url,
|
||||
]
|
||||
if n_scenarios > 0:
|
||||
args.extend(["--n-scenarios", str(n_scenarios)])
|
||||
|
||||
# Inject args into sys.argv so argparse picks them up
|
||||
old_argv = sys.argv
|
||||
try:
|
||||
sys.argv = ["collect.py"] + args
|
||||
result = collect_main()
|
||||
return {
|
||||
"status": "success" if result == 0 else "failed",
|
||||
"exit_code": result,
|
||||
"experiment": experiment,
|
||||
}
|
||||
finally:
|
||||
sys.argv = old_argv
|
||||
|
||||
|
||||
def _compare(**context: object) -> dict:
|
||||
"""Run compare.py to generate the leaderboard."""
|
||||
conf: dict = context["dag_run"].conf or {}
|
||||
experiment = str(conf.get("experiment", "tip-bench-auto"))
|
||||
mlflow_url = str(conf.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")))
|
||||
|
||||
sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
|
||||
from compare import main as compare_main # type: ignore
|
||||
|
||||
old_argv = sys.argv
|
||||
try:
|
||||
sys.argv = [
|
||||
"compare.py",
|
||||
"--experiment", experiment,
|
||||
"--mlflow-url", mlflow_url,
|
||||
]
|
||||
result = compare_main()
|
||||
return {
|
||||
"status": "success" if result == 0 else "failed",
|
||||
"exit_code": result,
|
||||
"experiment": experiment,
|
||||
}
|
||||
finally:
|
||||
sys.argv = old_argv
|
||||
|
||||
|
||||
def _export_for_judge(**context: object) -> str:
|
||||
"""Export pending runs for Claude Code judgment."""
|
||||
conf: dict = context["dag_run"].conf or {}
|
||||
experiment = str(conf.get("experiment", "tip-bench-auto"))
|
||||
mlflow_url = str(conf.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")))
|
||||
|
||||
export_path = f"/tmp/oo-bench-{experiment}-{int(context['ti'].start_date.timestamp())}.json"
|
||||
|
||||
sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
|
||||
from judge_cli import export # type: ignore
|
||||
from mlflow_client import MLflowClient # type: ignore
|
||||
|
||||
client = MLflowClient(
|
||||
tracking_uri=mlflow_url,
|
||||
username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin",
|
||||
password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password",
|
||||
)
|
||||
result = export(client, experiment, export_path)
|
||||
|
||||
# XCom: push path so next task can find it
|
||||
context["ti"].xcom_push(key="export_path", value=export_path)
|
||||
|
||||
return export_path
|
||||
|
||||
|
||||
# On-demand benchmark DAG: collect -> export_for_judge -> compare.
# NOTE(review): `compare` runs immediately after `export_for_judge`, i.e.
# before the manual judge step described in the module docstring, so the
# leaderboard presumably reflects only scores already applied — confirm
# whether compare is meant to be re-run after `judge_cli.py --apply`.
with DAG(
    dag_id="bench_collect",
    description="Tip-generation benchmark: model & prompt evaluation via MLflow",
    schedule_interval=None,  # never scheduled; runs only when triggered
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=["bench", "ml", "evaluation"],
    default_args={
        "retries": 1,
        "retry_delay": timedelta(minutes=5),
    },
) as dag:

    # `provide_context=True` is not passed: Airflow 2 always hands the task
    # context to the callable's **kwargs and only emits a DeprecationWarning
    # for that flag.
    collect = PythonOperator(
        task_id="collect",
        python_callable=_collect,
    )

    export_judge = PythonOperator(
        task_id="export_for_judge",
        python_callable=_export_for_judge,
    )

    compare = PythonOperator(
        task_id="compare",
        python_callable=_compare,
    )

    collect >> export_judge >> compare
|
||||
Reference in New Issue
Block a user