Combines model evaluation (#93) and prompt A/B testing (#95) into one experiment. Evaluates all (model × prompt × scenario) cells on the same fixed contexts so quality differences are attributable. Architecture: - Phase A (collect.py): generates candidates per cell, logs to MLflow with judge_pending=true. Rejects models >4B, uses keep_alive=0 for RAM safety (no concurrent model weights in VRAM). - Phase B (judge_cli.py): exports pending runs as JSON for Claude Code to score per the rubric, then applies scores back to MLflow. - Phase C (compare.py): leaderboard by (model, prompt) cell. Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone, plus format_ok and overlong flags. Composite = rel + act + tone + 2×format_ok − overlong. Rubric is self-describing and persisted in every run so judges use consistent criteria across sessions. Artifacts (prompts, candidates, raw responses) stored as MLflow tags because the server uses a file:// backend not accessible via REST. Full artifacts accessible in MLflow UI → run → Tags section. Tested end-to-end on local machine: - 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B - 3 prompts (v1, v2-mentor, v3-few-shot) - 4 scenarios (4 personas × 2 time-slots) - 48 cells total, all judged and ranked Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75). Ready for integration into Airflow prompt_ab_eval DAG and admin UI. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
144
ml/experiments/bench/compare.py
Normal file
144
ml/experiments/bench/compare.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Phase C — leaderboard from judged MLflow runs.
|
||||
|
||||
Pulls every FINISHED run that has the ``composite`` metric set (i.e. a
run that has been judged) from the experiment, groups by (model, prompt)
cell, and prints a leaderboard sorted by mean composite score. Pass
``--include-pending`` to also keep FINISHED runs that have no composite
score yet.
|
||||
|
||||
Also reports the deterministic-only metrics (latency, format_ok) so
|
||||
cells with great prose but broken JSON are visible.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
_BENCH = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(_BENCH))
|
||||
from mlflow_client import MLflowClient # type: ignore
|
||||
|
||||
|
||||
def _params(run: dict) -> dict[str, str]:
|
||||
return {p["key"]: p["value"] for p in run["data"].get("params", [])}
|
||||
|
||||
|
||||
def _metrics(run: dict) -> dict[str, float]:
|
||||
return {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
|
||||
|
||||
|
||||
def _tags(run: dict) -> dict[str, str]:
|
||||
return {t["key"]: t["value"] for t in run["data"].get("tags", [])}
|
||||
|
||||
|
||||
def _collect_cells(runs: list[dict], include_pending: bool) -> dict[tuple[str, str], list[dict]]:
    """Group the metric dicts of eligible runs by (model, prompt_version) cell.

    A run is eligible when its status is ``FINISHED`` and, unless
    *include_pending* is set, it carries a ``composite`` metric.
    """
    cells: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for run in runs:
        # Check status before touching run["data"] so unfinished runs are
        # skipped without being parsed at all.
        if run["info"].get("status") != "FINISHED":
            continue
        metrics = _metrics(run)
        if not include_pending and "composite" not in metrics:
            continue
        params = _params(run)
        cell = (params.get("model", "?"), params.get("prompt_version", "?"))
        cells[cell].append(metrics)
    return cells


def _summarise_cell(model: str, prompt: str, samples: list[dict]) -> dict:
    """Aggregate the per-run metric dicts of one cell into a leaderboard row.

    Each aggregate only uses the runs that logged that metric and is ``None``
    when no run in the cell logged it. ``n`` counts every run in the cell.
    """

    def values(key: str) -> list[float]:
        return [metrics[key] for metrics in samples if key in metrics]

    def mean_or_none(key: str) -> float | None:
        found = values(key)
        return statistics.mean(found) if found else None

    latencies = values("latency_ms")
    return {
        "model": model,
        "prompt": prompt,
        "n": len(samples),
        "composite": mean_or_none("composite"),
        "relevance": mean_or_none("relevance"),
        "actionability": mean_or_none("actionability"),
        "tone": mean_or_none("tone"),
        "format_ok": mean_or_none("format_ok"),
        "latency_p50": statistics.median(latencies) if latencies else None,
        "latency_p95": _p95(latencies) if latencies else None,
        "diversity": mean_or_none("mean_diversity"),
    }


def _format_metric(value: float | None, spec: str) -> str:
    """Format *value* with *spec*, or return the `` -- `` placeholder for None."""
    return format(value, spec) if value is not None else " -- "


def _print_leaderboard(rows: list[dict], experiment: str, exp_id: str) -> None:
    """Print the ranked table for *rows* (already sorted) plus the winner line."""
    # Width-fitted printer — keeps output legible in a 100-col terminal.
    print()
    print(f"Experiment: {experiment} (id={exp_id})")
    print(f"Cells : {len(rows)}")
    print()
    header = (
        f"{'#':>2} {'model':18s} {'prompt':12s} {'n':>3s} "
        f"{'comp':>5s} {'rel':>4s} {'act':>4s} {'tone':>4s} "
        f"{'fmt':>4s} {'p50':>6s} {'p95':>6s} {'div':>5s}"
    )
    print(header)
    print("─" * len(header))
    for rank, row in enumerate(rows, 1):
        comp = _format_metric(row["composite"], ".2f")
        rel = _format_metric(row["relevance"], ".1f")
        act = _format_metric(row["actionability"], ".1f")
        tone = _format_metric(row["tone"], ".1f")
        fmt = _format_metric(row["format_ok"], ".2f")
        p50 = _format_metric(row["latency_p50"], ".0f")
        p95 = _format_metric(row["latency_p95"], ".0f")
        div = _format_metric(row["diversity"], ".2f")
        print(
            f"{rank:>2} {row['model']:18s} {row['prompt']:12s} {row['n']:>3d} "
            f"{comp:>5s} {rel:>4s} {act:>4s} {tone:>4s} "
            f"{fmt:>4s} {p50:>6s} {p95:>6s} {div:>5s}"
        )

    # Only announce a winner when the top row actually has a quality score;
    # with --include-pending every row may be unjudged.
    winner = rows[0]
    if winner["composite"] is not None:
        print()
        print(f"Winner: {winner['model']} × {winner['prompt']} "
              f"(composite={winner['composite']:.2f}, n={winner['n']})")


def main() -> int:
    """Print the (model, prompt) leaderboard for one MLflow experiment.

    Command-line flags: ``--experiment`` (required), ``--mlflow-url``
    (defaults to ``$MLFLOW_TRACKING_URI`` or ``http://localhost:5000``) and
    ``--include-pending`` (keep FINISHED runs without a composite score).

    Returns:
        0 after printing the leaderboard, 1 when no run matched the filters.
    """
    parser = argparse.ArgumentParser(description="oO bench — Phase C (leaderboard)")
    parser.add_argument("--experiment", required=True)
    parser.add_argument(
        "--mlflow-url",
        default=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"),
    )
    parser.add_argument(
        "--include-pending",
        action="store_true",
        help="Also include rows with no quality scores (latency/format only).",
    )
    args = parser.parse_args()

    # NOTE(review): falls back to hard-coded default credentials when the
    # environment does not provide any — confirm this is intended outside dev.
    client = MLflowClient(
        tracking_uri=args.mlflow_url,
        username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin",
        password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password",
    )
    # NOTE(review): by its name this may create the experiment when the name
    # is mistyped, which is a surprising side effect for a read-only report.
    exp_id = client.get_or_create_experiment(args.experiment)
    runs = client.search_runs(exp_id, max_results=2000)

    cells = _collect_cells(runs, args.include_pending)
    if not cells:
        if args.include_pending:
            # Judging is irrelevant here: nothing FINISHED exists at all.
            print("No finished runs found in this experiment.")
        else:
            print("No judged runs found. Did you run judge_cli.py --apply?")
        return 1

    rows = [
        _summarise_cell(model, prompt, samples)
        for (model, prompt), samples in cells.items()
    ]
    # -inf (rather than a magic -1) guarantees unjudged cells sort last
    # whatever range the composite score takes.
    rows.sort(
        key=lambda row: row["composite"] if row["composite"] is not None else float("-inf"),
        reverse=True,
    )

    _print_leaderboard(rows, args.experiment, exp_id)
    return 0
|
||||
|
||||
|
||||
def _p95(xs: list[float]) -> float:
|
||||
if not xs:
|
||||
return 0.0
|
||||
s = sorted(xs)
|
||||
idx = max(0, int(round(0.95 * (len(s) - 1))))
|
||||
return s[idx]
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user