Files
oO/ml/experiments/bench/compare.py
alvis 556019b060 feat(bench): MLflow-based tip-generation benchmark harness (#93, #95)
Combines model evaluation (#93) and prompt A/B testing (#95) into one
experiment. Evaluates all (model × prompt × scenario) cells on the same
fixed contexts so quality differences are attributable.

Architecture:
- Phase A (collect.py): generates candidates per cell, logs to MLflow
  with judge_pending=true. Rejects models >4B, uses keep_alive=0 for
  RAM safety (no concurrent model weights in VRAM).
- Phase B (judge_cli.py): exports pending runs as JSON for Claude Code
  to score per the rubric, then applies scores back to MLflow.
- Phase C (compare.py): leaderboard by (model, prompt) cell.

Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone,
plus format_ok and overlong flags. Composite = rel + act + tone +
2×format_ok − overlong. Rubric is self-describing and persisted in every
run so judges use consistent criteria across sessions.

Artifacts (prompts, candidates, raw responses) stored as MLflow tags
because the server uses a file:// backend not accessible via REST. Full
artifacts accessible in MLflow UI → run → Tags section.

Tested end-to-end on local machine:
- 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B
- 3 prompts (v1, v2-mentor, v3-few-shot)
- 4 scenarios (4 personas × 2 time-slots)
- 48 cells total, all judged and ranked

Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75).

Ready for integration into Airflow prompt_ab_eval DAG and admin UI.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 11:48:59 +00:00

145 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase C — leaderboard from judged MLflow runs.
Pulls every ``FINISHED`` run that has the ``composite`` metric set (i.e.
has been judged) from the experiment, groups by (model, prompt) cell, and
prints a leaderboard sorted by mean composite score. With
``--include-pending``, finished runs without quality scores are kept too.
Also reports the deterministic-only metrics (latency, format_ok) so
cells with great prose but broken JSON are visible.
"""
from __future__ import annotations
import argparse
import os
import statistics
import sys
from collections import defaultdict
from pathlib import Path
# Absolute directory of this file (symlinks resolved).
_BENCH = Path(__file__).resolve().parent
# Put that directory first on the import path so the import below can find
# ``mlflow_client`` regardless of the current working directory.
# NOTE(review): presumably mlflow_client.py sits next to this file — confirm.
sys.path.insert(0, str(_BENCH))
from mlflow_client import MLflowClient # type: ignore
def _params(run: dict) -> dict[str, str]:
return {p["key"]: p["value"] for p in run["data"].get("params", [])}
def _metrics(run: dict) -> dict[str, float]:
return {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
def _tags(run: dict) -> dict[str, str]:
return {t["key"]: t["value"] for t in run["data"].get("tags", [])}
def main() -> int:
    """Print a leaderboard of benchmark runs grouped by (model, prompt).

    Command-line options:
        --experiment       experiment name (required).
        --mlflow-url       tracking server URL; defaults to
                           ``$MLFLOW_TRACKING_URI`` or ``http://localhost:5000``.
        --include-pending  also keep finished runs that have no ``composite``
                           metric yet.

    Only runs whose status is ``FINISHED`` are considered. They are grouped
    by their ``model`` / ``prompt_version`` params, each metric is averaged
    per group (median and p95 for latency), and one table row per group is
    printed, highest mean composite first.

    Returns:
        0 after printing the table, 1 when no run passed the filters.
    """
    parser = argparse.ArgumentParser(description="oO bench — Phase C (leaderboard)")
    parser.add_argument("--experiment", required=True)
    parser.add_argument(
        "--mlflow-url",
        default=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"),
    )
    parser.add_argument(
        "--include-pending",
        action="store_true",
        help="Also include rows with no quality scores (latency/format only).",
    )
    args = parser.parse_args()

    # NOTE(review): when the env vars are unset or empty this silently falls
    # back to admin/password. Fine for a local dev server, but confirm this
    # is never pointed at a shared deployment with those defaults.
    client = MLflowClient(
        tracking_uri=args.mlflow_url,
        username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin",
        password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password",
    )
    exp_id = client.get_or_create_experiment(args.experiment)
    runs = client.search_runs(exp_id, max_results=2000)

    # Group key = (model, prompt_version)
    cells: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for run in runs:
        # Cheap status filter first, before flattening params/metrics/tags.
        if run["info"].get("status") != "FINISHED":
            continue
        metrics = _metrics(run)
        if not args.include_pending and "composite" not in metrics:
            continue
        params = _params(run)
        tags = _tags(run)
        cells[(params.get("model", "?"), params.get("prompt_version", "?"))].append({
            "metrics": metrics,
            "scenario": params.get("scenario_id", "?"),
            "judged": tags.get("judge_pending") == "false",
        })

    if not cells:
        print("No judged runs found. Did you run judge_cli.py --apply?")
        return 1

    def column(records: list[dict], key: str) -> list[float]:
        # Values of one metric across a cell, skipping runs that lack it.
        return [rec["metrics"][key] for rec in records if key in rec["metrics"]]

    def mean_or_none(values: list[float]) -> float | None:
        return statistics.mean(values) if values else None

    rows = []
    for (model, prompt), records in cells.items():
        latencies = column(records, "latency_ms")
        rows.append({
            "model": model,
            "prompt": prompt,
            "n": len(records),
            "composite": mean_or_none(column(records, "composite")),
            "relevance": mean_or_none(column(records, "relevance")),
            "actionability": mean_or_none(column(records, "actionability")),
            "tone": mean_or_none(column(records, "tone")),
            "format_ok": mean_or_none(column(records, "format_ok")),
            "latency_p50": statistics.median(latencies) if latencies else None,
            "latency_p95": _p95(latencies) if latencies else None,
            "diversity": mean_or_none(column(records, "mean_diversity")),
        })

    # Cells without a composite sort last; -inf (rather than a finite
    # sentinel) keeps that true whatever range the composite takes.
    rows.sort(
        key=lambda row: row["composite"] if row["composite"] is not None else float("-inf"),
        reverse=True,
    )

    def cell(value: float | None, spec: str) -> str:
        # Format one numeric table cell, or a placeholder when it is missing.
        return format(value, spec) if value is not None else " -- "

    # Width-fitted printer — keeps output legible in a 100-col terminal.
    print()
    print(f"Experiment: {args.experiment} (id={exp_id})")
    print(f"Cells : {len(rows)}")
    print()
    header = (
        f"{'#':>2} {'model':18s} {'prompt':12s} {'n':>3s} "
        f"{'comp':>5s} {'rel':>4s} {'act':>4s} {'tone':>4s} "
        f"{'fmt':>4s} {'p50':>6s} {'p95':>6s} {'div':>5s}"
    )
    print(header)
    # Rule under the header. The previous code multiplied an empty string,
    # which printed a blank line instead of a separator.
    print("-" * len(header))
    for rank, row in enumerate(rows, 1):
        comp = cell(row["composite"], ".2f")
        rel = cell(row["relevance"], ".1f")
        act = cell(row["actionability"], ".1f")
        tone = cell(row["tone"], ".1f")
        fmt = cell(row["format_ok"], ".2f")
        p50 = cell(row["latency_p50"], ".0f")
        p95 = cell(row["latency_p95"], ".0f")
        div = cell(row["diversity"], ".2f")
        print(
            f"{rank:>2} {row['model']:18s} {row['prompt']:12s} {row['n']:>3d} "
            f"{comp:>5s} {rel:>4s} {act:>4s} {tone:>4s} "
            f"{fmt:>4s} {p50:>6s} {p95:>6s} {div:>5s}"
        )

    # rows is non-empty here because cells was non-empty above.
    winner = rows[0]
    if winner["composite"] is not None:
        print()
        print(f"Winner: {winner['model']} × {winner['prompt']} "
              f"(composite={winner['composite']:.2f}, n={winner['n']})")
    return 0
def _p95(xs: list[float]) -> float:
if not xs:
return 0.0
s = sorted(xs)
idx = max(0, int(round(0.95 * (len(s) - 1))))
return s[idx]
# Script entry point: the integer returned by main() becomes the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())