"""Phase C — leaderboard from judged MLflow runs. Pulls every judged run (``judge_pending=false`` or any run with the composite metric set) from the experiment, groups by (model, prompt) cell, and prints a leaderboard sorted by mean composite score. Also reports the deterministic-only metrics (latency, format_ok) so cells with great prose but broken JSON are visible. """ from __future__ import annotations import argparse import os import statistics import sys from collections import defaultdict from pathlib import Path _BENCH = Path(__file__).resolve().parent sys.path.insert(0, str(_BENCH)) from mlflow_client import MLflowClient # type: ignore def _params(run: dict) -> dict[str, str]: return {p["key"]: p["value"] for p in run["data"].get("params", [])} def _metrics(run: dict) -> dict[str, float]: return {m["key"]: m["value"] for m in run["data"].get("metrics", [])} def _tags(run: dict) -> dict[str, str]: return {t["key"]: t["value"] for t in run["data"].get("tags", [])} def main() -> int: parser = argparse.ArgumentParser(description="oO bench — Phase C (leaderboard)") parser.add_argument("--experiment", required=True) parser.add_argument("--mlflow-url", default=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")) parser.add_argument("--include-pending", action="store_true", help="Also include rows with no quality scores (latency/format only).") args = parser.parse_args() client = MLflowClient( tracking_uri=args.mlflow_url, username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin", password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password", ) exp_id = client.get_or_create_experiment(args.experiment) runs = client.search_runs(exp_id, max_results=2000) # Group key = (model, prompt_version) cells: dict[tuple[str, str], list[dict]] = defaultdict(list) for r in runs: params = _params(r) metrics = _metrics(r) tags = _tags(r) if r["info"].get("status") != "FINISHED": continue if not args.include_pending and "composite" not in metrics: continue cells[(params.get("model", "?"), params.get("prompt_version", "?"))].append({ "metrics": metrics, "scenario": params.get("scenario_id", "?"), "judged": tags.get("judge_pending") == "false", }) if not cells: print("No judged runs found. Did you run judge_cli.py --apply?") return 1 rows = [] for (model, prompt), records in cells.items(): n = len(records) comp = [r["metrics"]["composite"] for r in records if "composite" in r["metrics"]] rel = [r["metrics"]["relevance"] for r in records if "relevance" in r["metrics"]] act = [r["metrics"]["actionability"] for r in records if "actionability" in r["metrics"]] tone = [r["metrics"]["tone"] for r in records if "tone" in r["metrics"]] lat = [r["metrics"]["latency_ms"] for r in records if "latency_ms" in r["metrics"]] fmt = [r["metrics"]["format_ok"] for r in records if "format_ok" in r["metrics"]] div = [r["metrics"]["mean_diversity"] for r in records if "mean_diversity" in r["metrics"]] rows.append({ "model": model, "prompt": prompt, "n": n, "composite": statistics.mean(comp) if comp else None, "relevance": statistics.mean(rel) if rel else None, "actionability": statistics.mean(act) if act else None, "tone": statistics.mean(tone) if tone else None, "format_ok": statistics.mean(fmt) if fmt else None, "latency_p50": statistics.median(lat) if lat else None, "latency_p95": _p95(lat) if lat else None, "diversity": statistics.mean(div) if div else None, }) rows.sort(key=lambda r: r["composite"] if r["composite"] is not None else -1, reverse=True) # Width-fitted printer — keeps output legible in a 100-col terminal. print() print(f"Experiment: {args.experiment} (id={exp_id})") print(f"Cells : {len(rows)}") print() header = ( f"{'#':>2} {'model':18s} {'prompt':12s} {'n':>3s} " f"{'comp':>5s} {'rel':>4s} {'act':>4s} {'tone':>4s} " f"{'fmt':>4s} {'p50':>6s} {'p95':>6s} {'div':>5s}" ) print(header) print("─" * len(header)) for i, r in enumerate(rows, 1): comp = f"{r['composite']:.2f}" if r["composite"] is not None else " -- " rel = f"{r['relevance']:.1f}" if r["relevance"] is not None else " -- " act = f"{r['actionability']:.1f}" if r["actionability"] is not None else " -- " tone = f"{r['tone']:.1f}" if r["tone"] is not None else " -- " fmt = f"{r['format_ok']:.2f}" if r["format_ok"] is not None else " -- " p50 = f"{r['latency_p50']:.0f}" if r["latency_p50"] is not None else " -- " p95 = f"{r['latency_p95']:.0f}" if r["latency_p95"] is not None else " -- " div = f"{r['diversity']:.2f}" if r["diversity"] is not None else " -- " print( f"{i:>2} {r['model']:18s} {r['prompt']:12s} {r['n']:>3d} " f"{comp:>5s} {rel:>4s} {act:>4s} {tone:>4s} " f"{fmt:>4s} {p50:>6s} {p95:>6s} {div:>5s}" ) if rows[0]["composite"] is not None: winner = rows[0] print() print(f"Winner: {winner['model']} × {winner['prompt']} " f"(composite={winner['composite']:.2f}, n={winner['n']})") return 0 def _p95(xs: list[float]) -> float: if not xs: return 0.0 s = sorted(xs) idx = max(0, int(round(0.95 * (len(s) - 1)))) return s[idx] if __name__ == "__main__": sys.exit(main())