feat(bench): MLflow-based tip-generation benchmark harness (#93, #95)

Combines model evaluation (#93) and prompt A/B testing (#95) into one experiment. Evaluates all (model × prompt × scenario) cells on the same fixed contexts so quality differences are attributable. Architecture: - Phase A (collect.py): generates candidates per cell, logs to MLflow with judge_pending=true. Rejects models >4B, uses keep_alive=0 for RAM safety (no concurrent model weights in VRAM). - Phase B (judge_cli.py): exports pending runs as JSON for Claude Code to score per the rubric, then applies scores back to MLflow. - Phase C (compare.py): leaderboard by (model, prompt) cell. Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone, plus format_ok and overlong flags. Composite = rel + act + tone + 2×format_ok − overlong. Rubric is self-describing and persisted in every run so judges use consistent criteria across sessions. Artifacts (prompts, candidates, raw responses) stored as MLflow tags because the server uses a file:// backend not accessible via REST. Full artifacts accessible in MLflow UI → run → Tags section. Tested end-to-end on local machine: - 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B - 3 prompts (v1, v2-mentor, v3-few-shot) - 4 scenarios (4 personas × 2 time-slots) - 48 cells total, all judged and ranked Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75). Ready for integration into Airflow prompt_ab_eval DAG and admin UI. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 11:48:59 +00:00
parent e40dfdcbb0
commit 556019b060
8 changed files with 1147 additions and 0 deletions
--- a/ml/experiments/bench/judge_cli.py
+++ b/ml/experiments/bench/judge_cli.py
@@ -0,0 +1,191 @@
+"""Phase B — Claude Code as the lazy MLflow judge.
+
+Two sub-commands, both keyed to MLflow tags so the same run cycles
+through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly
+once.
+
+  --export PATH
+      Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code``
+      from the experiment, bundle the prompt + parsed candidates + the
+      rubric into a single JSON file the Claude Code session can read.
+
+  --apply PATH
+      Read the responses (same shape as the request, with ``scores`` filled in)
+      and log ``relevance``, ``actionability``, ``tone``, ``overlong`` and the
+      derived ``composite`` as MLflow metrics on the corresponding runs. Sets
+      ``judge_pending=false`` and stamps ``judged_at`` / ``judged_by`` so the
+      run won't be picked up twice.
+
+The request file is intentionally one big JSON document, so the judge
+(a Claude Code session) sees the full set in one place and can score
+consistently.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+# Directory containing this script; ``rubric.md`` is read from here as well.
+_BENCH = Path(__file__).resolve().parent
+# Put that directory first on the import path so the bare ``mlflow_client``
+# import below resolves when this file is run directly as a script
+# (presumably ``mlflow_client.py`` sits next to this file — confirm).
+sys.path.insert(0, str(_BENCH))
+from mlflow_client import MLflowClient  # type: ignore
+
+
+# Score keys every judged item must provide; each is logged as a float metric.
+_DIMENSIONS = ("relevance", "actionability", "tone")
+# Optional score keys that are collapsed to 0.0 / 1.0 before being logged.
+_BIN_FLAGS = ("overlong",)
+
+
+def _tags_dict(run: dict) -> dict[str, str]:
+    """Flatten the ``data.tags`` list of a run payload into ``{key: value}``.
+
+    A run without ``data`` or without ``tags`` yields an empty dict; when a
+    key occurs more than once the last entry wins.
+    """
+    tag_entries = run.get("data", {}).get("tags", [])
+    flattened: dict[str, str] = {}
+    for entry in tag_entries:
+        name = entry["key"]
+        flattened[name] = entry["value"]
+    return flattened
+
+
+def _params_dict(run: dict) -> dict[str, str]:
+    """Flatten the ``data.params`` list of a run payload into ``{key: value}``.
+
+    A run without ``data`` or without ``params`` yields an empty dict; when a
+    key occurs more than once the last entry wins.
+    """
+    run_data = run.get("data", {})
+    param_entries = run_data.get("params", [])
+    return dict((entry["key"], entry["value"]) for entry in param_entries)
+
+
+def export(client: MLflowClient, experiment: str, out_path: str) -> int:
+    """Write all runs awaiting a Claude Code judgment to a JSON request file.
+
+    Searches *experiment* for runs tagged ``judge_pending=true`` and
+    ``judge_kind=claude-code``, then bundles each run's prompt, parsed
+    candidates and an empty ``scores`` slot together with the text of
+    ``rubric.md``. When nothing is pending, a file with an empty ``items``
+    list (and no ``rubric_md``) is still written.
+
+    Args:
+        client: MLflow client used to look up the experiment, its runs and
+            the per-run artifact text.
+        experiment: Name of the MLflow experiment to scan.
+        out_path: Destination of the request file; overwritten if present.
+
+    Returns:
+        Always ``0`` (used as the process exit code).
+
+    Raises:
+        OSError: If ``rubric.md`` cannot be read or *out_path* cannot be
+            written.
+        ValueError: If a run's ``hour_of_day`` / ``day_of_week`` param is not
+            an integer string.
+    """
+
+    def _write(payload: dict) -> None:
+        # Always UTF-8: the payload is dumped with ensure_ascii=False and
+        # ``apply`` decodes the response file as UTF-8, so the platform
+        # default encoding must not leak into the file.
+        Path(out_path).write_text(
+            json.dumps(payload, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+
+    exp_id = client.get_or_create_experiment(experiment)
+    runs = client.search_runs(
+        exp_id,
+        filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'",
+    )
+
+    # Header shared by the empty and the populated request file.
+    out: dict = {
+        "experiment": experiment,
+        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "rubric": "tip-v1",
+    }
+
+    if not runs:
+        print("No pending runs.")
+        _write({**out, "items": []})
+        return 0
+
+    out["rubric_md"] = (_BENCH / "rubric.md").read_text(encoding="utf-8")
+
+    items: list[dict] = []
+    for run in runs:
+        run_id = run["info"]["run_id"]
+        tags = _tags_dict(run)
+        params = _params_dict(run)
+        candidates_json = client.get_artifact_text(run_id, "candidates.json")
+        prompt_text = client.get_artifact_text(run_id, "prompt.txt")
+
+        candidates = []
+        if candidates_json:
+            try:
+                candidates = json.loads(candidates_json)
+            except json.JSONDecodeError as err:
+                # Still export the run, but tell the operator: otherwise the
+                # judge would unknowingly score an empty candidate list.
+                print(
+                    f"  [warn] {run_id}: candidates.json is not valid JSON "
+                    f"({err}); exporting no candidates",
+                    file=sys.stderr,
+                )
+
+        items.append({
+            "run_id": run_id,
+            # Identity fields prefer the logged param and fall back to the tag.
+            "model": params.get("model") or tags.get("model"),
+            "prompt_version": params.get("prompt_version") or tags.get("prompt_version"),
+            "scenario_id": params.get("scenario_id") or tags.get("scenario_id"),
+            "persona": params.get("persona") or tags.get("persona"),
+            "hour_of_day": int(params.get("hour_of_day", "12")),
+            "day_of_week": int(params.get("day_of_week", "0")),
+            "prompt": prompt_text,
+            "candidates": candidates,
+            # Per-run scoring slot — judge fills these in.
+            "scores": {
+                "relevance": None,        # 1–5, integer
+                "actionability": None,    # 1–5, integer
+                "tone": None,             # 1–5, integer
+                "overlong": None,         # 0/1
+                "notes": "",              # short comment, optional
+            },
+        })
+
+    out["items"] = items
+    _write(out)
+    print(f"Exported {len(items)} pending runs → {out_path}")
+    return 0
+
+
+def _scores_to_metrics(scores: dict) -> dict[str, float]:
+    """Convert a filled-in ``scores`` mapping into MLflow metric values.
+
+    Every key in ``_DIMENSIONS`` must be present and numeric within 1–5.
+    Keys in ``_BIN_FLAGS`` are optional and collapsed to ``0.0`` / ``1.0``.
+
+    Raises:
+        ValueError: If a value is not numeric or a dimension is out of range.
+    """
+    metrics: dict[str, float] = {}
+    for dim in _DIMENSIONS:
+        raw = scores[dim]
+        try:
+            value = float(raw)
+        except (TypeError, ValueError):
+            raise ValueError(f"{dim}={raw!r} is not a number") from None
+        # Written as a positive range test so NaN is rejected as well.
+        if not 1.0 <= value <= 5.0:
+            raise ValueError(f"{dim}={raw!r} is outside the 1-5 scale")
+        metrics[dim] = value
+
+    for flag in _BIN_FLAGS:
+        raw = scores.get(flag)
+        if raw in (None, ""):
+            continue
+        try:
+            metrics[flag] = float(bool(int(raw)))
+        except (TypeError, ValueError):
+            raise ValueError(f"{flag}={raw!r} is not 0/1") from None
+    return metrics
+
+
+def apply(client: MLflowClient, experiment: str, in_path: str) -> int:
+    """Write the judge's scores from a response file back to MLflow.
+
+    For every item whose scores are complete and valid, logs the dimension
+    scores, the optional ``overlong`` flag and the derived ``composite`` as
+    metrics, then tags the run ``judge_pending=false`` with ``judged_at`` /
+    ``judged_by`` (and ``judge_notes`` when notes were given). Items with a
+    missing ``run_id`` or missing / malformed / out-of-range scores are
+    reported and skipped, so one bad entry does not abort the whole batch.
+
+    Args:
+        client: MLflow client used to read and update the runs.
+        experiment: Name of the MLflow experiment the runs belong to.
+        in_path: Path of the UTF-8 JSON response file (request file with
+            ``scores`` filled in).
+
+    Returns:
+        Always ``0`` (used as the process exit code), even if items were
+        skipped.
+
+    Raises:
+        OSError: If *in_path* cannot be read.
+        json.JSONDecodeError: If *in_path* is not valid JSON.
+    """
+    # Return value is not needed here; the call is kept from the original
+    # flow (presumably it ensures the experiment exists — confirm).
+    client.get_or_create_experiment(experiment)
+    payload = json.loads(Path(in_path).read_text(encoding="utf-8"))
+    items = payload.get("items", [])
+    if not items:
+        print("No items in response file.")
+        return 0
+
+    judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    n_applied, n_skipped = 0, 0
+    for item in items:
+        run_id = item.get("run_id")
+        if not run_id:
+            print("  [skip] item without run_id")
+            n_skipped += 1
+            continue
+        scores = item.get("scores") or {}
+
+        missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")]
+        if missing:
+            print(f"  [skip] {run_id}: missing {missing}")
+            n_skipped += 1
+            continue
+
+        # Validate before touching MLflow so a bad score never leaves a run
+        # half-updated.
+        try:
+            metrics = _scores_to_metrics(scores)
+        except ValueError as err:
+            print(f"  [skip] {run_id}: {err}")
+            n_skipped += 1
+            continue
+
+        # Composite mirrors rubric.md: relevance + actionability + tone
+        # + 2 * format_ok - overlong.  format_ok is already a metric on
+        # the run from collect.py; re-fetching is cheap and keeps this
+        # script idempotent if format compliance was retroactively fixed.
+        run = client._get("/runs/get", {"run_id": run_id})["run"]
+        existing_metrics = {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
+        # A run without a format_ok metric is scored as non-compliant.
+        format_ok = float(existing_metrics.get("format_ok", 0.0))
+        overlong = metrics.get("overlong", 0.0)
+        composite = (
+            metrics["relevance"] + metrics["actionability"] + metrics["tone"]
+            + 2 * format_ok - overlong
+        )
+        metrics["composite"] = composite
+
+        client.log_metrics(run_id, metrics)
+        client.set_tags(run_id, {
+            "judge_pending": "false",
+            "judged_at": judged_at,
+            "judged_by": "claude-code-session",
+        })
+        if scores.get("notes"):
+            # Notes are truncated to 1000 characters before being stored.
+            client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000])
+
+        n_applied += 1
+        print(f"  [ok]   {run_id}: rel={metrics['relevance']:.1f} "
+              f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} "
+              f"comp={composite:.2f}")
+
+    print(f"Applied {n_applied}, skipped {n_skipped}.")
+    return 0
+
+
+def main() -> int:
+    """Parse the command line and run the requested Phase B sub-command.
+
+    Exactly one of ``--export PATH`` / ``--apply PATH`` must be given. The
+    tracking URL comes from ``--mlflow-url`` (default: ``MLFLOW_TRACKING_URI``
+    or ``http://localhost:5000``); credentials come from
+    ``MLFLOW_TRACKING_USERNAME`` / ``MLFLOW_TRACKING_PASSWORD``.
+
+    Returns:
+        The exit code returned by ``export`` or ``apply``.
+    """
+    default_url = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
+
+    cli = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)")
+    cli.add_argument("--experiment", required=True)
+    cli.add_argument("--mlflow-url", default=default_url)
+    mode = cli.add_mutually_exclusive_group(required=True)
+    mode.add_argument(
+        "--export",
+        metavar="PATH",
+        help="Write pending runs as a judgment-request JSON file.",
+    )
+    mode.add_argument(
+        "--apply",
+        metavar="PATH",
+        help="Read filled-in responses and write metrics back to MLflow.",
+    )
+    opts = cli.parse_args()
+
+    # An unset or empty environment variable falls back to the built-in value.
+    # NOTE(review): hard-coded fallback credentials — confirm they are only
+    # valid for a local development server.
+    user = os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin"
+    secret = os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password"
+    client = MLflowClient(
+        tracking_uri=opts.mlflow_url,
+        username=user,
+        password=secret,
+    )
+
+    if not opts.export:
+        return apply(client, opts.experiment, opts.apply)
+    return export(client, opts.experiment, opts.export)
+
+
+# Script entry point: main()'s return value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main())