"""Phase B — Claude Code as the lazy MLflow judge. Two sub-commands, both keyed to MLflow tags so the same run cycles through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly once. --export PATH Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code`` from the experiment, bundle the prompt + parsed candidates + the rubric into a single JSON file the Claude Code session can read. --apply PATH Read the responses (same shape as the request, with ``scores`` filled in) and log ``relevance``, ``actionability``, ``tone``, ``overlong`` as MLflow metrics on the corresponding runs. Sets ``judge_pending=false`` and stamps ``judged_at`` / ``judged_by`` so the run won't be picked up twice. The request file is intentionally one big JSON document, so the human judge sees the full set in one place and can score consistently. """ from __future__ import annotations import argparse import json import os import sys import time from pathlib import Path _BENCH = Path(__file__).resolve().parent sys.path.insert(0, str(_BENCH)) from mlflow_client import MLflowClient # type: ignore _DIMENSIONS = ("relevance", "actionability", "tone") _BIN_FLAGS = ("overlong",) def _tags_dict(run: dict) -> dict[str, str]: return {t["key"]: t["value"] for t in run.get("data", {}).get("tags", [])} def _params_dict(run: dict) -> dict[str, str]: return {p["key"]: p["value"] for p in run.get("data", {}).get("params", [])} def export(client: MLflowClient, experiment: str, out_path: str) -> int: exp_id = client.get_or_create_experiment(experiment) runs = client.search_runs( exp_id, filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'", ) if not runs: print("No pending runs.") Path(out_path).write_text(json.dumps({ "experiment": experiment, "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "rubric": "tip-v1", "items": [], }, indent=2)) return 0 rubric_text = (_BENCH / "rubric.md").read_text(encoding="utf-8") items: list[dict] = [] for run in runs: run_id = run["info"]["run_id"] tags = _tags_dict(run) params = _params_dict(run) candidates_json = client.get_artifact_text(run_id, "candidates.json") prompt_text = client.get_artifact_text(run_id, "prompt.txt") try: candidates = json.loads(candidates_json) if candidates_json else [] except json.JSONDecodeError: candidates = [] items.append({ "run_id": run_id, "model": params.get("model") or tags.get("model"), "prompt_version": params.get("prompt_version") or tags.get("prompt_version"), "scenario_id": params.get("scenario_id") or tags.get("scenario_id"), "persona": params.get("persona") or tags.get("persona"), "hour_of_day": int(params.get("hour_of_day", "12")), "day_of_week": int(params.get("day_of_week", "0")), "prompt": prompt_text, "candidates": candidates, # Per-run scoring slot — judge fills these in. "scores": { "relevance": None, # 1–5, integer "actionability": None, # 1–5, integer "tone": None, # 1–5, integer "overlong": None, # 0/1 "notes": "", # short comment, optional }, }) out = { "experiment": experiment, "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "rubric": "tip-v1", "rubric_md": rubric_text, "items": items, } Path(out_path).write_text(json.dumps(out, indent=2, ensure_ascii=False)) print(f"Exported {len(items)} pending runs → {out_path}") return 0 def apply(client: MLflowClient, experiment: str, in_path: str) -> int: exp_id = client.get_or_create_experiment(experiment) payload = json.loads(Path(in_path).read_text(encoding="utf-8")) items = payload.get("items", []) if not items: print("No items in response file.") return 0 judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) n_applied, n_skipped = 0, 0 for item in items: run_id = item["run_id"] scores = item.get("scores") or {} missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")] if missing: print(f" [skip] {run_id}: missing {missing}") n_skipped += 1 continue metrics = {d: float(scores[d]) for d in _DIMENSIONS} for f in _BIN_FLAGS: v = scores.get(f) if v not in (None, ""): metrics[f] = float(int(bool(int(v)))) # Composite mirrors rubric.md: relevance + actionability + tone # + 2 * format_ok - overlong. format_ok is already a metric on # the run from collect.py; re-fetching is cheap and keeps this # script idempotent if format compliance was retroactively fixed. run = client._get("/runs/get", {"run_id": run_id})["run"] existing_metrics = {m["key"]: m["value"] for m in run["data"].get("metrics", [])} format_ok = float(existing_metrics.get("format_ok", 0.0)) overlong = metrics.get("overlong", 0.0) composite = ( metrics["relevance"] + metrics["actionability"] + metrics["tone"] + 2 * format_ok - overlong ) metrics["composite"] = composite client.log_metrics(run_id, metrics) client.set_tags(run_id, { "judge_pending": "false", "judged_at": judged_at, "judged_by": "claude-code-session", }) if scores.get("notes"): client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000]) n_applied += 1 print(f" [ok] {run_id}: rel={metrics['relevance']:.1f} " f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} " f"comp={composite:.2f}") print(f"Applied {n_applied}, skipped {n_skipped}.") return 0 def main() -> int: parser = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)") parser.add_argument("--experiment", required=True) parser.add_argument("--mlflow-url", default=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")) grp = parser.add_mutually_exclusive_group(required=True) grp.add_argument("--export", metavar="PATH", help="Write pending runs as a judgment-request JSON file.") grp.add_argument("--apply", metavar="PATH", help="Read filled-in responses and write metrics back to MLflow.") args = parser.parse_args() client = MLflowClient( tracking_uri=args.mlflow_url, username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin", password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password", ) if args.export: return export(client, args.experiment, args.export) return apply(client, args.experiment, args.apply) if __name__ == "__main__": sys.exit(main())