# oO/ml/experiments/bench/judge_cli.py

"""Phase B — Claude Code as the lazy MLflow judge.

Two sub-commands, both keyed to MLflow tags so the same run cycles
through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly
once.

  --export PATH
      Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code``
      from the experiment, bundle the prompt + parsed candidates + the
      rubric into a single JSON file the Claude Code session can read.

  --apply PATH
      Read the responses (same shape as the request, with ``scores`` filled in)
      and log ``relevance``, ``actionability``, ``tone``, ``overlong`` and the
      derived ``composite`` as MLflow metrics on the corresponding runs. Sets
      ``judge_pending=false`` and stamps ``judged_at`` / ``judged_by`` so the
      run won't be picked up twice; any ``notes`` are stored in the
      ``judge_notes`` tag.

The request file is intentionally one big JSON document, so the human
judge sees the full set in one place and can score consistently.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import time
from pathlib import Path

# Directory containing this script; export() also reads rubric.md from here.
_BENCH = Path(__file__).resolve().parent
# Put that directory first on sys.path before importing mlflow_client
# (presumably a sibling module in this directory — confirm), so the import
# resolves when this file is run directly as a script.
sys.path.insert(0, str(_BENCH))
from mlflow_client import MLflowClient  # type: ignore


# Score keys that apply() requires on every item; each is logged as a float
# metric and summed into the composite.
_DIMENSIONS = (
    "relevance",
    "actionability",
    "tone",
)
# Optional flags that apply() normalises to 0.0 / 1.0 when the judge set them.
_BIN_FLAGS = (
    "overlong",
)


def _tags_dict(run: dict) -> dict[str, str]:
    """Flatten the ``data.tags`` list of a run payload into a ``{key: value}`` dict.

    A run with no ``data`` or no ``tags`` entry yields an empty dict. When the
    same key occurs more than once, the last entry wins.
    """
    tag_entries = run.get("data", {}).get("tags", [])
    flat = {}
    for entry in tag_entries:
        name = entry["key"]
        flat[name] = entry["value"]
    return flat


def _params_dict(run: dict) -> dict[str, str]:
    """Flatten the ``data.params`` list of a run payload into a ``{key: value}`` dict.

    A run with no ``data`` or no ``params`` entry yields an empty dict. When
    the same key occurs more than once, the last entry wins.
    """
    data_section = run.get("data", {})
    flat = {}
    for entry in data_section.get("params", []):
        name = entry["key"]
        flat[name] = entry["value"]
    return flat


def export(client: MLflowClient, experiment: str, out_path: str) -> int:
    """Write every pending claude-code run to a judgment-request JSON file.

    Searches ``experiment`` for runs tagged ``judge_pending=true`` and
    ``judge_kind=claude-code``. For each run it bundles the ``prompt.txt``
    artifact, the parsed ``candidates.json`` artifact, identifying
    params/tags and an empty ``scores`` slot; the rubric text from
    ``rubric.md`` next to this script is attached once at the top level.
    When nothing is pending, a document with an empty ``items`` list (and no
    ``rubric_md``) is still written.

    The file is always written as UTF-8. The document is serialised with
    ``ensure_ascii=False`` and ``apply`` reads it back as UTF-8, so relying
    on the platform default encoding could fail or produce an unreadable
    file on systems whose locale is not UTF-8.

    Args:
        client: MLflow client used to resolve the experiment, search runs and
            fetch artifacts.
        experiment: Experiment name, passed to
            ``client.get_or_create_experiment``.
        out_path: Destination path of the request JSON file.

    Returns:
        Always ``0`` (used as the process exit code).

    Raises:
        ValueError: If a run's ``hour_of_day`` or ``day_of_week`` param is
            not an integer string.
    """
    exp_id = client.get_or_create_experiment(experiment)
    runs = client.search_runs(
        exp_id,
        filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'",
    )
    destination = Path(out_path)
    if not runs:
        print("No pending runs.")
        empty_request = {
            "experiment": experiment,
            "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "rubric": "tip-v1",
            "items": [],
        }
        destination.write_text(json.dumps(empty_request, indent=2), encoding="utf-8")
        return 0

    rubric_text = (_BENCH / "rubric.md").read_text(encoding="utf-8")

    items: list[dict] = []
    for run in runs:
        run_id = run["info"]["run_id"]
        tags = _tags_dict(run)
        params = _params_dict(run)
        candidates_json = client.get_artifact_text(run_id, "candidates.json")
        prompt_text = client.get_artifact_text(run_id, "prompt.txt")
        try:
            candidates = json.loads(candidates_json) if candidates_json else []
        except json.JSONDecodeError as err:
            # Still export the run so it can be judged, but make the data
            # problem visible instead of silently dropping the candidates.
            print(f"  [warn] {run_id}: candidates.json is not valid JSON ({err}); "
                  "exporting with no candidates")
            candidates = []

        items.append({
            "run_id": run_id,
            # Params take precedence; tags are the fallback for each field.
            "model": params.get("model") or tags.get("model"),
            "prompt_version": params.get("prompt_version") or tags.get("prompt_version"),
            "scenario_id": params.get("scenario_id") or tags.get("scenario_id"),
            "persona": params.get("persona") or tags.get("persona"),
            "hour_of_day": int(params.get("hour_of_day", "12")),
            "day_of_week": int(params.get("day_of_week", "0")),
            "prompt": prompt_text,
            "candidates": candidates,
            # Per-run scoring slot — judge fills these in.
            "scores": {
                "relevance": None,        # 1–5, integer
                "actionability": None,    # 1–5, integer
                "tone": None,             # 1–5, integer
                "overlong": None,         # 0/1
                "notes": "",              # short comment, optional
            },
        })

    out = {
        "experiment": experiment,
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "rubric": "tip-v1",
        "rubric_md": rubric_text,
        "items": items,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned to the one apply() reads with.
    destination.write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"Exported {len(items)} pending runs → {out_path}")
    return 0


def apply(client: MLflowClient, experiment: str, in_path: str) -> int:
    """Log the judge's scores from a filled-in response file onto MLflow runs.

    For every item whose ``scores`` are complete and numeric, logs
    ``relevance``, ``actionability``, ``tone``, the optional ``overlong``
    flag and a derived ``composite`` as metrics on the run, then tags the run
    ``judge_pending=false`` with ``judged_at`` / ``judged_by`` and, when
    notes are present, ``judge_notes`` (truncated to 1000 characters).

    Items that cannot be applied are reported with a ``[skip]`` line and
    counted instead of aborting the batch: an item with no ``run_id``, an
    item missing any required dimension, or an item whose score values are
    not numeric. Skipping (rather than raising) matters because earlier
    items in the same file have already been written to MLflow by the time a
    bad item is reached.

    Args:
        client: MLflow client used to read existing metrics and write
            metrics and tags.
        experiment: Experiment name, passed to
            ``client.get_or_create_experiment``.
        in_path: Path of the UTF-8 response JSON file (same shape as the
            request file, with ``scores`` filled in).

    Returns:
        Always ``0`` (used as the process exit code), including when items
        were skipped.
    """
    # The experiment id itself is not needed (runs are addressed by run_id);
    # the lookup is kept so the experiment is still resolved up front.
    client.get_or_create_experiment(experiment)
    payload = json.loads(Path(in_path).read_text(encoding="utf-8"))
    items = payload.get("items", [])
    if not items:
        print("No items in response file.")
        return 0

    judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    n_applied, n_skipped = 0, 0
    for item in items:
        run_id = item.get("run_id")
        if not run_id:
            print("  [skip] <no run_id>: item has no run_id")
            n_skipped += 1
            continue
        scores = item.get("scores") or {}

        missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")]
        if missing:
            print(f"  [skip] {run_id}: missing {missing}")
            n_skipped += 1
            continue

        try:
            metrics = {d: float(scores[d]) for d in _DIMENSIONS}
            for flag in _BIN_FLAGS:
                raw = scores.get(flag)
                if raw not in (None, ""):
                    # Normalise any non-zero integer to 1.0.
                    metrics[flag] = float(int(bool(int(raw))))
        except (TypeError, ValueError) as err:
            print(f"  [skip] {run_id}: invalid score value ({err})")
            n_skipped += 1
            continue

        # Composite mirrors rubric.md: relevance + actionability + tone
        # + 2 * format_ok - overlong.  format_ok is already a metric on
        # the run from collect.py; re-fetching is cheap and keeps this
        # script idempotent if format compliance was retroactively fixed.
        run = client._get("/runs/get", {"run_id": run_id})["run"]
        existing_metrics = {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
        # A run with no format_ok metric contributes 0 to the composite.
        format_ok = float(existing_metrics.get("format_ok", 0.0))
        overlong = metrics.get("overlong", 0.0)
        composite = (
            metrics["relevance"] + metrics["actionability"] + metrics["tone"]
            + 2 * format_ok - overlong
        )
        metrics["composite"] = composite

        client.log_metrics(run_id, metrics)
        client.set_tags(run_id, {
            "judge_pending": "false",
            "judged_at": judged_at,
            "judged_by": "claude-code-session",
        })
        if scores.get("notes"):
            client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000])

        n_applied += 1
        print(f"  [ok]   {run_id}: rel={metrics['relevance']:.1f} "
              f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} "
              f"comp={composite:.2f}")

    print(f"Applied {n_applied}, skipped {n_skipped}.")
    return 0


def main() -> int:
    """Parse the command line and run either the export or the apply step.

    ``--experiment`` is required, and exactly one of ``--export PATH`` /
    ``--apply PATH`` must be given. The tracking URL defaults to
    ``MLFLOW_TRACKING_URI`` or ``http://localhost:5000``; credentials come
    from ``MLFLOW_TRACKING_USERNAME`` / ``MLFLOW_TRACKING_PASSWORD``, falling
    back to ``admin`` / ``password`` when unset or empty.

    Returns:
        The exit code returned by ``export`` or ``apply``.
    """
    default_url = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")

    cli = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)")
    cli.add_argument("--experiment", required=True)
    cli.add_argument("--mlflow-url", default=default_url)

    # Exactly one of the two modes has to be selected.
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--export",
        metavar="PATH",
        help="Write pending runs as a judgment-request JSON file.",
    )
    mode.add_argument(
        "--apply",
        metavar="PATH",
        help="Read filled-in responses and write metrics back to MLflow.",
    )
    opts = cli.parse_args()

    # `or` (not a .get default) so an empty env var also falls back.
    user = os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin"
    secret = os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password"
    client = MLflowClient(
        tracking_uri=opts.mlflow_url,
        username=user,
        password=secret,
    )

    if not opts.export:
        return apply(client, opts.experiment, opts.apply)
    return export(client, opts.experiment, opts.export)


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())