Files
oO/ml/experiments/bench/judge_cli.py
alvis 556019b060 feat(bench): MLflow-based tip-generation benchmark harness (#93, #95)
Combines model evaluation (#93) and prompt A/B testing (#95) into one
experiment. Evaluates all (model × prompt × scenario) cells on the same
fixed contexts so quality differences are attributable.

Architecture:
- Phase A (collect.py): generates candidates per cell, logs to MLflow
  with judge_pending=true. Rejects models >4B, uses keep_alive=0 for
  RAM safety (no concurrent model weights in VRAM).
- Phase B (judge_cli.py): exports pending runs as JSON for Claude Code
  to score per the rubric, then applies scores back to MLflow.
- Phase C (compare.py): leaderboard by (model, prompt) cell.

Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone,
plus format_ok and overlong flags. Composite = rel + act + tone +
2×format_ok − overlong. Rubric is self-describing and persisted in every
run so judges use consistent criteria across sessions.

Artifacts (prompts, candidates, raw responses) stored as MLflow tags
because the server uses a file:// backend not accessible via REST. Full
artifacts accessible in MLflow UI → run → Tags section.

Tested end-to-end on local machine:
- 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B
- 3 prompts (v1, v2-mentor, v3-few-shot)
- 4 scenarios (4 personas × 2 time-slots)
- 48 cells total, all judged and ranked

Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75).

Ready for integration into Airflow prompt_ab_eval DAG and admin UI.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-27 11:48:59 +00:00

192 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase B — Claude Code as the lazy MLflow judge.
Two sub-commands, both keyed to MLflow tags so the same run cycles
through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly
once.
--export PATH
Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code``
from the experiment, bundle the prompt + parsed candidates + the
rubric into a single JSON file the Claude Code session can read.
--apply PATH
Read the responses (same shape as the request, with ``scores`` filled in)
and log ``relevance``, ``actionability``, ``tone``, ``overlong`` (when
provided) and the derived ``composite`` as MLflow metrics on the
corresponding runs. Sets ``judge_pending=false`` and stamps ``judged_at`` /
``judged_by`` so the run won't be picked up twice; non-empty ``notes`` are
stored in the ``judge_notes`` tag.
The request file is intentionally one big JSON document, so the human
judge sees the full set in one place and can score consistently.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
# Directory that contains this file; also used below to locate rubric.md.
_BENCH = Path(__file__).resolve().parent
# Put that directory first on sys.path before importing mlflow_client —
# presumably the module lives next to this file and must resolve when the
# script is run directly (confirm against the repo layout).
sys.path.insert(0, str(_BENCH))
from mlflow_client import MLflowClient # type: ignore
# Score keys that must be filled in for a run to be applied.
_DIMENSIONS = (
    "relevance",
    "actionability",
    "tone",
)
# Optional 0/1 flags; logged only when the judge supplied a value.
_BIN_FLAGS = (
    "overlong",
)
def _tags_dict(run: dict) -> dict[str, str]:
    """Flatten the tag list of a run payload into a ``{key: value}`` dict.

    Reads ``run["data"]["tags"]``; when either level is absent the result is
    an empty dict. If a key occurs more than once, the last entry wins.
    """
    tag_entries = run.get("data", {}).get("tags", [])
    flattened: dict[str, str] = {}
    for entry in tag_entries:
        flattened[entry["key"]] = entry["value"]
    return flattened
def _params_dict(run: dict) -> dict[str, str]:
    """Flatten the param list of a run payload into a ``{key: value}`` dict.

    Reads ``run["data"]["params"]``; when either level is absent the result
    is an empty dict. If a key occurs more than once, the last entry wins.
    """
    data_section = run.get("data", {})
    param_entries = data_section.get("params", [])
    return dict((entry["key"], entry["value"]) for entry in param_entries)
def _int_or_default(raw: object, default: int) -> int:
    """Return ``int(raw)``, or *default* when *raw* is missing or malformed."""
    if raw is None or raw == "":
        return default
    try:
        return int(raw)
    except (TypeError, ValueError):
        return default


def _export_item(client: MLflowClient, run: dict) -> dict:
    """Build the judgment-request entry for a single pending run."""
    run_id = run["info"]["run_id"]
    tags = _tags_dict(run)
    params = _params_dict(run)

    def _field(key: str) -> str | None:
        # Params take precedence; tags are the fallback for every field.
        return params.get(key) or tags.get(key)

    candidates_json = client.get_artifact_text(run_id, "candidates.json")
    prompt_text = client.get_artifact_text(run_id, "prompt.txt")
    try:
        candidates = json.loads(candidates_json) if candidates_json else []
    except json.JSONDecodeError:
        # Still export the run so it can be judged, but make the data
        # problem visible instead of silently presenting zero candidates.
        print(f" [warn] {run_id}: candidates.json is not valid JSON; "
              f"exporting no candidates", file=sys.stderr)
        candidates = []

    return {
        "run_id": run_id,
        "model": _field("model"),
        "prompt_version": _field("prompt_version"),
        "scenario_id": _field("scenario_id"),
        "persona": _field("persona"),
        "hour_of_day": _int_or_default(_field("hour_of_day"), 12),
        "day_of_week": _int_or_default(_field("day_of_week"), 0),
        "prompt": prompt_text,
        "candidates": candidates,
        # Per-run scoring slot — judge fills these in.
        "scores": {
            "relevance": None,      # 1-5, integer
            "actionability": None,  # 1-5, integer
            "tone": None,           # 1-5, integer
            "overlong": None,       # 0/1
            "notes": "",            # short comment, optional
        },
    }


def export(client: MLflowClient, experiment: str, out_path: str) -> int:
    """Write all pending ``claude-code`` runs to a judgment-request JSON file.

    Searches *experiment* for runs tagged ``judge_pending = 'true'`` and
    ``judge_kind = 'claude-code'`` and writes one JSON document to
    *out_path*. When at least one run matches, the document also embeds the
    text of ``rubric.md`` (read from this file's directory) under
    ``rubric_md``. When nothing matches, a document with an empty ``items``
    list is still written.

    The file is always written as UTF-8 because ``apply`` reads it back as
    UTF-8 and the payload is serialized with ``ensure_ascii=False``.

    Args:
        client: MLflow client used to look up the experiment, search runs
            and fetch the ``candidates.json`` / ``prompt.txt`` texts.
        experiment: Experiment name passed to ``get_or_create_experiment``.
        out_path: Destination path of the request file.

    Returns:
        Always ``0``.
    """
    exp_id = client.get_or_create_experiment(experiment)
    runs = client.search_runs(
        exp_id,
        filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'",
    )

    document: dict = {
        "experiment": experiment,
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "rubric": "tip-v1",
    }
    if not runs:
        print("No pending runs.")
        document["items"] = []
    else:
        document["rubric_md"] = (_BENCH / "rubric.md").read_text(encoding="utf-8")
        document["items"] = [_export_item(client, run) for run in runs]

    Path(out_path).write_text(
        json.dumps(document, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    if runs:
        print(f"Exported {len(document['items'])} pending runs → {out_path}")
    return 0
def _parse_score(value: object) -> float | None:
    """Return *value* as a float when it is a number in 1-5, else ``None``."""
    if isinstance(value, bool):
        # JSON true/false is not a rubric score even though float() accepts it.
        return None
    try:
        score = float(value)
    except (TypeError, ValueError):
        return None
    # The comparison is also False for NaN, so NaN is rejected here.
    return score if 1.0 <= score <= 5.0 else None


def _parse_flag(value: object) -> float | None:
    """Return ``1.0``/``0.0`` for a binary flag, or ``None`` if unparseable."""
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ("true", "yes"):
            return 1.0
        if text in ("false", "no"):
            return 0.0
    try:
        return float(bool(int(value)))
    except (TypeError, ValueError, OverflowError):
        return None


def apply(client: MLflowClient, experiment: str, in_path: str) -> int:
    """Log the judge's scores from *in_path* onto the corresponding runs.

    For every item whose ``scores`` are complete and valid, logs the rubric
    dimensions, any supplied binary flags and a ``composite`` metric, then
    sets ``judge_pending=false`` plus ``judged_at`` / ``judged_by`` tags.
    Non-empty ``notes`` are stored in a ``judge_notes`` tag, truncated to
    1000 characters.

    Items are skipped (with a message) rather than aborting the batch when
    they have no ``run_id``, are missing a dimension, or carry a value that
    is not a number in the rubric's 1-5 range or not a usable 0/1 flag.

    Args:
        client: MLflow client used to read and update the runs.
        experiment: Experiment name passed to ``get_or_create_experiment``.
        in_path: Path of the filled-in request file (read as UTF-8).

    Returns:
        Always ``0``.
    """
    # Kept for its lookup/creation side effect; runs are addressed by run_id,
    # so the returned experiment id itself is not needed.
    client.get_or_create_experiment(experiment)
    payload = json.loads(Path(in_path).read_text(encoding="utf-8"))
    items = payload.get("items", [])
    if not items:
        print("No items in response file.")
        return 0

    judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    n_applied, n_skipped = 0, 0
    for item in items:
        run_id = item.get("run_id")
        if not run_id:
            print(" [skip] item without run_id")
            n_skipped += 1
            continue

        scores = item.get("scores") or {}
        missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")]
        if missing:
            print(f" [skip] {run_id}: missing {missing}")
            n_skipped += 1
            continue

        parsed = {d: _parse_score(scores[d]) for d in _DIMENSIONS}
        invalid = [d for d, value in parsed.items() if value is None]
        flags: dict[str, float] = {}
        for f in _BIN_FLAGS:
            v = scores.get(f)
            if v in (None, ""):
                continue  # flag not supplied: not logged, counts as 0 below
            flag = _parse_flag(v)
            if flag is None:
                invalid.append(f)
            else:
                flags[f] = flag
        if invalid:
            # Validate everything before touching MLflow so a bad value never
            # leaves a run half-updated or aborts the rest of the batch.
            print(f" [skip] {run_id}: invalid {invalid}")
            n_skipped += 1
            continue

        metrics: dict[str, float] = {**parsed, **flags}

        # Composite mirrors rubric.md: relevance + actionability + tone
        # + 2 * format_ok - overlong. format_ok is already a metric on
        # the run from collect.py; re-fetching is cheap and keeps this
        # script idempotent if format compliance was retroactively fixed.
        # NOTE(review): this reaches into the client's private _get helper.
        run = client._get("/runs/get", {"run_id": run_id})["run"]
        existing_metrics = {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
        format_ok = float(existing_metrics.get("format_ok", 0.0))
        overlong = metrics.get("overlong", 0.0)
        composite = (
            metrics["relevance"] + metrics["actionability"] + metrics["tone"]
            + 2 * format_ok - overlong
        )
        metrics["composite"] = composite

        client.log_metrics(run_id, metrics)
        client.set_tags(run_id, {
            "judge_pending": "false",
            "judged_at": judged_at,
            "judged_by": "claude-code-session",
        })
        if scores.get("notes"):
            client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000])
        n_applied += 1
        print(f" [ok] {run_id}: rel={metrics['relevance']:.1f} "
              f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} "
              f"comp={composite:.2f}")

    print(f"Applied {n_applied}, skipped {n_skipped}.")
    return 0
def main() -> int:
    """Parse the command line and run the selected Phase B sub-command.

    ``--experiment`` is required, and exactly one of ``--export PATH`` /
    ``--apply PATH`` must be given. The tracking URL defaults to the
    ``MLFLOW_TRACKING_URI`` environment variable, then to
    ``http://localhost:5000``.

    Returns:
        The exit status of ``export`` or ``apply``.
    """
    cli = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)")
    cli.add_argument("--experiment", required=True)
    default_url = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
    cli.add_argument("--mlflow-url", default=default_url)

    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--export",
        metavar="PATH",
        help="Write pending runs as a judgment-request JSON file.",
    )
    mode.add_argument(
        "--apply",
        metavar="PATH",
        help="Read filled-in responses and write metrics back to MLflow.",
    )
    opts = cli.parse_args()

    # NOTE: when the credential variables are unset or empty, fixed fallback
    # credentials are used.
    user = os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin"
    secret = os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password"
    client = MLflowClient(
        tracking_uri=opts.mlflow_url,
        username=user,
        password=secret,
    )

    if not opts.export:
        return apply(client, opts.experiment, opts.apply)
    return export(client, opts.experiment, opts.export)


if __name__ == "__main__":
    sys.exit(main())