Combines model evaluation (#93) and prompt A/B testing (#95) into one experiment. Evaluates all (model × prompt × scenario) cells on the same fixed contexts so quality differences are attributable. Architecture: - Phase A (collect.py): generates candidates per cell, logs to MLflow with judge_pending=true. Rejects models >4B, uses keep_alive=0 for RAM safety (no concurrent model weights in VRAM). - Phase B (judge_cli.py): exports pending runs as JSON for Claude Code to score per the rubric, then applies scores back to MLflow. - Phase C (compare.py): leaderboard by (model, prompt) cell. Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone, plus format_ok and overlong flags. Composite = rel + act + tone + 2×format_ok − overlong. Rubric is self-describing and persisted in every run so judges use consistent criteria across sessions. Artifacts (prompts, candidates, raw responses) stored as MLflow tags because the server uses a file:// backend not accessible via REST. Full artifacts accessible in MLflow UI → run → Tags section. Tested end-to-end on local machine: - 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B - 3 prompts (v1, v2-mentor, v3-few-shot) - 4 scenarios (4 personas × 2 time-slots) - 48 cells total, all judged and ranked Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75). Ready for integration into Airflow prompt_ab_eval DAG and admin UI. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
191
ml/experiments/bench/judge_cli.py
Normal file
191
ml/experiments/bench/judge_cli.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""Phase B — Claude Code as the lazy MLflow judge.
|
||||
|
||||
Two sub-commands, both keyed to MLflow tags so the same run cycles
|
||||
through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly
|
||||
once.
|
||||
|
||||
--export PATH
|
||||
Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code``
|
||||
from the experiment, bundle the prompt + parsed candidates + the
|
||||
rubric into a single JSON file the Claude Code session can read.
|
||||
|
||||
--apply PATH
|
||||
Read the responses (same shape as the request, with ``scores`` filled in)
|
||||
and log ``relevance``, ``actionability``, ``tone``, ``overlong`` (plus the derived ``composite``) as
|
||||
MLflow metrics on the corresponding runs. Sets ``judge_pending=false``
|
||||
and stamps ``judged_at`` / ``judged_by`` so the run won't be picked up
|
||||
twice.
|
||||
|
||||
The request file is intentionally one big JSON document, so the
|
||||
judge sees the full set in one place and can score consistently.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
_BENCH = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(_BENCH))
|
||||
from mlflow_client import MLflowClient # type: ignore
|
||||
|
||||
|
||||
# Score names the judge must fill in for a run; apply() skips any item where
# one of these is missing and sums all three into the composite metric.
_DIMENSIONS = ("relevance", "actionability", "tone")
|
||||
# Optional flags in the judge's scores; when present, apply() collapses each
# one to 0.0 or 1.0 before logging it as a metric.
_BIN_FLAGS = ("overlong",)
|
||||
|
||||
|
||||
def _tags_dict(run: dict) -> dict[str, str]:
|
||||
return {t["key"]: t["value"] for t in run.get("data", {}).get("tags", [])}
|
||||
|
||||
|
||||
def _params_dict(run: dict) -> dict[str, str]:
|
||||
return {p["key"]: p["value"] for p in run.get("data", {}).get("params", [])}
|
||||
|
||||
|
||||
def _write_request(out_path: str, document: dict) -> None:
    """Serialise *document* as indented JSON to *out_path*, always as UTF-8.

    The encoding is pinned because ``apply`` reads the file back as UTF-8;
    relying on the platform default could produce a file it cannot decode.
    """
    Path(out_path).write_text(
        json.dumps(document, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )


def export(client: MLflowClient, experiment: str, out_path: str) -> int:
    """Write every pending Claude-Code-judged run to a judgment-request file.

    Searches *experiment* for runs tagged ``judge_pending=true`` and
    ``judge_kind=claude-code`` and writes one JSON document to *out_path*.
    Each item carries the run's identifying params, its prompt, its parsed
    candidates and an empty ``scores`` slot for the judge to fill in.

    When nothing is pending, a document with an empty ``items`` list is
    still written; in that case the rubric file is not read and the
    ``rubric_md`` key is omitted.

    Args:
        client: MLflow client used to look up the experiment, search runs
            and fetch each run's ``candidates.json`` / ``prompt.txt`` text.
        experiment: Experiment name.
        out_path: Destination path of the JSON request file.

    Returns:
        Always 0.
    """
    exp_id = client.get_or_create_experiment(experiment)
    runs = client.search_runs(
        exp_id,
        filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'",
    )
    if not runs:
        print("No pending runs.")
        _write_request(out_path, {
            "experiment": experiment,
            "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "rubric": "tip-v1",
            "items": [],
        })
        return 0

    # The rubric text travels with the request so the judge scores against
    # the same criteria that sit next to this script.
    rubric_text = (_BENCH / "rubric.md").read_text(encoding="utf-8")

    items: list[dict] = []
    for run in runs:
        run_id = run["info"]["run_id"]
        tags = _tags_dict(run)
        params = _params_dict(run)
        candidates_json = client.get_artifact_text(run_id, "candidates.json")
        prompt_text = client.get_artifact_text(run_id, "prompt.txt")
        try:
            candidates = json.loads(candidates_json) if candidates_json else []
        except json.JSONDecodeError as err:
            # Best effort: still export the run, but say why it has no
            # candidates instead of hiding the corrupt payload.
            print(
                f"  [warn] {run_id}: candidates.json is not valid JSON ({err}); "
                "exporting an empty candidate list",
                file=sys.stderr,
            )
            candidates = []

        items.append({
            "run_id": run_id,
            # Params take precedence; tags are the fallback source.
            "model": params.get("model") or tags.get("model"),
            "prompt_version": params.get("prompt_version") or tags.get("prompt_version"),
            "scenario_id": params.get("scenario_id") or tags.get("scenario_id"),
            "persona": params.get("persona") or tags.get("persona"),
            "hour_of_day": int(params.get("hour_of_day", "12")),
            "day_of_week": int(params.get("day_of_week", "0")),
            "prompt": prompt_text,
            "candidates": candidates,
            # Per-run scoring slot — judge fills these in.
            "scores": {
                "relevance": None,       # 1–5, integer
                "actionability": None,   # 1–5, integer
                "tone": None,            # 1–5, integer
                "overlong": None,        # 0/1
                "notes": "",             # short comment, optional
            },
        })

    _write_request(out_path, {
        "experiment": experiment,
        "exported_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "rubric": "tip-v1",
        "rubric_md": rubric_text,
        "items": items,
    })
    print(f"Exported {len(items)} pending runs → {out_path}")
    return 0
|
||||
|
||||
|
||||
def _metrics_from_scores(scores: dict) -> dict[str, float]:
    """Convert a judge's raw ``scores`` mapping into MLflow metric values.

    Every name in ``_DIMENSIONS`` must be numeric and lie on the rubric's
    1–5 scale. Every name in ``_BIN_FLAGS`` that is present (not ``None`` or
    ``""``) is collapsed to 0.0 or 1.0.

    Raises:
        ValueError: if a value cannot be converted or is off the 1–5 scale.
    """
    metrics: dict[str, float] = {}
    for dim in _DIMENSIONS:
        raw = scores[dim]
        try:
            value = float(raw)
        except (TypeError, ValueError):
            raise ValueError(f"{dim}={raw!r} is not a number") from None
        # Written as a chained comparison so NaN is rejected as well.
        if not 1.0 <= value <= 5.0:
            raise ValueError(f"{dim}={raw!r} is outside the 1-5 scale")
        metrics[dim] = value

    for flag in _BIN_FLAGS:
        raw = scores.get(flag)
        if raw in (None, ""):
            continue
        try:
            metrics[flag] = float(int(bool(int(raw))))
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"{flag}={raw!r} is not a 0/1 flag") from None
    return metrics


def apply(client: MLflowClient, experiment: str, in_path: str) -> int:
    """Write the judge's scores from a response file back onto MLflow runs.

    For each item in the file's ``items`` list, logs the rubric dimensions,
    any binary flags and a ``composite`` metric on the run, then tags the
    run ``judge_pending=false`` with ``judged_at`` / ``judged_by`` so the
    ``--export`` filter no longer matches it. Non-empty ``notes`` are stored
    in the ``judge_notes`` tag, truncated to 1000 characters.

    An item is skipped (and reported) rather than aborting the batch when it
    has no ``run_id``, lacks a dimension score, or carries a score that is
    non-numeric or outside the 1–5 scale. Runs processed earlier in the same
    file are unaffected by a later bad item.

    Args:
        client: MLflow client used to read and update runs.
        experiment: Experiment name.
        in_path: Path of the filled-in JSON response file (UTF-8).

    Returns:
        Always 0, including when items were skipped.
    """
    # The returned id is not needed (runs are addressed by run_id), but the
    # call is kept so behaviour against the tracking server is unchanged.
    client.get_or_create_experiment(experiment)
    payload = json.loads(Path(in_path).read_text(encoding="utf-8"))
    items = payload.get("items", [])
    if not items:
        print("No items in response file.")
        return 0

    judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    n_applied, n_skipped = 0, 0
    for index, item in enumerate(items):
        run_id = item.get("run_id") if isinstance(item, dict) else None
        if not run_id:
            print(f"  [skip] item #{index}: no run_id")
            n_skipped += 1
            continue

        scores = item.get("scores") or {}
        if not isinstance(scores, dict):
            # A malformed slot is treated like an unfilled one.
            scores = {}

        missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")]
        if missing:
            print(f"  [skip] {run_id}: missing {missing}")
            n_skipped += 1
            continue

        try:
            metrics = _metrics_from_scores(scores)
        except ValueError as err:
            print(f"  [skip] {run_id}: {err}")
            n_skipped += 1
            continue

        # Composite mirrors rubric.md: relevance + actionability + tone
        # + 2 * format_ok - overlong. format_ok is already a metric on
        # the run from collect.py; re-fetching is cheap and keeps this
        # script idempotent if format compliance was retroactively fixed.
        # NOTE(review): this reaches into the client's private ``_get``; a
        # public run-lookup method would be preferable if the client has one.
        run = client._get("/runs/get", {"run_id": run_id})["run"]
        existing_metrics = {
            m["key"]: m["value"]
            for m in run.get("data", {}).get("metrics", [])
        }
        format_ok = float(existing_metrics.get("format_ok", 0.0))
        overlong = metrics.get("overlong", 0.0)
        composite = (
            metrics["relevance"] + metrics["actionability"] + metrics["tone"]
            + 2 * format_ok - overlong
        )
        metrics["composite"] = composite

        client.log_metrics(run_id, metrics)
        client.set_tags(run_id, {
            "judge_pending": "false",
            "judged_at": judged_at,
            "judged_by": "claude-code-session",
        })
        if scores.get("notes"):
            client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000])

        n_applied += 1
        print(f"  [ok] {run_id}: rel={metrics['relevance']:.1f} "
              f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} "
              f"comp={composite:.2f}")

    print(f"Applied {n_applied}, skipped {n_skipped}.")
    return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Parse the command line and run the requested Phase B sub-command.

    ``--experiment`` is required, and exactly one of ``--export PATH`` or
    ``--apply PATH`` must be given. The tracking URL defaults to
    ``$MLFLOW_TRACKING_URI`` or ``http://localhost:5000``. Credentials come
    from ``$MLFLOW_TRACKING_USERNAME`` / ``$MLFLOW_TRACKING_PASSWORD`` and
    fall back to ``admin`` / ``password`` when unset or empty.

    Returns:
        The return value of ``export`` or ``apply``.
    """
    default_url = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")

    cli = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)")
    cli.add_argument("--experiment", required=True)
    cli.add_argument("--mlflow-url", default=default_url)
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--export",
        metavar="PATH",
        help="Write pending runs as a judgment-request JSON file.",
    )
    mode.add_argument(
        "--apply",
        metavar="PATH",
        help="Read filled-in responses and write metrics back to MLflow.",
    )
    args = cli.parse_args()

    username = os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin"
    password = os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password"
    client = MLflowClient(
        tracking_uri=args.mlflow_url,
        username=username,
        password=password,
    )

    # A non-empty --export path selects export; anything else means apply.
    handler, path = (export, args.export) if args.export else (apply, args.apply)
    return handler(client, args.experiment, path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user