Combines model evaluation (#93) and prompt A/B testing (#95) into one experiment.
Evaluates all (model × prompt × scenario) cells on the same fixed contexts so
quality differences are attributable.

Architecture:
- Phase A (collect.py): generates candidates per cell, logs to MLflow with
  judge_pending=true. Rejects models >4B, uses keep_alive=0 for RAM safety
  (no concurrent model weights in VRAM).
- Phase B (judge_cli.py): exports pending runs as JSON for Claude Code to score
  per the rubric, then applies scores back to MLflow.
- Phase C (compare.py): leaderboard by (model, prompt) cell.

Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone, plus
format_ok and overlong flags. Composite = rel + act + tone + 2×format_ok − overlong.
Rubric is self-describing and persisted in every run so judges use consistent
criteria across sessions.

Artifacts (prompts, candidates, raw responses) are stored as MLflow tags because
the server uses a file:// backend not accessible via REST. Full artifacts are
accessible in MLflow UI → run → Tags section.

Tested end-to-end on local machine:
- 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B
- 3 prompts (v1, v2-mentor, v3-few-shot)
- 4 scenarios (4 personas × 2 time-slots)
- 48 cells total, all judged and ranked

Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75).
Ready for integration into Airflow prompt_ab_eval DAG and admin UI.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
192 lines
7.1 KiB
Python
192 lines
7.1 KiB
Python
"""Phase B — Claude Code as the lazy MLflow judge.

Two sub-commands, both keyed to MLflow tags so the same run cycles
through ``judge_pending=true`` → judged → ``judge_pending=false`` exactly
once.

--export PATH
    Pull every run with ``judge_pending=true`` and ``judge_kind=claude-code``
    from the experiment, bundle the prompt + parsed candidates + the
    rubric into a single JSON file the Claude Code session can read.

--apply PATH
    Read the responses (same shape as the request, with ``scores`` filled in)
    and log ``relevance``, ``actionability``, ``tone``, ``overlong`` and the
    derived ``composite`` as MLflow metrics on the corresponding runs. Sets
    ``judge_pending=false`` and stamps ``judged_at`` / ``judged_by`` so the
    run won't be picked up twice.

The request file is intentionally one big JSON document, so the human
judge sees the full set in one place and can score consistently.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
# Directory that contains this script; rubric.md is read from here during export.
_BENCH = Path(__file__).resolve().parent
|
||
# Put this script's directory first on the import path — presumably so the
# project-local ``mlflow_client`` import that follows resolves when the file
# is run directly as a script.
sys.path.insert(0, str(_BENCH))
|
||
from mlflow_client import MLflowClient # type: ignore
|
||
|
||
|
||
# Judge-scored rubric dimensions: apply() skips an item unless all of them are
# filled in, and logs each one as a float metric.
_DIMENSIONS = ("relevance", "actionability", "tone")
|
||
# Optional 0/1 flags: apply() logs one only when the judge supplied a value.
_BIN_FLAGS = ("overlong",)
|
||
|
||
|
||
def _tags_dict(run: dict) -> dict[str, str]:
    """Collapse a run's list of ``{"key": ..., "value": ...}`` tags into a dict.

    A run without a ``data`` section, or without ``tags`` inside it, yields an
    empty dict. When a key appears more than once, the last entry wins.
    """
    tag_entries = run.get("data", {}).get("tags", [])
    flattened: dict[str, str] = {}
    for entry in tag_entries:
        flattened[entry["key"]] = entry["value"]
    return flattened
|
||
|
||
|
||
def _params_dict(run: dict) -> dict[str, str]:
    """Turn a run's ``data.params`` entry list into a ``key -> value`` mapping.

    Returns an empty dict when the run carries no ``data`` section or no
    ``params`` list. Later entries overwrite earlier ones with the same key.
    """
    param_entries = run.get("data", {}).get("params", [])
    return dict((entry["key"], entry["value"]) for entry in param_entries)
|
||
|
||
|
||
def export(client: MLflowClient, experiment: str, out_path: str) -> int:
    """Write every run awaiting a Claude Code judgment to a JSON request file.

    Searches *experiment* for runs tagged ``judge_pending='true'`` and
    ``judge_kind='claude-code'``. For each run it bundles the identifying
    params/tags, the ``prompt.txt`` and ``candidates.json`` artifact text and
    an empty ``scores`` slot; the text of ``rubric.md`` (read from this
    script's directory) is embedded once at the top level.

    Args:
        client: MLflow client used to resolve the experiment, search runs and
            fetch artifact text.
        experiment: Experiment name, passed to ``get_or_create_experiment``.
        out_path: Destination of the request file; it is overwritten.

    Returns:
        Always ``0``. When nothing is pending, a file with an empty ``items``
        list (and no ``rubric_md`` key) is still written.

    Raises:
        OSError: If ``rubric.md`` cannot be read or *out_path* cannot be
            written.
        ValueError: If an ``hour_of_day`` / ``day_of_week`` value is not an
            integer literal.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    exp_id = client.get_or_create_experiment(experiment)
    runs = client.search_runs(
        exp_id,
        filter_string="tags.judge_pending = 'true' and tags.judge_kind = 'claude-code'",
    )
    if not runs:
        print("No pending runs.")
        empty_request = {
            "experiment": experiment,
            "exported_at": time.strftime(stamp_format, time.gmtime()),
            "rubric": "tip-v1",
            "items": [],
        }
        # Explicit UTF-8: apply() reads request/response files as UTF-8, so
        # the write must not depend on the platform's locale encoding.
        Path(out_path).write_text(json.dumps(empty_request, indent=2), encoding="utf-8")
        return 0

    rubric_text = (_BENCH / "rubric.md").read_text(encoding="utf-8")

    items: list[dict] = []
    for run in runs:
        run_id = run["info"]["run_id"]
        tags = _tags_dict(run)
        params = _params_dict(run)
        candidates_json = client.get_artifact_text(run_id, "candidates.json")
        prompt_text = client.get_artifact_text(run_id, "prompt.txt")
        try:
            candidates = json.loads(candidates_json) if candidates_json else []
        except json.JSONDecodeError as err:
            # Still export the run (best effort), but make the data problem
            # visible instead of silently handing the judge zero candidates.
            print(
                f"  [warn] {run_id}: candidates.json is not valid JSON ({err}); "
                "exporting an empty candidate list",
                file=sys.stderr,
            )
            candidates = []

        items.append({
            "run_id": run_id,
            "model": params.get("model") or tags.get("model"),
            "prompt_version": params.get("prompt_version") or tags.get("prompt_version"),
            "scenario_id": params.get("scenario_id") or tags.get("scenario_id"),
            "persona": params.get("persona") or tags.get("persona"),
            # Same params-then-tags lookup as the fields above, with the
            # original defaults when neither carries a value.
            "hour_of_day": int(params.get("hour_of_day") or tags.get("hour_of_day") or "12"),
            "day_of_week": int(params.get("day_of_week") or tags.get("day_of_week") or "0"),
            "prompt": prompt_text,
            "candidates": candidates,
            # Per-run scoring slot — judge fills these in.
            "scores": {
                "relevance": None,       # 1–5, integer
                "actionability": None,   # 1–5, integer
                "tone": None,            # 1–5, integer
                "overlong": None,        # 0/1
                "notes": "",             # short comment, optional
            },
        })

    out = {
        "experiment": experiment,
        "exported_at": time.strftime(stamp_format, time.gmtime()),
        "rubric": "tip-v1",
        "rubric_md": rubric_text,
        "items": items,
    }
    # ensure_ascii=False emits raw non-ASCII characters (prompts, rubric), so
    # the encoding has to be pinned; the locale default may be unable to
    # represent them or may differ from the UTF-8 that apply() expects.
    Path(out_path).write_text(
        json.dumps(out, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"Exported {len(items)} pending runs → {out_path}")
    return 0
|
||
|
||
|
||
def _parse_judge_scores(scores: dict) -> dict[str, float]:
    """Validate one item's ``scores`` and return the metrics to log.

    Every name in ``_DIMENSIONS`` must be a number on the rubric's 1–5 scale.
    Each name in ``_BIN_FLAGS`` is optional and is normalised to ``0.0`` or
    ``1.0`` when present.

    Raises:
        ValueError: If a dimension is not numeric or lies outside 1–5, or if
            a supplied flag cannot be read as an integer.
    """
    metrics: dict[str, float] = {}
    for dim in _DIMENSIONS:
        raw = scores[dim]
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"{dim}={raw!r} is not a number") from None
        # The chained comparison is also False for NaN, so NaN is rejected.
        if not 1.0 <= value <= 5.0:
            raise ValueError(f"{dim}={raw!r} is outside the 1-5 scale")
        metrics[dim] = value
    for flag in _BIN_FLAGS:
        raw = scores.get(flag)
        if raw in (None, ""):
            continue
        try:
            metrics[flag] = float(int(bool(int(raw))))
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"{flag}={raw!r} is not a 0/1 flag") from None
    return metrics


def apply(client: MLflowClient, experiment: str, in_path: str) -> int:
    """Log the judge's scores from a response file back onto the MLflow runs.

    For every item whose ``scores`` are complete and valid, this logs the
    rubric dimensions, any supplied binary flag and the derived ``composite``
    as metrics, then tags the run ``judge_pending=false`` with ``judged_at``
    and ``judged_by``. A non-empty ``notes`` value is stored, truncated to
    1000 characters, in the ``judge_notes`` tag.

    Items are skipped (and counted) rather than aborting the batch when they
    have no ``run_id``, are missing a dimension, or carry a score that is not
    numeric or not on the 1–5 scale.

    Args:
        client: MLflow client used to fetch each run and write metrics/tags.
        experiment: Experiment name, passed to ``get_or_create_experiment``.
        in_path: Path of the UTF-8 JSON response file.

    Returns:
        Always ``0``, including when items were skipped.

    Raises:
        OSError: If *in_path* cannot be read.
        json.JSONDecodeError: If *in_path* does not contain valid JSON.
    """
    # Call kept for its side effect (the experiment is looked up or created);
    # the returned id is not needed because each item names its run directly.
    client.get_or_create_experiment(experiment)
    payload = json.loads(Path(in_path).read_text(encoding="utf-8"))
    items = payload.get("items", [])
    if not items:
        print("No items in response file.")
        return 0

    judged_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    n_applied, n_skipped = 0, 0
    for item in items:
        run_id = item.get("run_id") if isinstance(item, dict) else None
        if not run_id:
            # Without a run id there is nothing to write to; skip the item
            # instead of raising after earlier runs were already updated.
            print("  [skip] item without run_id")
            n_skipped += 1
            continue
        scores = item.get("scores") or {}

        missing = [d for d in _DIMENSIONS if scores.get(d) in (None, "")]
        if missing:
            print(f"  [skip] {run_id}: missing {missing}")
            n_skipped += 1
            continue

        try:
            metrics = _parse_judge_scores(scores)
        except ValueError as err:
            print(f"  [skip] {run_id}: invalid score ({err})")
            n_skipped += 1
            continue

        # Composite mirrors rubric.md: relevance + actionability + tone
        # + 2 * format_ok - overlong. format_ok is already a metric on
        # the run from collect.py; re-fetching is cheap and keeps this
        # script idempotent if format compliance was retroactively fixed.
        run = client._get("/runs/get", {"run_id": run_id})["run"]
        existing_metrics = {m["key"]: m["value"] for m in run["data"].get("metrics", [])}
        format_ok = float(existing_metrics.get("format_ok", 0.0))
        overlong = metrics.get("overlong", 0.0)
        composite = (
            metrics["relevance"] + metrics["actionability"] + metrics["tone"]
            + 2 * format_ok - overlong
        )
        metrics["composite"] = composite

        client.log_metrics(run_id, metrics)
        client.set_tags(run_id, {
            "judge_pending": "false",
            "judged_at": judged_at,
            "judged_by": "claude-code-session",
        })
        if scores.get("notes"):
            client.set_tag(run_id, "judge_notes", str(scores["notes"])[:1000])

        n_applied += 1
        print(f"  [ok]   {run_id}: rel={metrics['relevance']:.1f} "
              f"act={metrics['actionability']:.1f} tone={metrics['tone']:.1f} "
              f"comp={composite:.2f}")

    print(f"Applied {n_applied}, skipped {n_skipped}.")
    return 0
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the ``--export`` or ``--apply`` step.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list allows the CLI to be driven from code.

    Returns:
        The exit status returned by ``export`` or ``apply``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(description="oO bench — Phase B (Claude Code judge)")
    parser.add_argument("--experiment", required=True)
    parser.add_argument(
        "--mlflow-url",
        default=os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"),
    )
    grp = parser.add_mutually_exclusive_group(required=True)
    grp.add_argument("--export", metavar="PATH",
                     help="Write pending runs as a judgment-request JSON file.")
    grp.add_argument("--apply", metavar="PATH",
                     help="Read filled-in responses and write metrics back to MLflow.")
    args = parser.parse_args(argv)

    # NOTE(review): unset or empty credential variables fall back to
    # admin/password — confirm this is only intended for local setups.
    client = MLflowClient(
        tracking_uri=args.mlflow_url,
        username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin",
        password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password",
    )
    # Test against None, not truthiness: `--export ""` is still an export
    # request and must not fall through to apply() with a None path.
    if args.export is not None:
        return export(client, args.experiment, args.export)
    return apply(client, args.experiment, args.apply)
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)
|