Combines model evaluation (#93) and prompt A/B testing (#95) into one experiment. Evaluates all (model × prompt × scenario) cells on the same fixed contexts so quality differences are attributable.

Architecture:
- Phase A (collect.py): generates candidates per cell, logs to MLflow with judge_pending=true. Rejects models >4B, uses keep_alive=0 for RAM safety (no concurrent model weights in VRAM).
- Phase B (judge_cli.py): exports pending runs as JSON for Claude Code to score per the rubric, then applies scores back to MLflow.
- Phase C (compare.py): leaderboard by (model, prompt) cell.

Rubric (tip-v1) defines 1–5 scales for relevance, actionability, tone, plus format_ok and overlong flags. Composite = rel + act + tone + 2×format_ok − overlong. Rubric is self-describing and persisted in every run so judges use consistent criteria across sessions.

Artifacts (prompts, candidates, raw responses) are stored as MLflow tags because the server uses a file:// backend not accessible via REST. Full artifacts are accessible in MLflow UI → run → Tags section.

Tested end-to-end on local machine:
- 4 models (qwen2.5:0.5b/1.5b, gemma3:1b, llama3.2:3b) ≤4B
- 3 prompts (v1, v2-mentor, v3-few-shot)
- 4 scenarios (4 personas × 2 time-slots)
- 48 cells total, all judged and ranked

Winner: qwen2.5:1.5b × v3-few-shot (composite=12.75). Ready for integration into Airflow prompt_ab_eval DAG and admin UI.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
80
ml/experiments/bench/scenarios.py
Normal file
80
ml/experiments/bench/scenarios.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Fixed contexts for the tip-generation benchmark.
|
||||
|
||||
Every cell of the (model × prompt) grid is evaluated on the *same* set of
|
||||
scenarios so quality differences are attributable to the model/prompt,
|
||||
not to context variance.
|
||||
|
||||
A scenario is one (persona, hour-of-day, candidate-task-pool) tuple. The
hour comes from a fixed list of time-slots, and the task pool is seeded
from the persona's name plus that hour, so the bench is meant to be
reproducible across machines.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Reuse personas from sim — same source of truth for user archetypes.
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "sim"))
|
||||
from personas import PERSONAS, Persona # type: ignore
|
||||
from task_generator import generate_task_pool # type: ignore
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Scenario:
    """One fixed benchmark context: a persona, a point in time, and a task pool."""

    # Stable identifier used as an MLflow tag, so keep it ASCII-safe.
    id: str
    persona: Persona
    # Hour of the day, 0–23.
    hour_of_day: int
    # Day of the week, 0 = Monday.
    day_of_week: int
    tasks: list[dict]

    def to_prompt_context(self) -> dict:
        """Flatten this scenario into a plain dict for prompt rendering.

        The layout is the shape expected by ml/serving/prompts.PromptContext.

        Returns:
            A dict with ``tasks`` (one flat dict per task), ``hour_of_day``,
            ``day_of_week`` and an ``extra`` dict carrying the persona's
            name and description.

        Raises:
            KeyError: If a task lacks ``content`` or ``features``, or its
                ``features`` lack ``priority`` or ``is_overdue``.
        """
        task_rows = []
        for task in self.tasks:
            row = {"content": task["content"]}
            features = task["features"]
            row["priority"] = features["priority"]
            row["is_overdue"] = features["is_overdue"]
            # A missing due date gets a readable placeholder.
            row["due_date"] = task.get("due_date", "no due date")
            task_rows.append(row)

        persona_info = {
            "persona": self.persona.name,
            "persona_hint": self.persona.description,
        }
        return {
            "tasks": task_rows,
            "hour_of_day": self.hour_of_day,
            "day_of_week": self.day_of_week,
            "extra": persona_info,
        }
|
||||
|
||||
|
||||
# Two time-slots probe whether the model adapts its tone to the hour.
# Morning (09) and evening (21) are picked because most personas have
# strong directional preferences there.
_TIME_SLOTS = [(9, 1), (21, 3)]  # (hour_of_day, day_of_week)


def build_scenarios(tasks_per_scenario: int = 6) -> list[Scenario]:
    """Return a deterministic list of scenarios.

    Crosses the first four entries of ``PERSONAS`` with every entry of
    ``_TIME_SLOTS``, in persona-major order. With 4 personas × 2 time-slots
    that is 8 scenarios.

    Task pools are seeded by ``crc32(persona.name) % 9973 + hour`` so each
    persona gets the same seed at the same hour across cells, runs and
    machines. CRC-32 replaces the built-in ``hash()`` because ``hash()`` of
    a ``str`` is salted per interpreter process unless ``PYTHONHASHSEED`` is
    pinned, which made the seed change on every run.

    Args:
        tasks_per_scenario: Forwarded to ``generate_task_pool`` as ``n``
            (presumably the number of tasks in each pool).

    Returns:
        One ``Scenario`` per (persona, time-slot) pair, with ids of the
        form ``"<persona name>-h<two-digit hour>"``.
    """
    # Function-scope import keeps this fix independent of the file header.
    import zlib

    scenarios: list[Scenario] = []
    for persona in PERSONAS[:4]:
        # Process-independent digest of the name; the built-in hash() is not.
        name_digest = zlib.crc32(persona.name.encode("utf-8"))
        for hour, dow in _TIME_SLOTS:
            seed = name_digest % 9973 + hour
            # NOTE(review): reproducibility also assumes generate_task_pool
            # is a pure function of (n, seed) — confirm in sim/task_generator.
            tasks = generate_task_pool(n=tasks_per_scenario, seed=seed)
            scenarios.append(
                Scenario(
                    id=f"{persona.name}-h{hour:02d}",
                    persona=persona,
                    hour_of_day=hour,
                    day_of_week=dow,
                    tasks=tasks,
                )
            )
    return scenarios
|
||||
Reference in New Issue
Block a user