# oO/ml/experiments/bench/scenarios.py

"""Fixed contexts for the tip-generation benchmark.

Every cell of the (model × prompt) grid is evaluated on the *same* set of
scenarios so quality differences are attributable to the model/prompt,
not to context variance.

A scenario is one (persona, hour-of-day, candidate-task-pool) tuple. The
hour comes from a fixed list of time slots, and the task pool is seeded
from the persona's name and that hour, so the bench is reproducible
across machines.
"""

from __future__ import annotations

import sys
import zlib
from dataclasses import dataclass
from pathlib import Path

# Reuse personas from sim — same source of truth for user archetypes.
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "sim"))
from personas import PERSONAS, Persona  # type: ignore
from task_generator import generate_task_pool  # type: ignore


@dataclass(frozen=True)
class Scenario:
    """One fixed benchmark context: a persona, a time slot and a task pool."""

    # Stable identifier, used as an MLflow tag, so it must stay ASCII safe.
    id: str
    persona: Persona
    hour_of_day: int  # 0–23
    day_of_week: int  # 0=Mon
    tasks: list[dict]

    def to_prompt_context(self) -> dict:
        """Return this scenario in the shape of ml/serving/prompts.PromptContext.

        Each task is reduced to its content, priority, overdue flag and due
        date; a task without a ``due_date`` key gets the placeholder string
        ``"no due date"``. The persona's name and description are passed
        along under ``extra``.
        """
        task_rows = []
        for task in self.tasks:
            content = task["content"]
            features = task["features"]
            priority = features["priority"]
            overdue = features["is_overdue"]
            due_date = task.get("due_date", "no due date")
            task_rows.append(
                {
                    "content": content,
                    "priority": priority,
                    "is_overdue": overdue,
                    "due_date": due_date,
                }
            )

        context = {
            "tasks": task_rows,
            "hour_of_day": self.hour_of_day,
            "day_of_week": self.day_of_week,
        }
        context["extra"] = {
            "persona": self.persona.name,
            "persona_hint": self.persona.description,
        }
        return context


# Two time-slots probe whether the model adapts its tone to the hour.
# Morning (09) and evening (21) are picked because most personas have
# strong directional preferences there.
# Each entry is (hour_of_day, day_of_week) with day_of_week 0=Mon, so the
# morning slot falls on a Tuesday and the evening slot on a Thursday.
# build_scenarios() pairs every selected persona with every slot listed here.
_TIME_SLOTS = [(9, 1), (21, 3)]   # (hour_of_day, day_of_week)


def build_scenarios(tasks_per_scenario: int = 6) -> list[Scenario]:
    """Return a deterministic list of scenarios.

    One scenario is built for each of the first four ``PERSONAS`` at each
    entry of ``_TIME_SLOTS`` (4 personas × 2 time-slots = 8 scenarios).

    Task pools are seeded by ``crc32(persona.name) % 9973 + hour`` so runs
    are reproducible and each persona sees the same tasks at the same hour
    across cells. A CRC is used rather than the built-in ``hash()`` because
    string hashes are salted per interpreter process (unless
    ``PYTHONHASHSEED`` is pinned), which would give different task pools on
    every run.

    Args:
        tasks_per_scenario: Number of tasks requested from
            ``generate_task_pool`` for each scenario.

    Returns:
        The scenarios in persona-major order, time slots in the order they
        appear in ``_TIME_SLOTS``.
    """
    out: list[Scenario] = []
    for persona in PERSONAS[:4]:
        # Process-independent per-persona seed component; computed once per
        # persona because it does not depend on the time slot.
        name_seed = zlib.crc32(persona.name.encode("utf-8")) % 9973
        for hour, dow in _TIME_SLOTS:
            tasks = generate_task_pool(n=tasks_per_scenario, seed=name_seed + hour)
            out.append(
                Scenario(
                    id=f"{persona.name}-h{hour:02d}",
                    persona=persona,
                    hour_of_day=hour,
                    day_of_week=dow,
                    tasks=tasks,
                )
            )
    return out