# oO/ml/experiments/sim/llm_judge.py

"""
LLM-based user reaction judge.

Uses Claude Haiku when ANTHROPIC_API_KEY is set; otherwise — or when the
`anthropic` package is missing or the API call fails — falls back to a
persona-based rule that is deterministic for a given seeded RNG.
"""

from __future__ import annotations

import os
import random

from personas import Persona

# The three reactions a simulated user can have to a recommended tip.
ACTIONS = ["done", "snooze", "dismiss"]

# Rewards are no longer a fixed action -> value map: the final reward depends
# on both the action and the simulated dwell time. Call infer_reward() once
# the dwell has been simulated to obtain the real value.
_BASE_REWARDS: dict[str, float] = dict(
    done=1.0,  # placeholder; real reward computed from dwell
    snooze=0.1,
    dismiss=-1.0,
)


def infer_reward(action: str, dwell_ms: int) -> float:
    """Convert an action and its dwell time into a scalar reward.

    Mirror of production inferReward() in recommender.ts.

    Args:
        action: 'dismiss' and 'snooze' earn a fixed reward; any other value
            is scored as a 'done' using the dwell time.
        dwell_ms: milliseconds between the tip appearing and the user acting.

    Returns:
        -1.0 for dismiss, 0.1 for snooze, otherwise a dwell-dependent value:
        -0.3 (< 15 s), 1.0 (< 2 min), 0.6 (< 10 min) or 0.3 (10 min and up).
    """
    # Actions whose reward ignores the dwell time entirely.
    for fixed_action, fixed_reward in (("dismiss", -1.0), ("snooze", 0.1)):
        if action == fixed_action:
            return fixed_reward

    # done — walk the exclusive upper bounds in ascending order.
    dwell_tiers = (
        (15_000, -0.3),   # stale / reflex done
        (120_000, 1.0),   # magic zone
        (600_000, 0.6),   # good
    )
    for upper_bound_ms, tier_reward in dwell_tiers:
        if dwell_ms < upper_bound_ms:
            return tier_reward
    return 0.3            # eventually done

# Half-open [start, end) hour ranges mapped to a named part of the day.
# Any hour not covered by a range is reported as "night" by _period().
_HOUR_PERIODS = {
    (5, 10): "morning",
    (10, 14): "midday",
    (14, 18): "afternoon",
    (18, 22): "evening",
}


def _period(hour: int) -> str:
    """Return the part-of-day label for *hour*, or "night" if none matches."""
    matching_labels = (
        label
        for (start, end), label in _HOUR_PERIODS.items()
        if start <= hour < end
    )
    return next(matching_labels, "night")


# ── Deterministic judge ────────────────────────────────────────────────────

def _engagement_score(persona: Persona, tip: dict, hour: int) -> float:
    """Score how well a tip suits the persona at this hour.

    Starts from a 0.35 baseline and shifts it by the tip's priority, its
    overdue flag and whether the hour falls in a window the persona is
    active in. The result is clamped to the range 0.05–0.90.

    Args:
        persona: supplies prefers_high_priority, prefers_overdue,
            morning_active and evening_active.
        tip: dict whose optional "features" sub-dict may carry "priority"
            (defaults to 1) and "is_overdue" (defaults to False).
        hour: hour of the day; 5–9 counts as morning, 18–21 as evening.
    """
    feats = tip.get("features", {})
    prio = feats.get("priority", 1)
    overdue = feats.get("is_overdue", False)

    # Priority 1..4 is rescaled to 0..1 and centred on 0.5, so low priorities
    # pull the score down and high priorities push it up.
    score = 0.35 + ((prio - 1) / 3.0 - 0.5) * persona.prefers_high_priority * 0.4
    if overdue:
        score += (persona.prefers_overdue - 0.5) * 0.3

    in_morning = 5 <= hour < 10
    in_evening = 18 <= hour < 22
    outside_both_windows = not (in_morning or in_evening)
    if persona.morning_active and in_morning:
        score += 0.15
    elif persona.evening_active and in_evening:
        score += 0.15
    elif outside_both_windows and (persona.morning_active or persona.evening_active):
        # A persona with a preferred window is penalised outside both windows.
        score -= 0.10

    capped = min(0.90, score)
    return max(0.05, capped)


def _simulate_dwell_ms(engagement: float, rng: random.Random) -> int:
    """
    Draw a simulated dwell time, in milliseconds, for a tip the user completes.

    The engagement score selects the range the value is sampled from:

    engagement >= 0.70 → 15s–90s  (strong match, magic zone)
    engagement >= 0.50 → 2–8min   (moderate match, good zone)
    otherwise          → 10–30min (weak match, done eventually)

    Exactly one inclusive rng.randint() draw is made per call.
    """
    if engagement >= 0.70:
        shortest_ms, longest_ms = 15_000, 90_000
    elif engagement >= 0.50:
        shortest_ms, longest_ms = 120_000, 480_000
    else:
        shortest_ms, longest_ms = 600_000, 1_800_000
    return rng.randint(shortest_ms, longest_ms)


def _rule_judge(persona: Persona, tip: dict, hour: int, rng: random.Random) -> tuple[str, int]:
    """Pick a reaction and dwell time from persona/tip fit plus one random roll.

    With engagement score e, a uniform roll in [0, 1) maps to:
    'done' below 0.55·e, 'snooze' from there up to e, 'dismiss' otherwise.

    Returns:
        (action, dwell_ms). The dwell for 'done' comes from
        _simulate_dwell_ms(); snooze and dismiss get a short random dwell.
    """
    fit = _engagement_score(persona, tip, hour)
    roll = rng.random()

    if roll < fit * 0.55:
        # done — dwell depends on engagement
        return "done", _simulate_dwell_ms(fit, rng)
    if roll < fit:
        return "snooze", rng.randint(3_000, 20_000)
    return "dismiss", rng.randint(1_000, 5_000)


# ── LLM judge ─────────────────────────────────────────────────────────────

# Client instance cached at module level; stays None until one is created.
_anthropic_client = None


def _get_client():
    """Return the cached Anthropic client, creating it on first success.

    Returns None when the `anthropic` package cannot be imported or when
    ANTHROPIC_API_KEY is unset or empty. Nothing is cached in those cases,
    so the next call tries again.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client

    try:
        import anthropic  # type: ignore
        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
        if api_key:
            _anthropic_client = anthropic.Anthropic(api_key=api_key)
    except ImportError:
        # SDK not installed; callers fall back to the rule-based judge.
        return None
    return _anthropic_client


def _llm_judge(
    persona: Persona, tip: dict, hour: int, day_of_week: int, rng: random.Random,
) -> tuple[str, int]:
    """Ask the LLM how the persona reacts to a tip; return (action, dwell_ms).

    If no client is available the call is delegated entirely to
    _rule_judge(). Otherwise a one-word verdict is requested from the model.
    If the request fails, or the reply does not name one of ACTIONS, the
    action (only) is taken from _rule_judge() instead. The dwell time is
    always simulated locally from the engagement score.

    Args:
        persona: supplies name and description for the prompt, plus the
            preference fields used by _engagement_score().
        tip: dict with optional "content" and "features" entries.
        hour: hour of the day shown to the model and used for engagement.
        day_of_week: index into Mon..Sun, taken modulo 7.
        rng: random source for fallback decisions and dwell simulation.

    Returns:
        (action, dwell_ms) where action is one of ACTIONS.
    """
    client = _get_client()
    if client is None:
        return _rule_judge(persona, tip, hour, rng)

    features = tip.get("features", {})
    priority = features.get("priority", 1)
    is_overdue = features.get("is_overdue", False)
    # NOTE(review): task_age_days is presented to the model as "overdue by N
    # day(s)" — confirm that task age and days overdue are the same quantity.
    age_days = features.get("task_age_days", 0)

    priority_label = {1: "low", 2: "normal", 3: "high", 4: "urgent"}.get(priority, "normal")
    overdue_str = f", overdue by {age_days:.0f} day(s)" if is_overdue else ""
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    day_str = days[day_of_week % 7]

    prompt = (
        f"You are simulating how a specific user reacts to a task recommendation app.\n\n"
        f"User persona: {persona.name}\n"
        f"Persona: {persona.description}\n\n"
        f'Recommended task: "{tip.get("content", "Unknown task")}"\n'
        f"Task: priority={priority_label}{overdue_str}\n"
        f"Current time: {_period(hour)} ({hour}:00, {day_str})\n\n"
        f"How does this user react? Reply with exactly one word: done | snooze | dismiss\n\n"
        f"- done: acts on this tip (marks task complete)\n"
        f"- snooze: acknowledges but not now\n"
        f"- dismiss: ignores or rejects it"
    )

    # Characters a model commonly wraps around a one-word answer
    # ("Done.", "**snooze**", "`dismiss`"). Without stripping them a valid
    # verdict would be discarded and replaced by a random rule-based action.
    wrapper_chars = ".,;:!?\"'`*_()[]{}<>"

    try:
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}],
        )
        # An empty reply raises IndexError here and is handled below.
        first_word = message.content[0].text.strip().lower().split()[0]
        raw = first_word.strip(wrapper_chars)
        action = raw if raw in ACTIONS else _rule_judge(persona, tip, hour, rng)[0]
    except Exception:
        # Deliberate best-effort: any API or parsing failure falls back to the
        # rule-based action so a simulation run is never aborted mid-way.
        action, _ = _rule_judge(persona, tip, hour, rng)

    # Simulate dwell based on engagement level
    engagement = _engagement_score(persona, tip, hour)
    dwell = _simulate_dwell_ms(engagement, rng) if action == "done" else rng.randint(2_000, 15_000)
    return action, dwell


# ── Public API ─────────────────────────────────────────────────────────────

def judge(
    persona: Persona,
    tip: dict,
    hour: int,
    day_of_week: int,
    rng: random.Random,
    use_llm: bool = True,
) -> tuple[str, int, float]:
    """Simulate the persona's reaction to a tip and score it.

    The LLM judge is used only when use_llm is true and ANTHROPIC_API_KEY is
    set to a non-empty value; otherwise the rule-based judge decides.

    Returns:
        (action, dwell_ms, reward)

        action   — 'done' | 'snooze' | 'dismiss'
        dwell_ms — simulated milliseconds between tip appearance and user action
        reward   — inferred from action + dwell_ms via infer_reward()
    """
    llm_enabled = use_llm and os.environ.get("ANTHROPIC_API_KEY")
    if llm_enabled:
        outcome = _llm_judge(persona, tip, hour, day_of_week, rng)
    else:
        outcome = _rule_judge(persona, tip, hour, rng)

    action, dwell_ms = outcome
    reward = infer_reward(action, dwell_ms)
    return action, dwell_ms, reward