feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework

- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy replaces linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward): dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id} with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges, two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:44:37 +00:00
parent c5ea18ec6e
commit faf44c18fc
48 changed files with 6151 additions and 40 deletions
--- a/ml/experiments/sim/llm_judge.py
+++ b/ml/experiments/sim/llm_judge.py
@@ -0,0 +1,204 @@
+"""
+LLM-based user reaction judge.
+
+Uses Claude Haiku when ANTHROPIC_API_KEY is set; falls back to a
+persona-based rule judge (randomised, but reproducible for a seeded rng)
+when it is not.
+"""
+
+from __future__ import annotations
+
+import os
+import random
+
+from personas import Persona
+
+# The closed set of reactions a simulated user can have to a tip.
+ACTIONS = ["done", "snooze", "dismiss"]
+
+# Nominal per-action values only. The reward for a tip is no longer a fixed
+# map: it depends on the action *and* the simulated dwell time, so call
+# infer_reward() for the final number. The "done" entry is a placeholder.
+_BASE_REWARDS: dict[str, float] = dict(done=1.0, snooze=0.1, dismiss=-1.0)
+
+
+def infer_reward(action: str, dwell_ms: int) -> float:
+    """Map a user action and its dwell time to a scalar reward.
+
+    Mirror of production inferReward() in recommender.ts.
+
+    ``dismiss`` and ``snooze`` have fixed rewards (-1.0 and 0.1); any other
+    action is scored as ``done``, using ``dwell_ms`` (milliseconds):
+
+        < 15s      -> -0.3  (stale / reflex done)
+        15s–2min   ->  1.0  (magic zone)
+        2–10min    ->  0.6  (good)
+        >= 10min   ->  0.3  (eventually done)
+    """
+    if action in ("dismiss", "snooze"):
+        return -1.0 if action == "dismiss" else 0.1
+
+    # Upper bounds are exclusive and checked in ascending order.
+    dwell_bands = ((15_000, -0.3), (120_000, 1.0), (600_000, 0.6))
+    for upper_ms, reward in dwell_bands:
+        if dwell_ms < upper_ms:
+            return reward
+    return 0.3
+
+# Half-open [start, end) hour ranges and the label for each; hours not
+# covered by any range are treated as "night".
+_HOUR_PERIODS = {
+    (5, 10): "morning",
+    (10, 14): "midday",
+    (14, 18): "afternoon",
+    (18, 22): "evening",
+}
+
+
+def _period(hour: int) -> str:
+    """Return the part-of-day label for ``hour``, defaulting to "night"."""
+    matches = (
+        label
+        for (start, end), label in _HOUR_PERIODS.items()
+        if start <= hour < end
+    )
+    return next(matches, "night")
+
+
+# ── Deterministic judge ────────────────────────────────────────────────────
+
+def _engagement_score(persona: Persona, tip: dict, hour: int) -> float:
+    """Score how well this tip fits this persona right now.
+
+    Starts from a 0.35 baseline, shifts it by the persona's preference for
+    high-priority and overdue tasks, applies a time-of-day adjustment, and
+    clamps the result to [0.05, 0.90].
+    """
+    feats = tip.get("features", {})
+    priority = feats.get("priority", 1)
+    overdue = feats.get("is_overdue", False)
+
+    score = 0.35
+    # (priority - 1) / 3 puts priorities 1..4 on a 0..1 scale; centring on
+    # 0.5 lets low priorities pull the score down and high ones push it up.
+    priority_norm = (priority - 1) / 3.0
+    score += (priority_norm - 0.5) * persona.prefers_high_priority * 0.4
+    if overdue:
+        score += (persona.prefers_overdue - 0.5) * 0.3
+
+    in_morning = 5 <= hour < 10
+    in_evening = 18 <= hour < 22
+    in_active_window = (persona.morning_active and in_morning) or (
+        persona.evening_active and in_evening
+    )
+    has_active_window = persona.morning_active or persona.evening_active
+    if in_active_window:
+        # The tip arrives during one of the persona's active windows.
+        score += 0.15
+    elif has_active_window and not (in_morning or in_evening):
+        # The persona has an active window, but it is neither morning nor evening.
+        score -= 0.10
+
+    return max(0.05, min(0.90, score))
+
+
+def _simulate_dwell_ms(engagement: float, rng: random.Random) -> int:
+    """Draw a simulated dwell time (ms) for a tip the user acts on.
+
+    The range is picked from the engagement score and sampled with a single
+    ``rng.randint`` call (both bounds inclusive):
+
+        engagement >= 0.70  -> 15s–90s    (strong match, magic zone)
+        engagement >= 0.50  -> 2–8min     (moderate match, good zone)
+        otherwise           -> 10–30min   (weak match, eventually done)
+
+    The action is not an input here, so this function cannot special-case
+    snooze/dismiss; dwell does not affect the reward for those actions.
+    """
+    if engagement >= 0.70:
+        low_ms, high_ms = 15_000, 90_000
+    elif engagement >= 0.50:
+        low_ms, high_ms = 120_000, 480_000
+    else:
+        low_ms, high_ms = 600_000, 1_800_000
+    return rng.randint(low_ms, high_ms)
+
+
+def _rule_judge(persona: Persona, tip: dict, hour: int, rng: random.Random) -> tuple[str, int]:
+    """Pick (action, dwell_ms) from persona preferences and task features.
+
+    One draw from ``rng.random()`` is compared against the engagement score
+    ``e``: below ``0.55 * e`` the user is "done", below ``e`` they "snooze",
+    otherwise they "dismiss". A second draw supplies the dwell time.
+    """
+    engagement = _engagement_score(persona, tip, hour)
+    roll = rng.random()
+
+    if roll < engagement * 0.55:
+        # "done": the dwell range depends on how engaged the persona is.
+        return "done", _simulate_dwell_ms(engagement, rng)
+    if roll < engagement:
+        return "snooze", rng.randint(3_000, 20_000)
+    return "dismiss", rng.randint(1_000, 5_000)
+
+
+# ── LLM judge ─────────────────────────────────────────────────────────────
+
+# Lazily created Anthropic client, shared by every call in this process.
+_anthropic_client = None
+
+
+def _get_client():
+    """Return the cached Anthropic client, creating it on first use.
+
+    Returns None when the ``anthropic`` package cannot be imported or when
+    ANTHROPIC_API_KEY is unset or empty; in both cases nothing is cached, so
+    the next call tries again.
+    """
+    global _anthropic_client
+    if _anthropic_client is not None:
+        return _anthropic_client
+
+    try:
+        import anthropic  # type: ignore
+        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+        if api_key:
+            _anthropic_client = anthropic.Anthropic(api_key=api_key)
+    except ImportError:
+        # SDK not installed: callers treat None as "use the rule judge".
+        return None
+    return _anthropic_client
+
+
+def _llm_judge(
+    persona: Persona, tip: dict, hour: int, day_of_week: int, rng: random.Random,
+) -> tuple[str, int]:
+    """Ask the LLM how the persona reacts to the tip; return (action, dwell_ms).
+
+    Falls back to ``_rule_judge`` for the whole result when no client is
+    available, and for the action alone when the API call fails or the reply
+    is not one of ``ACTIONS``. The dwell time is always simulated locally
+    from the engagement score; the model only chooses the action.
+    """
+    import logging
+
+    client = _get_client()
+    if client is None:
+        return _rule_judge(persona, tip, hour, rng)
+
+    features = tip.get("features", {})
+    priority = features.get("priority", 1)
+    is_overdue = features.get("is_overdue", False)
+    age_days = features.get("task_age_days", 0)
+
+    priority_label = {1: "low", 2: "normal", 3: "high", 4: "urgent"}.get(priority, "normal")
+    overdue_str = f", overdue by {age_days:.0f} day(s)" if is_overdue else ""
+    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+    day_str = days[day_of_week % 7]
+
+    prompt = (
+        f"You are simulating how a specific user reacts to a task recommendation app.\n\n"
+        f"User persona: {persona.name}\n"
+        f"Persona: {persona.description}\n\n"
+        f'Recommended task: "{tip.get("content", "Unknown task")}"\n'
+        f"Task: priority={priority_label}{overdue_str}\n"
+        f"Current time: {_period(hour)} ({hour}:00, {day_str})\n\n"
+        f"How does this user react? Reply with exactly one word: done | snooze | dismiss\n\n"
+        f"- done: acts on this tip (marks task complete)\n"
+        f"- snooze: acknowledges but not now\n"
+        f"- dismiss: ignores or rejects it"
+    )
+
+    action = None
+    try:
+        message = client.messages.create(
+            model="claude-haiku-4-5-20251001",
+            max_tokens=10,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        words = message.content[0].text.strip().lower().split()
+        # A reply such as "Done." or "**done**" is still a valid answer:
+        # keep only the letters of the first word before validating it.
+        candidate = "".join(ch for ch in words[0] if ch.isalpha()) if words else ""
+        if candidate in ACTIONS:
+            action = candidate
+    except Exception:
+        # Best-effort: the simulation must keep running, but a failing API
+        # call should be visible instead of silently becoming a rule-judged
+        # result.
+        logging.getLogger(__name__).warning(
+            "LLM judge call failed; falling back to rule judge", exc_info=True
+        )
+
+    if action is None:
+        # Unusable or missing reply: take the action from the rule judge.
+        action, _ = _rule_judge(persona, tip, hour, rng)
+
+    # Simulate dwell based on engagement level
+    engagement = _engagement_score(persona, tip, hour)
+    dwell = _simulate_dwell_ms(engagement, rng) if action == "done" else rng.randint(2_000, 15_000)
+    return action, dwell
+
+
+# ── Public API ─────────────────────────────────────────────────────────────
+
+def judge(
+    persona: Persona,
+    tip: dict,
+    hour: int,
+    day_of_week: int,
+    rng: random.Random,
+    use_llm: bool = True,
+) -> tuple[str, int, float]:
+    """Simulate one user reaction and return (action, dwell_ms, reward).
+
+    The LLM judge is used only when ``use_llm`` is true and
+    ANTHROPIC_API_KEY is set to a non-empty value; otherwise the rule judge
+    decides.
+
+    action   — 'done' | 'snooze' | 'dismiss'
+    dwell_ms — simulated milliseconds between tip appearance and user action
+    reward   — inferred from action + dwell_ms via infer_reward()
+    """
+    llm_enabled = use_llm and bool(os.environ.get("ANTHROPIC_API_KEY"))
+    if not llm_enabled:
+        action, dwell_ms = _rule_judge(persona, tip, hour, rng)
+    else:
+        action, dwell_ms = _llm_judge(persona, tip, hour, day_of_week, rng)
+
+    reward = infer_reward(action, dwell_ms)
+    return action, dwell_ms, reward