feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework
- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy
replaces linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward):
dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id}
with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges,
two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
204
ml/experiments/sim/llm_judge.py
Normal file
204
ml/experiments/sim/llm_judge.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
LLM-based user reaction judge.
|
||||
|
||||
Uses Claude Haiku when ANTHROPIC_API_KEY is set and the anthropic SDK is
importable; otherwise falls back to a persona-based rule judge whose output
is reproducible for a given seeded RNG.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
from personas import Persona
|
||||
|
||||
# The three reactions a simulated user can have to a tip.
ACTIONS = ["done", "snooze", "dismiss"]

# Nominal per-action rewards. These are NOT the final rewards: the reward of
# an event depends on the action plus the simulated dwell time, so use
# infer_reward() to compute the value once the dwell has been simulated.
_BASE_REWARDS: dict[str, float] = dict(
    done=1.0,  # placeholder; the real "done" reward is computed from dwell
    snooze=0.1,
    dismiss=-1.0,
)
|
||||
|
||||
|
||||
def infer_reward(action: str, dwell_ms: int) -> float:
    """Map a user action and its dwell time to a scalar reward.

    Mirror of production inferReward() in recommender.ts.

    "snooze" and "dismiss" have fixed rewards. Every other action is treated
    as "done" and rewarded by how long the user took to act:

        < 15 s       -> -0.3  (stale / reflex done)
        15 s - 2 min -> +1.0  (magic zone)
        2 - 10 min   -> +0.6  (good)
        >= 10 min    -> +0.3  (eventually done)
    """
    if action == "snooze":
        return 0.1
    if action == "dismiss":
        return -1.0

    # "done": walk the dwell tiers in ascending order; each upper bound is
    # exclusive, so the first tier the dwell fits under decides the reward.
    dwell_tiers = (
        (15_000, -0.3),
        (120_000, 1.0),
        (600_000, 0.6),
    )
    for upper_bound_ms, tier_reward in dwell_tiers:
        if dwell_ms < upper_bound_ms:
            return tier_reward
    return 0.3
|
||||
|
||||
# Half-open [start, end) hour ranges and the day-period label for each.
# Hours that fall in none of the ranges are labelled "night" by _period().
_HOUR_PERIODS = {
    (5, 10): "morning",
    (10, 14): "midday",
    (14, 18): "afternoon",
    (18, 22): "evening",
}


def _period(hour: int) -> str:
    """Return the day-period label for *hour*, or "night" if no range matches."""
    matching_labels = (
        label
        for (start, end), label in _HOUR_PERIODS.items()
        if start <= hour < end
    )
    return next(matching_labels, "night")
|
||||
|
||||
|
||||
# ── Deterministic judge ────────────────────────────────────────────────────
|
||||
|
||||
def _engagement_score(persona: Persona, tip: dict, hour: int) -> float:
    """Score how well this tip fits this persona right now.

    Starts from a 0.35 baseline and adjusts it for task priority, overdue
    status and whether *hour* falls in one of the persona's active periods.
    The result is clamped to the range [0.05, 0.90].
    """
    feats = tip.get("features", {})
    priority = feats.get("priority", 1)
    overdue = feats.get("is_overdue", False)

    # Priority is rescaled with (priority - 1) / 3 and centred on 0.5, so the
    # persona's priority preference can push the score either way.
    score = 0.35 + ((priority - 1) / 3.0 - 0.5) * persona.prefers_high_priority * 0.4

    if overdue:
        score += (persona.prefers_overdue - 0.5) * 0.3

    in_morning = 5 <= hour < 10
    in_evening = 18 <= hour < 22
    if (persona.morning_active and in_morning) or (persona.evening_active and in_evening):
        # The tip arrives during one of the persona's active windows.
        score += 0.15
    elif not in_morning and not in_evening and (persona.morning_active or persona.evening_active):
        # The persona has a preferred window but the hour is in neither window.
        score -= 0.10

    return max(0.05, min(0.90, score))
|
||||
|
||||
|
||||
def _simulate_dwell_ms(engagement: float, rng: random.Random) -> int:
    """Draw a simulated dwell time in milliseconds for a tip that gets done.

    The dwell is sampled uniformly (bounds inclusive) from a window chosen
    by the engagement score:

        engagement >= 0.70 -> 15 s - 90 s    (strong match, magic zone)
        engagement >= 0.50 -> 2 - 8 min      (moderate match, good zone)
        otherwise          -> 10 - 30 min    (weak match, done eventually)

    Exactly one value is drawn from *rng* per call.
    """
    # (minimum engagement, window start ms, window end ms), strongest first.
    engagement_windows = (
        (0.70, 15_000, 90_000),
        (0.50, 120_000, 480_000),
    )
    for min_engagement, start_ms, end_ms in engagement_windows:
        if engagement >= min_engagement:
            return rng.randint(start_ms, end_ms)
    return rng.randint(600_000, 1_800_000)
|
||||
|
||||
|
||||
def _rule_judge(persona: Persona, tip: dict, hour: int, rng: random.Random) -> tuple[str, int]:
    """Pick (action, dwell_ms) from persona preferences and task features.

    One uniform draw from *rng* is compared against the engagement score:
    the lowest 55% of the engagement band yields "done", the rest of the band
    yields "snooze", and anything at or above the engagement score yields
    "dismiss". A second draw supplies the dwell time.
    """
    engagement = _engagement_score(persona, tip, hour)
    roll = rng.random()

    if roll < engagement * 0.55:
        # done: the dwell window depends on how engaged the persona is
        return "done", _simulate_dwell_ms(engagement, rng)
    if roll < engagement:
        return "snooze", rng.randint(3_000, 20_000)
    return "dismiss", rng.randint(1_000, 5_000)
|
||||
|
||||
|
||||
# ── LLM judge ─────────────────────────────────────────────────────────────
|
||||
|
||||
# Lazily created Anthropic client, shared by every call to _get_client().
_anthropic_client = None


def _get_client():
    """Return the cached Anthropic client, creating it on first use.

    Returns None when the anthropic package cannot be imported or when
    ANTHROPIC_API_KEY is unset or empty. Nothing is cached in that case, so a
    later call tries again.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client

    try:
        import anthropic  # type: ignore

        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
        if api_key:
            _anthropic_client = anthropic.Anthropic(api_key=api_key)
    except ImportError:
        # SDK not installed: callers treat None as "use the rule judge".
        pass
    return _anthropic_client
|
||||
|
||||
|
||||
def _llm_judge(
    persona: Persona, tip: dict, hour: int, day_of_week: int, rng: random.Random,
) -> tuple[str, int]:
    """Ask Claude Haiku how the persona reacts to a tip; return (action, dwell_ms).

    When no client is available the whole decision is delegated to
    _rule_judge(). When the API call fails or the reply is not one of ACTIONS,
    only the action is taken from _rule_judge(); the dwell is still simulated
    here from the engagement score.

    Args:
        persona: Simulated user; its name and description go into the prompt.
        tip: Recommended task; reads "content" and the "features" sub-dict
            ("priority", "is_overdue", "task_age_days").
        hour: Hour of day used for the prompt and the engagement score.
        day_of_week: Index into Mon..Sun (taken modulo 7) for the prompt.
        rng: Random source for the dwell draw and any rule-judge fallback.

    Returns:
        (action, dwell_ms) where action is one of ACTIONS.
    """
    client = _get_client()
    if client is None:
        return _rule_judge(persona, tip, hour, rng)

    features = tip.get("features", {})
    priority = features.get("priority", 1)
    is_overdue = features.get("is_overdue", False)
    age_days = features.get("task_age_days", 0)

    priority_label = {1: "low", 2: "normal", 3: "high", 4: "urgent"}.get(priority, "normal")
    overdue_str = f", overdue by {age_days:.0f} day(s)" if is_overdue else ""
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    day_str = days[day_of_week % 7]

    prompt = (
        f"You are simulating how a specific user reacts to a task recommendation app.\n\n"
        f"User persona: {persona.name}\n"
        f"Persona: {persona.description}\n\n"
        f'Recommended task: "{tip.get("content", "Unknown task")}"\n'
        f"Task: priority={priority_label}{overdue_str}\n"
        f"Current time: {_period(hour)} ({hour}:00, {day_str})\n\n"
        f"How does this user react? Reply with exactly one word: done | snooze | dismiss\n\n"
        f"- done: acts on this tip (marks task complete)\n"
        f"- snooze: acknowledges but not now\n"
        f"- dismiss: ignores or rejects it"
    )

    action = None
    try:
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}],
        )
        words = message.content[0].text.strip().lower().split()
        # The model may decorate its one-word answer ("Done.", "**snooze**").
        # Strip surrounding punctuation so a valid answer is not thrown away
        # and replaced by a random rule-judge action.
        candidate = words[0].strip(".,;:!?*_`'\"()[]") if words else ""
        if candidate in ACTIONS:
            action = candidate
    except Exception:
        # Deliberate best-effort: any API or response-shape failure must not
        # abort the simulation run; fall through to the rule judge below.
        action = None

    if action is None:
        # Only the action is taken from the rule judge; its dwell is discarded
        # so that the dwell is always simulated the same way below.
        action, _ = _rule_judge(persona, tip, hour, rng)

    # Simulate dwell based on engagement level
    engagement = _engagement_score(persona, tip, hour)
    dwell = _simulate_dwell_ms(engagement, rng) if action == "done" else rng.randint(2_000, 15_000)
    return action, dwell
|
||||
|
||||
|
||||
# ── Public API ─────────────────────────────────────────────────────────────
|
||||
|
||||
def judge(
    persona: Persona,
    tip: dict,
    hour: int,
    day_of_week: int,
    rng: random.Random,
    use_llm: bool = True,
) -> tuple[str, int, float]:
    """Simulate one user reaction and return (action, dwell_ms, reward).

    The LLM judge is used only when *use_llm* is true and ANTHROPIC_API_KEY
    is set to a non-empty value; otherwise the rule judge decides.

    action   — 'done' | 'snooze' | 'dismiss'
    dwell_ms — simulated milliseconds between tip appearance and user action
    reward   — inferred from action + dwell_ms via infer_reward()
    """
    llm_enabled = use_llm and bool(os.environ.get("ANTHROPIC_API_KEY"))
    if llm_enabled:
        outcome = _llm_judge(persona, tip, hour, day_of_week, rng)
    else:
        outcome = _rule_judge(persona, tip, hour, rng)

    action, dwell_ms = outcome
    reward = infer_reward(action, dwell_ms)
    return action, dwell_ms, reward
|
||||
Reference in New Issue
Block a user