Files
oO/ml/experiments/sim/llm_judge.py
alvis faf44c18fc feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework
- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy
  replaces linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward):
  dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id}
  with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges,
  two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:44:37 +00:00

205 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
LLM-based user reaction judge.
Uses Claude Haiku when ANTHROPIC_API_KEY is set; falls back to a
deterministic persona-based rule when it is not.
"""
from __future__ import annotations
import os
import random
from personas import Persona
# The three reactions a simulated user can have to a recommended tip.
ACTIONS = ["done", "snooze", "dismiss"]

# Reward is NOT a fixed map anymore — it depends on action + simulated dwell time.
# Use infer_reward() to compute the final reward after simulating dwell.
# The table below only records the nominal per-action values; the "done"
# entry in particular is superseded by the dwell-based tiers in infer_reward().
_BASE_REWARDS: dict[str, float] = {
    "done": 1.0,  # placeholder; real reward computed from dwell
    "snooze": 0.1,
    "dismiss": -1.0,
}
def infer_reward(action: str, dwell_ms: int) -> float:
    """Translate a user action plus its dwell time into a scalar reward.

    Mirror of production inferReward() in recommender.ts.

    ``dismiss`` and ``snooze`` carry fixed rewards (-1.0 and +0.1). Every
    other action is scored as ``done`` according to how long the user took:

    * under 15 s        -> -0.3 (stale / reflex done)
    * 15 s to < 2 min   -> +1.0 (magic zone)
    * 2 min to < 10 min -> +0.6 (good)
    * 10 min or longer  -> +0.3 (eventually done)
    """
    if action == "dismiss":
        return -1.0
    if action == "snooze":
        return 0.1

    # (exclusive upper bound in ms, reward) tiers, checked in ascending order.
    dwell_tiers = (
        (15_000, -0.3),
        (120_000, 1.0),
        (600_000, 0.6),
    )
    for upper_ms, tier_reward in dwell_tiers:
        if dwell_ms < upper_ms:
            return tier_reward
    return 0.3
# Half-open [start, end) hour ranges mapped to a part-of-day label.
# Hours that fall in none of these ranges are reported as "night".
_HOUR_PERIODS = {
    (5, 10): "morning",
    (10, 14): "midday",
    (14, 18): "afternoon",
    (18, 22): "evening",
}


def _period(hour: int) -> str:
    """Return the part-of-day label for ``hour``, defaulting to "night"."""
    matches = (
        label
        for (start, end), label in _HOUR_PERIODS.items()
        if start <= hour < end
    )
    return next(matches, "night")
# ── Deterministic judge ────────────────────────────────────────────────────
def _engagement_score(persona: Persona, tip: dict, hour: int) -> float:
    """Score how well this tip fits this persona right now.

    Starts from a base of 0.35 and adjusts for the tip's priority, whether it
    is overdue, and whether ``hour`` falls in the persona's active window
    (morning = 05:00-09:59, evening = 18:00-21:59). The result is clamped to
    the range 0.05 to 0.90.

    Reads ``tip["features"]["priority"]`` (default 1) and
    ``tip["features"]["is_overdue"]`` (default False).
    """
    feats = tip.get("features", {})
    level = feats.get("priority", 1)
    overdue = feats.get("is_overdue", False)

    score = 0.35

    # Priority is rescaled with (level - 1) / 3 and centred on 0.5, so the
    # adjustment is negative below the midpoint and positive above it.
    level_norm = (level - 1) / 3.0
    score += (level_norm - 0.5) * persona.prefers_high_priority * 0.4

    if overdue:
        score += (persona.prefers_overdue - 0.5) * 0.3

    in_morning = 5 <= hour < 10
    in_evening = 18 <= hour < 22
    off_peak = not (in_morning or in_evening)

    if persona.morning_active and in_morning:
        score += 0.15
    elif persona.evening_active and in_evening:
        score += 0.15
    elif off_peak and (persona.morning_active or persona.evening_active):
        # A persona with an active window is less receptive outside both
        # the morning and evening windows.
        score -= 0.10

    return max(0.05, min(0.90, score))
def _simulate_dwell_ms(engagement: float, rng: random.Random) -> int:
    """Simulate how many milliseconds the user takes to complete a tip.

    The dwell is drawn uniformly (inclusive bounds) from a band chosen by
    ``engagement``:

    * engagement >= 0.70 -> 15 s to 90 s   (strong match, "magic zone")
    * engagement >= 0.50 -> 2 min to 8 min (moderate match, "good" zone)
    * otherwise          -> 10 min to 30 min (weak match, done eventually)

    This function does not look at the action; callers use it only for
    "done" and pick their own short dwell for snooze/dismiss.
    """
    if engagement >= 0.70:
        low_ms, high_ms = 15_000, 90_000
    elif engagement >= 0.50:
        low_ms, high_ms = 120_000, 480_000
    else:
        low_ms, high_ms = 600_000, 1_800_000
    return rng.randint(low_ms, high_ms)
def _rule_judge(persona: Persona, tip: dict, hour: int, rng: random.Random) -> tuple[str, int]:
    """Pick (action, dwell_ms) from persona preferences and task features.

    A single uniform draw from ``rng`` is compared against the engagement
    score: the lowest 55% of the engagement mass means "done", the rest of
    it means "snooze", and anything at or above the engagement score means
    "dismiss". Dwell for "done" comes from _simulate_dwell_ms(); snooze and
    dismiss get short dwell values drawn directly from ``rng``.
    """
    fit = _engagement_score(persona, tip, hour)
    draw = rng.random()
    done_cutoff = fit * 0.55

    if draw < done_cutoff:
        # done — dwell depends on engagement
        return "done", _simulate_dwell_ms(fit, rng)
    if draw < fit:
        return "snooze", rng.randint(3_000, 20_000)
    return "dismiss", rng.randint(1_000, 5_000)
# ── LLM judge ─────────────────────────────────────────────────────────────
# Lazily created Anthropic client, cached for the life of the process.
_anthropic_client = None


def _get_client():
    """Return the cached Anthropic client, creating it on first use.

    Returns None when the ``anthropic`` package cannot be imported or when
    ANTHROPIC_API_KEY is unset or empty. Nothing is cached in those cases,
    so a later call tries again.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client

    try:
        import anthropic  # type: ignore

        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
        if api_key:
            _anthropic_client = anthropic.Anthropic(api_key=api_key)
    except ImportError:
        # The SDK is optional; callers fall back to the rule judge.
        pass
    return _anthropic_client
def _llm_judge(
    persona: Persona, tip: dict, hour: int, day_of_week: int, rng: random.Random,
) -> tuple[str, int]:
    """Ask the LLM how the persona reacts to a tip; return (action, dwell_ms).

    Falls back to _rule_judge() entirely when no client is available. When
    the LLM call fails or its reply cannot be mapped onto one of ACTIONS,
    only the *action* is taken from _rule_judge(). The dwell is always
    simulated locally: from the engagement score for "done", or a short
    random value (2-15 s) for snooze/dismiss.

    Args:
        persona: Simulated user; ``name`` and ``description`` go into the prompt.
        tip: Recommendation dict; reads ``content`` and ``features``
            (``priority``, ``is_overdue``, ``task_age_days``).
        hour: Hour of day used for the prompt and the engagement score.
        day_of_week: Day index, taken modulo 7 with 0 = Mon.
        rng: Random source for dwell simulation and rule-judge fallback.
    """
    client = _get_client()
    if client is None:
        return _rule_judge(persona, tip, hour, rng)

    features = tip.get("features", {})
    priority = features.get("priority", 1)
    is_overdue = features.get("is_overdue", False)
    age_days = features.get("task_age_days", 0)

    priority_label = {1: "low", 2: "normal", 3: "high", 4: "urgent"}.get(priority, "normal")
    overdue_str = f", overdue by {age_days:.0f} day(s)" if is_overdue else ""
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    day_str = days[day_of_week % 7]

    prompt = (
        f"You are simulating how a specific user reacts to a task recommendation app.\n\n"
        f"User persona: {persona.name}\n"
        f"Persona: {persona.description}\n\n"
        f'Recommended task: "{tip.get("content", "Unknown task")}"\n'
        f"Task: priority={priority_label}{overdue_str}\n"
        f"Current time: {_period(hour)} ({hour}:00, {day_str})\n\n"
        f"How does this user react? Reply with exactly one word: done | snooze | dismiss\n\n"
        f"- done: acts on this tip (marks task complete)\n"
        f"- snooze: acknowledges but not now\n"
        f"- dismiss: ignores or rejects it"
    )

    action = None
    try:
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}],
        )
        words = message.content[0].text.strip().lower().split()
        if words:
            # Models frequently decorate the one-word answer ("Done.",
            # "**snooze**", '"dismiss"'). Keep only the letters of the first
            # word so those replies are honoured instead of being silently
            # replaced by the rule judge.
            token = "".join(ch for ch in words[0] if ch.isalpha())
            if token in ACTIONS:
                action = token
    except Exception:
        # Deliberately best-effort: a network/API/shape error on one event
        # must not abort the whole simulation run; use the rule judge instead.
        action = None

    if action is None:
        # Unusable or missing reply — borrow only the action from the rule judge.
        action, _ = _rule_judge(persona, tip, hour, rng)

    # Simulate dwell based on engagement level
    engagement = _engagement_score(persona, tip, hour)
    if action == "done":
        dwell = _simulate_dwell_ms(engagement, rng)
    else:
        dwell = rng.randint(2_000, 15_000)
    return action, dwell
# ── Public API ─────────────────────────────────────────────────────────────
def judge(
    persona: Persona,
    tip: dict,
    hour: int,
    day_of_week: int,
    rng: random.Random,
    use_llm: bool = True,
) -> tuple[str, int, float]:
    """Simulate one user reaction and return (action, dwell_ms, reward).

    action   — 'done' | 'snooze' | 'dismiss'
    dwell_ms — simulated milliseconds between tip appearance and user action
    reward   — inferred from action + dwell_ms via infer_reward()

    The LLM judge is used only when ``use_llm`` is true and
    ANTHROPIC_API_KEY is set to a non-empty value; otherwise the rule-based
    judge decides.
    """
    if not use_llm or not os.environ.get("ANTHROPIC_API_KEY"):
        outcome = _rule_judge(persona, tip, hour, rng)
    else:
        outcome = _llm_judge(persona, tip, hour, day_of_week, rng)

    action, dwell_ms = outcome
    reward = infer_reward(action, dwell_ms)
    return action, dwell_ms, reward