feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework

- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy
  replace linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward):
  dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id}
  with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges,
  two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 07:44:37 +00:00
parent c5ea18ec6e
commit faf44c18fc
48 changed files with 6151 additions and 40 deletions

View File

@@ -0,0 +1,204 @@
"""
LLM-based user reaction judge.
Uses Claude Haiku when ANTHROPIC_API_KEY is set and the `anthropic` package
can be imported; otherwise falls back to a persona-based rule judge whose
output is deterministic for a given seeded RNG.
"""
from __future__ import annotations
import os
import random
from personas import Persona
# The three reactions a simulated user can have to a recommended tip.
ACTIONS = ["done", "snooze", "dismiss"]

# Reward is no longer a fixed action -> value map: for "done" it depends on the
# simulated dwell time. Call infer_reward() to obtain the final reward.
_BASE_REWARDS: dict[str, float] = dict(
    done=1.0,  # placeholder; the real reward is computed from dwell
    snooze=0.1,
    dismiss=-1.0,
)
def infer_reward(action: str, dwell_ms: int) -> float:
    """Convert a user action and its dwell time into a scalar reward.

    Per the original author's note this mirrors production inferReward()
    in recommender.ts.

    Args:
        action: The user's reaction. "dismiss" and "snooze" get a flat
            reward; any other value is scored as a completed ("done") tip.
        dwell_ms: Milliseconds between the tip appearing and the action.
            Only consulted for the "done" case.

    Returns:
        -1.0 for dismiss, 0.1 for snooze, otherwise a dwell-dependent value:
        -0.3 below 15 s, 1.0 below 2 min, 0.6 below 10 min, else 0.3.
    """
    # Flat rewards: dwell time is irrelevant for these two actions.
    if action in ("dismiss", "snooze"):
        return -1.0 if action == "dismiss" else 0.1

    # "done" tiers as (exclusive upper bound in ms, reward), checked in order.
    tiers = (
        (15_000, -0.3),   # stale / reflex done
        (120_000, 1.0),   # magic zone
        (600_000, 0.6),   # good
    )
    for upper_ms, reward in tiers:
        if dwell_ms < upper_ms:
            return reward
    return 0.3  # eventually done
# Half-open [start, end) hour ranges mapped to a named part of the day.
# Hours not covered by any range are reported as "night" by _period().
_HOUR_PERIODS = {
    (5, 10): "morning",
    (10, 14): "midday",
    (14, 18): "afternoon",
    (18, 22): "evening",
}


def _period(hour: int) -> str:
    """Return the named part of the day that ``hour`` falls in.

    Looks ``hour`` up in _HOUR_PERIODS (start inclusive, end exclusive) and
    returns "night" when no range matches.
    """
    matches = (
        label
        for (start, end), label in _HOUR_PERIODS.items()
        if start <= hour < end
    )
    return next(matches, "night")
# ── Deterministic judge ────────────────────────────────────────────────────
def _engagement_score(persona: Persona, tip: dict, hour: int) -> float:
    """Estimate how well this tip fits this persona right now.

    Starts from a 0.35 baseline and adjusts it for the task's priority, its
    overdue flag, and whether ``hour`` falls inside the persona's active
    window. The result is clamped to the range 0.05–0.90.

    Args:
        persona: Supplies ``prefers_high_priority``, ``prefers_overdue``,
            ``morning_active`` and ``evening_active``.
        tip: Tip payload; ``tip["features"]`` may carry ``priority``
            (default 1) and ``is_overdue`` (default False).
        hour: Hour of day used to decide morning (5–9) or evening (18–21).

    Returns:
        The clamped engagement score.
    """
    features = tip.get("features", {})

    # Rescale priority so 1 -> 0.0 and 4 -> 1.0, then centre it on 0.5.
    # NOTE(review): presumably priorities run 1–4 — confirm with the generator.
    priority_norm = (features.get("priority", 1) - 1) / 3.0
    score = 0.35 + (priority_norm - 0.5) * persona.prefers_high_priority * 0.4

    if features.get("is_overdue", False):
        score += (persona.prefers_overdue - 0.5) * 0.3

    in_morning = 5 <= hour < 10
    in_evening = 18 <= hour < 22
    off_peak = not (in_morning or in_evening)

    if (persona.morning_active and in_morning) or (persona.evening_active and in_evening):
        # The tip arrives inside the persona's preferred window.
        score += 0.15
    elif off_peak and (persona.morning_active or persona.evening_active):
        # Persona has a preferred window, but it is neither morning nor evening.
        score -= 0.10

    return max(0.05, min(0.90, score))
def _simulate_dwell_ms(engagement: float, rng: random.Random) -> int:
    """Simulate how many milliseconds the user takes to complete a tip.

    The dwell is drawn uniformly from a band chosen by ``engagement``:

    * ``>= 0.70`` — strong match: 15 s–90 s (inside the 15 s–2 min magic zone)
    * ``>= 0.50`` — moderate match: 2–8 min (inside the 2–10 min good zone)
    * otherwise  — weak match but eventually done: 10–30 min

    This only models the "done" case; the judges in this file draw their own
    short dwell for snooze/dismiss, where dwell does not affect the reward.

    Args:
        engagement: Engagement score for the persona/tip pair.
        rng: Random source; exactly one ``randint`` draw is consumed.

    Returns:
        The simulated dwell time in milliseconds.
    """
    # (minimum engagement, low ms, high ms), checked from strongest to weakest.
    bands = (
        (0.70, 15_000, 90_000),
        (0.50, 120_000, 480_000),
    )
    for floor, low_ms, high_ms in bands:
        if engagement >= floor:
            return rng.randint(low_ms, high_ms)
    return rng.randint(600_000, 1_800_000)
def _rule_judge(persona: Persona, tip: dict, hour: int, rng: random.Random) -> tuple[str, int]:
    """Pick a reaction from persona preferences and task features alone.

    A single uniform roll is compared against the engagement score:
    below 55% of the score the tip is "done", below the full score it is
    snoozed, and anything else is dismissed.

    Args:
        persona: Persona whose preferences drive the engagement score.
        tip: Tip payload passed through to _engagement_score().
        hour: Hour of day passed through to _engagement_score().
        rng: Random source for the roll and the dwell draw.

    Returns:
        ``(action, dwell_ms)``.
    """
    engagement = _engagement_score(persona, tip, hour)
    roll = rng.random()

    if roll < engagement * 0.55:
        # Done: the dwell depends on how engaged the persona is.
        return "done", _simulate_dwell_ms(engagement, rng)

    if roll < engagement:
        return "snooze", rng.randint(3_000, 20_000)

    return "dismiss", rng.randint(1_000, 5_000)
# ── LLM judge ─────────────────────────────────────────────────────────────
# Anthropic client, created on first successful _get_client() call and reused.
_anthropic_client = None


def _get_client():
    """Return the shared Anthropic client, or None when it is unavailable.

    The client is built on first use from ANTHROPIC_API_KEY and cached in the
    module-level ``_anthropic_client``. None is returned when the ``anthropic``
    package cannot be imported or the key is unset/empty; in that case nothing
    is cached, so a later call tries again.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client

    try:
        import anthropic  # type: ignore

        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
        if api_key:
            _anthropic_client = anthropic.Anthropic(api_key=api_key)
    except ImportError:
        # Best-effort: without the SDK the caller falls back to the rule judge.
        return None
    return _anthropic_client
def _llm_judge(
    persona: Persona, tip: dict, hour: int, day_of_week: int, rng: random.Random,
) -> tuple[str, int]:
    """Ask the LLM how the persona reacts to a tip, then simulate the dwell.

    The persona, task and time of day are described in a prompt and the model
    is asked to answer with one word. The first word of the reply is
    lower-cased and stripped of surrounding punctuation/markdown (so "Done."
    or "**done**" still count) before being matched against ACTIONS.

    Falls back to _rule_judge() — returning its ``(action, dwell_ms)`` pair
    unchanged — when no client is available, the API call fails, or the reply
    is not one of ACTIONS.

    Args:
        persona: Persona being simulated; ``name`` and ``description`` are
            sent to the model.
        tip: Tip payload; ``content`` and ``features`` (``priority``,
            ``is_overdue``, ``task_age_days``) are read.
        hour: Hour of day shown to the model and used for the dwell.
        day_of_week: Index into Mon..Sun, taken modulo 7.
        rng: Random source for the dwell draw and any fallback.

    Returns:
        ``(action, dwell_ms)``.
    """
    client = _get_client()
    if client is None:
        return _rule_judge(persona, tip, hour, rng)

    features = tip.get("features", {})
    priority = features.get("priority", 1)
    is_overdue = features.get("is_overdue", False)
    age_days = features.get("task_age_days", 0)
    priority_label = {1: "low", 2: "normal", 3: "high", 4: "urgent"}.get(priority, "normal")
    overdue_str = f", overdue by {age_days:.0f} day(s)" if is_overdue else ""
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    day_str = days[day_of_week % 7]
    prompt = (
        f"You are simulating how a specific user reacts to a task recommendation app.\n\n"
        f"User persona: {persona.name}\n"
        f"Persona: {persona.description}\n\n"
        f'Recommended task: "{tip.get("content", "Unknown task")}"\n'
        f"Task: priority={priority_label}{overdue_str}\n"
        f"Current time: {_period(hour)} ({hour}:00, {day_str})\n\n"
        f"How does this user react? Reply with exactly one word: done | snooze | dismiss\n\n"
        f"- done: acts on this tip (marks task complete)\n"
        f"- snooze: acknowledges but not now\n"
        f"- dismiss: ignores or rejects it"
    )

    action = ""
    try:
        message = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content": prompt}],
        )
        words = message.content[0].text.strip().lower().split()
        if words:
            # Models often decorate the single word ("Done.", "**done**");
            # strip that so a valid answer is not thrown away.
            action = words[0].strip(".,;:!?\"'`*_()[]{}<>|")
    except Exception:
        # Deliberate best-effort: any API or parsing failure must not abort
        # the simulation run, so the rule judge decides below instead.
        action = ""

    if action not in ACTIONS:
        # Behave exactly like the rule judge on fallback, as the early return
        # above does, rather than discarding its dwell and drawing another.
        return _rule_judge(persona, tip, hour, rng)

    # The model only chooses the action; dwell is simulated from engagement.
    engagement = _engagement_score(persona, tip, hour)
    if action == "done":
        dwell = _simulate_dwell_ms(engagement, rng)
    else:
        dwell = rng.randint(2_000, 15_000)
    return action, dwell
# ── Public API ─────────────────────────────────────────────────────────────
def judge(
    persona: Persona,
    tip: dict,
    hour: int,
    day_of_week: int,
    rng: random.Random,
    use_llm: bool = True,
) -> tuple[str, int, float]:
    """Simulate a user's reaction to a tip and score it.

    The LLM judge is used only when ``use_llm`` is true and ANTHROPIC_API_KEY
    is set to a non-empty value; otherwise the rule judge decides.

    Args:
        persona: Persona being simulated.
        tip: Tip payload shown to the persona.
        hour: Hour of day of the simulated interaction.
        day_of_week: Day-of-week index; only forwarded to the LLM judge.
        rng: Random source shared by both judges.
        use_llm: Set to False to force the rule judge.

    Returns:
        ``(action, dwell_ms, reward)`` where
        action   — 'done' | 'snooze' | 'dismiss'
        dwell_ms — simulated milliseconds between tip appearance and user action
        reward   — inferred from action + dwell_ms via infer_reward()
    """
    llm_enabled = use_llm and os.environ.get("ANTHROPIC_API_KEY")
    if llm_enabled:
        reaction = _llm_judge(persona, tip, hour, day_of_week, rng)
    else:
        reaction = _rule_judge(persona, tip, hour, rng)

    action, dwell_ms = reaction
    reward = infer_reward(action, dwell_ms)
    return action, dwell_ms, reward