feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework

- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy
  replace the linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward):
  dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id}
  with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges,
  two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 07:44:37 +00:00
parent c5ea18ec6e
commit faf44c18fc
48 changed files with 6151 additions and 40 deletions

View File

@@ -35,8 +35,10 @@ app = FastAPI(title="oO ML Serving", version="1.0.0")
# Directory that holds the per-user JSON state files written by the state
# helpers; overridable through the STATE_DIR environment variable.
STATE_DIR = Path(os.getenv("STATE_DIR", "/tmp/oo-bandit-state"))
STATE_DIR.mkdir(parents=True, exist_ok=True)  # created at import time, parents included
ALPHA = 1.0 # exploration coefficient
D = 5 # feature dimension
ALPHA = 1.0 # LinUCB exploration coefficient
D = 5 # LinUCB feature dimension
D7 = 7 # ε-greedy feature dimension (adds day-of-week cyclical encoding)
EPSILON = 0.1 # ε-greedy exploration rate: probability of serving a uniformly random candidate
FEATURE_HISTORY_SIZE = 100 # per-user ring buffer
@@ -63,6 +65,8 @@ def build_feature_vector(features: dict) -> np.ndarray:
# ── Per-user bandit state (disjoint LinUCB, global arm) ───────────────────
# ── LinUCB state helpers ───────────────────────────────────────────────────
def state_path(user_id: str) -> Path:
    """Return the LinUCB state file for *user_id* inside STATE_DIR.

    Every character of the id that is not alphanumeric is replaced by "_",
    so the id cannot contribute path separators or dots to the file name.
    """
    sanitized_chars = [ch if ch.isalnum() else "_" for ch in user_id]
    filename = "".join(sanitized_chars) + ".json"
    return STATE_DIR / filename
@@ -85,6 +89,37 @@ def save_state(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
p.write_text(json.dumps({"A": A.tolist(), "b": b.tolist(), "meta": meta}))
# ── ε-greedy state helpers (d=7, extended features) ───────────────────────
def build_feature_vector_7(features: dict, day_of_week: int = 0) -> np.ndarray:
    """Build the d=7 feature vector used by the ε-greedy policy.

    The vector is the output of ``build_feature_vector(features)`` followed by
    the sine and cosine of the day-of-week angle (period 7), so the weekday is
    encoded cyclically rather than as a raw integer.
    """
    base_vector = build_feature_vector(features)
    angle = 2 * math.pi * day_of_week / 7
    cyclical = [math.sin(angle), math.cos(angle)]
    return np.append(base_vector, cyclical)
def state7_path(user_id: str) -> Path:
    """Return the ε-greedy state file for *user_id* inside STATE_DIR.

    The id is sanitised the same way as for the LinUCB file (non-alphanumeric
    characters become "_"); the "_egreedy" suffix keeps the two policies'
    files apart for the same user.
    """
    sanitized_chars = [ch if ch.isalnum() else "_" for ch in user_id]
    filename = "".join(sanitized_chars) + "_egreedy.json"
    return STATE_DIR / filename
def load_state7(user_id: str) -> tuple[np.ndarray, np.ndarray, dict]:
    """Load the ε-greedy (d=7) estimator state for *user_id*.

    Returns:
        ``(A, b, meta)``. ``A`` and ``b`` are float64 arrays read from the
        user's state file and ``meta`` is the stored bookkeeping dict (``{}``
        when the file has no "meta" key). A user without a state file gets a
        fresh state: the D7×D7 identity, a zero vector of length D7 and ``{}``.

    Raises:
        json.JSONDecodeError / KeyError: when the state file exists but is not
        valid JSON or lacks the "A" / "b" entries.
    """
    p = state7_path(user_id)
    try:
        text = p.read_text()
    except FileNotFoundError:
        # EAFP instead of exists()+read: the reset endpoint may unlink the file
        # between a check and the read, which must count as "no state yet".
        return np.identity(D7, dtype=np.float64), np.zeros(D7, dtype=np.float64), {}

    raw = json.loads(text)
    A = np.array(raw["A"], dtype=np.float64)
    b = np.array(raw["b"], dtype=np.float64)
    return A, b, raw.get("meta", {})
def save_state7(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
    """Write the ε-greedy state for *user_id* to its JSON state file.

    The file holds a single object with the keys "A" and "b" (arrays converted
    to nested lists) and "meta" (stored as given). Any previous content of the
    file is overwritten.
    """
    target = state7_path(user_id)
    payload = {"A": A.tolist(), "b": b.tolist(), "meta": meta}
    target.write_text(json.dumps(payload))
# ── API models ─────────────────────────────────────────────────────────────
class CandidateFeatures(BaseModel):
@@ -124,6 +159,7 @@ class RewardRequest(BaseModel):
tip_id: str
reward: float # +1 done, +0.5 helpful, 0 snooze, -0.5 not_helpful, -1 dismiss
features: CandidateFeatures
day_of_week: int = 0 # included so egreedy can train dow features correctly
class RewardResponse(BaseModel):
@@ -209,12 +245,131 @@ def reward(req: RewardRequest) -> RewardResponse:
return RewardResponse(ok=True)
@app.post("/score/egreedy", response_model=ScoreResponse)
def score_egreedy(req: ScoreRequest) -> ScoreResponse:
    """Pick one candidate with the ε-greedy policy over d=7 features.

    With probability EPSILON a candidate is drawn uniformly at random
    (exploration). Otherwise the candidate with the highest linear payoff
    estimate θ·x is chosen (exploitation); on ties the earliest candidate
    wins. Compared with LinUCB there is no confidence bonus, and the feature
    vector additionally carries the day-of-week encoding.

    Side effects: the decision is appended to the user's feature history and
    the "pulls", "explore_count" and "last_updated" meta fields are updated
    and saved.

    Raises:
        HTTPException: 422 when the request contains no candidates.
    """
    if not req.candidates:
        raise HTTPException(status_code=422, detail="No candidates")

    A, b, meta = load_state7(req.user_id)
    try:
        A_inv = np.linalg.inv(A)
    except np.linalg.LinAlgError:
        # Singular A: fall back to the identity, which makes θ equal to b.
        A_inv = np.identity(D7, dtype=np.float64)
    theta = A_inv @ b
    dow = req.context.day_of_week

    def raw_features(candidate) -> dict:
        # Hour comes from the request context, the rest from the candidate.
        return {
            "hour_of_day": req.context.hour_of_day,
            "is_overdue": candidate.features.is_overdue,
            "task_age_days": candidate.features.task_age_days,
            "priority": candidate.features.priority,
        }

    def payoff(features: dict) -> float:
        return float(theta @ build_feature_vector_7(features, dow))

    exploring = np.random.random() < EPSILON
    if exploring:
        chosen = req.candidates[np.random.randint(len(req.candidates))]
        feat_dict = raw_features(chosen)
        best_score = payoff(feat_dict)
        best_id = chosen.id
    else:
        best_id, best_score, feat_dict = None, -float("inf"), {}
        for candidate in req.candidates:
            candidate_features = raw_features(candidate)
            candidate_score = payoff(candidate_features)
            # Strict ">" keeps the first of several equally scored candidates.
            if candidate_score > best_score:
                best_id = candidate.id
                best_score = candidate_score
                feat_dict = candidate_features

    get_feature_history(req.user_id).append({
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "features": {**feat_dict, "day_of_week": dow, "exploring": exploring},
        "score": best_score,
        "tip_id": best_id,
        "policy": "egreedy-v1",
    })

    meta["pulls"] = meta.get("pulls", 0) + 1
    meta["explore_count"] = meta.get("explore_count", 0) + int(exploring)
    meta["last_updated"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # A and b are written back exactly as loaded; only meta changed here.
    # NOTE(review): a reward update saved between the load above and this save
    # would be overwritten — confirm whether per-user requests can overlap.
    save_state7(req.user_id, A, b, meta)

    return ScoreResponse(tip_id=best_id, score=best_score, policy="egreedy-v1")
@app.post("/reward/egreedy", response_model=RewardResponse)
def reward_egreedy(req: RewardRequest) -> RewardResponse:
    """Fold one observed reward into the user's ε-greedy ridge estimator.

    The d=7 feature vector x is rebuilt from the request's features and
    ``day_of_week``; the state is then updated with ``A += x·xᵀ`` and
    ``b += reward·x``. The "cumulative_reward", "reward_count" and
    "last_updated" meta fields are updated and the state is saved.

    Raises:
        HTTPException: 422 when the reward is NaN or infinite.
    """
    # A non-finite reward would be added into b and persisted, which makes
    # θ = A⁻¹·b non-finite for every later request of this user. Reject it
    # before any state is touched.
    if not math.isfinite(req.reward):
        raise HTTPException(status_code=422, detail="Reward must be a finite number")

    A, b, meta = load_state7(req.user_id)

    feat_dict = {
        "hour_of_day": req.features.hour_of_day,
        "is_overdue": req.features.is_overdue,
        "task_age_days": req.features.task_age_days,
        "priority": req.features.priority,
    }
    x = build_feature_vector_7(feat_dict, day_of_week=req.day_of_week)

    A += np.outer(x, x)
    b += req.reward * x

    meta["cumulative_reward"] = meta.get("cumulative_reward", 0.0) + req.reward
    meta["reward_count"] = meta.get("reward_count", 0) + 1
    meta["last_updated"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    save_state7(req.user_id, A, b, meta)

    return RewardResponse(ok=True)
@app.get("/stats/egreedy/{user_id}")
def stats_egreedy(user_id: str):
    """Report ε-greedy statistics for *user_id*.

    The response contains the pull and reward counters from the stored meta,
    two derived ratios (mean reward per recorded reward, share of pulls that
    were exploratory) and the current θ = A⁻¹·b with one label per component.
    Ratios are 0.0 while their denominator is zero; θ is all zeros when A
    cannot be inverted.
    """
    A, b, meta = load_state7(user_id)

    try:
        theta = (np.linalg.inv(A) @ b).tolist()
    except np.linalg.LinAlgError:
        theta = [0.0] * D7

    pulls = meta.get("pulls", 0)
    cumulative_reward = meta.get("cumulative_reward", 0.0)
    reward_count = meta.get("reward_count", 0)
    explore_count = meta.get("explore_count", 0)

    if reward_count > 0:
        estimated_mean_reward = cumulative_reward / reward_count
    else:
        estimated_mean_reward = 0.0

    if pulls > 0:
        exploration_rate = explore_count / pulls
    else:
        exploration_rate = 0.0

    feature_labels = ["hour_sin", "hour_cos", "is_overdue", "task_age", "priority", "dow_sin", "dow_cos"]

    return {
        "user_id": user_id,
        "policy": "egreedy-v1",
        "pulls": pulls,
        "reward_count": reward_count,
        "cumulative_reward": cumulative_reward,
        "estimated_mean_reward": estimated_mean_reward,
        "exploration_rate": exploration_rate,
        "theta": theta,
        "feature_labels": feature_labels,
        "last_updated": meta.get("last_updated"),
    }
@app.post("/reset/{user_id}", response_model=RewardResponse)
def reset(user_id: str) -> RewardResponse:
    """Reset per-user bandit state (admin action).

    Deletes the user's LinUCB and ε-greedy state files if they exist and
    empties the user's in-memory feature history. Resetting a user without
    any state is a no-op that still returns ``ok=True``.
    """
    for state_file in (state_path(user_id), state7_path(user_id)):
        # missing_ok avoids the exists()/unlink() race: a file removed by a
        # concurrent reset no longer raises FileNotFoundError here.
        state_file.unlink(missing_ok=True)

    if user_id in _feature_history:
        _feature_history[user_id].clear()

    return RewardResponse(ok=True)

View File

@@ -4,6 +4,7 @@
"private": true,
"scripts": {
"dev": ".venv/bin/uvicorn main:app --reload --port 8000",
"start": ".venv/bin/uvicorn main:app --port 8000"
"start": ".venv/bin/uvicorn main:app --port 8000",
"test": ".venv/bin/python -m pytest tests/ -v"
}
}

View File

@@ -0,0 +1,4 @@
-r requirements.txt
pytest==8.3.5
pytest-asyncio==0.24.0
httpx==0.28.1

View File

@@ -2,3 +2,5 @@ fastapi==0.115.6
uvicorn[standard]==0.32.1
pydantic==2.10.4
numpy>=1.26.0
httpx>=0.27.0
anthropic>=0.40.0

View File

View File

@@ -0,0 +1,261 @@
"""
Unit tests for ml/serving — feature building and scoring contract.
Run with: pytest ml/serving/tests/
"""
import math
import pytest
from httpx import AsyncClient, ASGITransport
from main import app, build_feature_vector
class TestFeatureVector:
    """Pin the layout of the base feature vector.

    Index 0/1: sine/cosine of the hour angle (period 24), index 2: overdue
    flag, index 3: task age scaled to [0, 1] with a cap at 30 days,
    index 4: priority 1..4 scaled to [0, 1].
    """

    def test_shape(self):
        """A fully specified feature dict yields five components."""
        vec = build_feature_vector(
            {"hour_of_day": 8, "is_overdue": True, "task_age_days": 3, "priority": 3}
        )
        assert vec.shape == (5,)

    def test_hour_encoding_noon(self):
        """Noon is half a turn: sin(π) ≈ 0 and cos(π) = -1."""
        vec = build_feature_vector({"hour_of_day": 12})
        hour_sin, hour_cos = vec[0], vec[1]
        assert abs(hour_sin) < 1e-10
        assert abs(hour_cos - (-1.0)) < 1e-10

    def test_hour_encoding_midnight(self):
        """Midnight is angle zero: sin(0) = 0 and cos(0) = 1."""
        vec = build_feature_vector({"hour_of_day": 0})
        hour_sin, hour_cos = vec[0], vec[1]
        assert abs(hour_sin) < 1e-10
        assert abs(hour_cos - 1.0) < 1e-10

    def test_hour_encoding_6am(self):
        """6am is a quarter turn: sin(π/2) = 1 and cos(π/2) = 0."""
        vec = build_feature_vector({"hour_of_day": 6})
        hour_sin, hour_cos = vec[0], vec[1]
        assert abs(hour_sin - 1.0) < 1e-10
        assert abs(hour_cos) < 1e-10

    def test_age_clipped_at_30(self):
        """Ages beyond 30 days saturate at the same value as exactly 30 days."""
        beyond_cap = build_feature_vector({"task_age_days": 100})
        at_cap = build_feature_vector({"task_age_days": 30})
        assert beyond_cap[3] == at_cap[3] == 1.0

    def test_age_zero(self):
        """A brand-new task has age feature 0."""
        vec = build_feature_vector({"task_age_days": 0})
        assert vec[3] == pytest.approx(0.0)

    def test_age_15_days_normalised(self):
        """Fifteen days is half of the 30-day cap."""
        vec = build_feature_vector({"task_age_days": 15})
        assert vec[3] == pytest.approx(0.5)

    def test_priority_normalised(self):
        """Priority 1 maps to 0 and priority 4 maps to 1."""
        lowest = build_feature_vector({"priority": 1})
        highest = build_feature_vector({"priority": 4})
        assert lowest[4] == pytest.approx(0.0)
        assert highest[4] == pytest.approx(1.0)

    def test_priority_2_and_3(self):
        """The middle priorities land on thirds."""
        second = build_feature_vector({"priority": 2})
        third = build_feature_vector({"priority": 3})
        assert second[4] == pytest.approx(1 / 3)
        assert third[4] == pytest.approx(2 / 3)

    def test_is_overdue_true(self):
        """An overdue task sets the flag component to 1."""
        vec = build_feature_vector({"is_overdue": True})
        assert vec[2] == 1.0

    def test_is_overdue_false(self):
        """A task that is not overdue leaves the flag component at 0."""
        vec = build_feature_vector({"is_overdue": False})
        assert vec[2] == 0.0

    def test_defaults_when_no_keys(self):
        """An empty dict behaves like hour=12, not overdue, age 0, priority 1."""
        vec = build_feature_vector({})
        # Default hour 12 → sin(π) ≈ 0, cos(π) = -1.
        assert abs(vec[0]) < 1e-10
        assert abs(vec[1] - (-1.0)) < 1e-10
        assert vec[2] == 0.0  # is_overdue defaults to False
        assert vec[3] == 0.0  # task_age_days defaults to 0
        assert vec[4] == 0.0  # priority defaults to 1 → (1 - 1) / 3 = 0
@pytest.mark.asyncio
async def test_health():
    """GET /health answers 200 and reports ``ok`` as true."""
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.get("/health")
    assert response.status_code == 200
    assert response.json()["ok"] is True
@pytest.mark.asyncio
async def test_score_returns_a_candidate():
    """POST /score returns one of the submitted candidates under the linucb-v1 policy."""
    overdue_task = {
        "id": "t:1", "content": "Task A", "source": "todoist", "source_id": "1",
        "features": {"is_overdue": True, "task_age_days": 2, "priority": 3},
    }
    fresh_task = {
        "id": "t:2", "content": "Task B", "source": "todoist", "source_id": "2",
        "features": {"is_overdue": False, "task_age_days": 0, "priority": 1},
    }
    request_body = {
        "user_id": "test-user",
        "candidates": [overdue_task, fresh_task],
        "context": {"hour_of_day": 9, "day_of_week": 1},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.post("/score", json=request_body)
    assert response.status_code == 200
    result = response.json()
    assert result["tip_id"] in {"t:1", "t:2"}
    assert "policy" in result
    assert result["policy"] == "linucb-v1"
    assert isinstance(result["score"], float)
@pytest.mark.asyncio
async def test_score_single_candidate_always_selected():
    """A request with exactly one candidate must get that candidate back."""
    only_task = {
        "id": "only:1", "content": "Only task", "source": "todoist",
        "features": {"is_overdue": False, "task_age_days": 0, "priority": 1},
    }
    request_body = {
        "user_id": "solo-user",
        "candidates": [only_task],
        "context": {"hour_of_day": 10, "day_of_week": 0},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.post("/score", json=request_body)
    assert response.status_code == 200
    assert response.json()["tip_id"] == "only:1"
@pytest.mark.asyncio
async def test_score_empty_candidates_returns_422():
    """POST /score with an empty candidate list is rejected with 422."""
    request_body = {
        "user_id": "u",
        "candidates": [],
        "context": {"hour_of_day": 9, "day_of_week": 1},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.post("/score", json=request_body)
    assert response.status_code == 422
@pytest.mark.asyncio
async def test_reward_accepted():
    """POST /reward with a well-formed body answers 200 and ``ok`` true."""
    request_body = {
        "user_id": "reward-user",
        "tip_id": "t:1",
        "reward": 1.0,
        "features": {"hour_of_day": 9, "is_overdue": True, "task_age_days": 2, "priority": 3},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.post("/reward", json=request_body)
    assert response.status_code == 200
    assert response.json()["ok"] is True
@pytest.mark.asyncio
async def test_reward_updates_stats():
    """A +1.0 reward raises cumulative_reward in /stats by exactly 1.0."""
    user_id = "reward-stats-user"
    stats_url = f"/stats/{user_id}"
    reward_body = {
        "user_id": user_id,
        "tip_id": "tip:x",
        "reward": 1.0,
        "features": {"hour_of_day": 8, "is_overdue": False, "task_age_days": 0, "priority": 2},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # Read the baseline first; the assertion compares against it.
        baseline = (await http.get(stats_url)).json()["cumulative_reward"]
        await http.post("/reward", json=reward_body)
        after = await http.get(stats_url)
    assert after.json()["cumulative_reward"] == pytest.approx(baseline + 1.0)
@pytest.mark.asyncio
async def test_score_increments_pulls():
    """Two /score calls raise the user's pull counter by two."""
    user_id = "pull-counter-user"
    stats_url = f"/stats/{user_id}"
    request_body = {
        "user_id": user_id,
        "candidates": [
            {"id": "t:p1", "content": "Pull task", "source": "todoist",
             "features": {"is_overdue": False, "task_age_days": 1, "priority": 2}},
        ],
        "context": {"hour_of_day": 10, "day_of_week": 2},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # Compare against the starting count rather than assuming it is zero.
        pulls_before = (await http.get(stats_url)).json()["pulls"]
        for _ in range(2):
            await http.post("/score", json=request_body)
        after = await http.get(stats_url)
    assert after.json()["pulls"] == pulls_before + 2
@pytest.mark.asyncio
async def test_reset_clears_state():
    """After POST /reset/{user_id} the user's pull counter reads zero again."""
    user_id = "reset-user"
    score_body = {
        "user_id": user_id,
        "candidates": [
            {"id": "t:r", "content": "Reset task", "source": "todoist",
             "features": {"is_overdue": True, "task_age_days": 5, "priority": 4}},
        ],
        "context": {"hour_of_day": 14, "day_of_week": 3},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # One scoring call creates state that the reset has to remove.
        await http.post("/score", json=score_body)
        reset_response = await http.post(f"/reset/{user_id}")
        assert reset_response.json()["ok"] is True
        stats_response = await http.get(f"/stats/{user_id}")
    assert stats_response.json()["pulls"] == 0
@pytest.mark.asyncio
async def test_features_endpoint_returns_history():
    """After a /score call, /features/{user_id} lists an entry with ts, score and tip_id."""
    user_id = "features-user"
    score_body = {
        "user_id": user_id,
        "candidates": [
            {"id": "t:f1", "content": "Feature task", "source": "todoist",
             "features": {"is_overdue": False, "task_age_days": 0, "priority": 1}},
        ],
        "context": {"hour_of_day": 7, "day_of_week": 0},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        await http.post("/score", json=score_body)
        response = await http.get(f"/features/{user_id}")
    result = response.json()
    assert response.status_code == 200
    assert "history" in result
    assert len(result["history"]) >= 1
    latest = result["history"][-1]
    assert "ts" in latest
    assert "score" in latest
    assert "tip_id" in latest
@pytest.mark.asyncio
async def test_stats_for_fresh_user():
    """Stats for a user id that was never used come back as zeros, not as an error."""
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        response = await http.get("/stats/brand-new-user-xyz-abc")
    result = response.json()
    assert response.status_code == 200
    assert result["pulls"] == 0
    assert result["cumulative_reward"] == 0.0
    assert result["estimated_mean_reward"] == 0.0
@pytest.mark.asyncio
async def test_reward_negative_value():
    """A -1.0 reward (a dismissed tip) lowers cumulative_reward by exactly 1.0."""
    user_id = "dismiss-user-neg"
    stats_url = f"/stats/{user_id}"
    reward_body = {
        "user_id": user_id,
        "tip_id": "t:neg",
        "reward": -1.0,
        "features": {"hour_of_day": 20, "is_overdue": False, "task_age_days": 0, "priority": 1},
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # Read the baseline first; the assertion compares against it.
        baseline = (await http.get(stats_url)).json()["cumulative_reward"]
        await http.post("/reward", json=reward_body)
        after = await http.get(stats_url)
    assert after.json()["cumulative_reward"] == pytest.approx(baseline - 1.0)