feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework
- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy
replaces linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward):
dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id}
with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges,
two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,8 +35,10 @@ app = FastAPI(title="oO ML Serving", version="1.0.0")
|
||||
STATE_DIR = Path(os.getenv("STATE_DIR", "/tmp/oo-bandit-state"))
|
||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
ALPHA = 1.0 # exploration coefficient
|
||||
D = 5 # feature dimension
|
||||
ALPHA = 1.0 # LinUCB exploration coefficient
|
||||
D = 5 # LinUCB feature dimension
|
||||
D7 = 7 # ε-greedy feature dimension (adds day-of-week cyclical encoding)
|
||||
EPSILON = 0.1 # ε-greedy exploration rate
|
||||
FEATURE_HISTORY_SIZE = 100 # per-user ring buffer
|
||||
|
||||
|
||||
@@ -63,6 +65,8 @@ def build_feature_vector(features: dict) -> np.ndarray:
|
||||
|
||||
# ── Per-user bandit state (disjoint LinUCB, global arm) ───────────────────
|
||||
|
||||
# ── LinUCB state helpers ───────────────────────────────────────────────────
|
||||
|
||||
def state_path(user_id: str) -> Path:
    """Return the JSON state file for ``user_id`` inside ``STATE_DIR``.

    Each character of the id that is not alphanumeric is replaced by ``_``
    before it is used as the file stem.
    """
    sanitized = "".join([ch if ch.isalnum() else "_" for ch in user_id])
    return STATE_DIR.joinpath(f"{sanitized}.json")
|
||||
@@ -85,6 +89,37 @@ def save_state(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
|
||||
p.write_text(json.dumps({"A": A.tolist(), "b": b.tolist(), "meta": meta}))
|
||||
|
||||
|
||||
# ── ε-greedy state helpers (d=7, extended features) ───────────────────────
|
||||
|
||||
def build_feature_vector_7(features: dict, day_of_week: int = 0) -> np.ndarray:
    """Extend the base feature vector with a cyclical day-of-week encoding.

    The result is ``build_feature_vector(features)`` followed by the sine and
    cosine of the angle ``2π · day_of_week / 7`` (d=7 when the base vector has
    the 5 entries used by LinUCB).
    """
    angle = 2 * math.pi * day_of_week / 7
    dow_encoding = [math.sin(angle), math.cos(angle)]
    return np.append(build_feature_vector(features), dow_encoding)
|
||||
|
||||
|
||||
def state7_path(user_id: str) -> Path:
    """Return the ε-greedy state file for ``user_id`` inside ``STATE_DIR``.

    Non-alphanumeric characters of the id are replaced by ``_`` and the stem
    is suffixed with ``_egreedy``.

    NOTE(review): ``state_path`` uses the same sanitisation with a plain
    ``.json`` suffix, so this path equals ``state_path(user_id + "_egreedy")``.
    Confirm user ids can never end in ``_egreedy``.
    """
    sanitized = "".join([ch if ch.isalnum() else "_" for ch in user_id])
    return STATE_DIR.joinpath(f"{sanitized}_egreedy.json")
|
||||
|
||||
|
||||
def load_state7(user_id: str) -> tuple[np.ndarray, np.ndarray, dict]:
    """Load the ε-greedy (d=7) ridge-regression state for ``user_id``.

    Returns:
        ``(A, b, meta)`` where ``A`` and ``b`` are float64 arrays rebuilt from
        the persisted JSON and ``meta`` is the stored metadata dict (empty if
        the file has no ``"meta"`` key). When no state file exists, a fresh
        state is returned: ``D7 × D7`` identity, ``D7`` zeros and ``{}``.
    """
    try:
        # Read directly instead of checking exists() first: the reset endpoint
        # may unlink the file between a check and the read.
        raw = json.loads(state7_path(user_id).read_text())
    except FileNotFoundError:
        return np.identity(D7, dtype=np.float64), np.zeros(D7, dtype=np.float64), {}

    A = np.array(raw["A"], dtype=np.float64)
    b = np.array(raw["b"], dtype=np.float64)
    return A, b, raw.get("meta", {})
|
||||
|
||||
|
||||
def save_state7(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
    """Persist the ε-greedy state for ``user_id`` as JSON.

    The file holds the keys ``"A"`` and ``"b"`` (arrays converted to nested
    lists) and ``"meta"``, written to the path given by ``state7_path``.
    """
    target = state7_path(user_id)
    serialized = json.dumps(dict(A=A.tolist(), b=b.tolist(), meta=meta))
    target.write_text(serialized)
|
||||
|
||||
|
||||
# ── API models ─────────────────────────────────────────────────────────────
|
||||
|
||||
class CandidateFeatures(BaseModel):
|
||||
@@ -124,6 +159,7 @@ class RewardRequest(BaseModel):
|
||||
tip_id: str
|
||||
reward: float # +1 done, +0.5 helpful, 0 snooze, -0.5 not_helpful, -1 dismiss
|
||||
features: CandidateFeatures
|
||||
day_of_week: int = 0 # included so egreedy can train dow features correctly
|
||||
|
||||
|
||||
class RewardResponse(BaseModel):
|
||||
@@ -209,12 +245,131 @@ def reward(req: RewardRequest) -> RewardResponse:
|
||||
return RewardResponse(ok=True)
|
||||
|
||||
|
||||
@app.post("/score/egreedy", response_model=ScoreResponse)
def score_egreedy(req: ScoreRequest) -> ScoreResponse:
    """Pick a tip with the ε-greedy policy over d=7 features.

    With probability ``EPSILON`` a candidate is drawn uniformly at random
    (exploration); otherwise the candidate with the highest linear payoff
    estimate θ·x is chosen (exploitation). Unlike LinUCB there is no UCB
    bonus, and the feature vector carries the day-of-week encoding.

    Side effects: appends the decision to the user's feature history, bumps
    the ``pulls`` / ``explore_count`` counters in ``meta`` and saves the state.

    Raises:
        HTTPException: 422 when the request carries no candidates.
    """
    if not req.candidates:
        raise HTTPException(status_code=422, detail="No candidates")

    A, b, meta = load_state7(req.user_id)
    try:
        A_inv = np.linalg.inv(A)
    except np.linalg.LinAlgError:
        # Singular A: fall back to the identity, which makes θ equal to b.
        A_inv = np.identity(D7, dtype=np.float64)
    theta = A_inv @ b

    dow = req.context.day_of_week

    def features_for(candidate) -> dict:
        # Hour comes from the request context, the rest from the candidate.
        return {
            "hour_of_day": req.context.hour_of_day,
            "is_overdue": candidate.features.is_overdue,
            "task_age_days": candidate.features.task_age_days,
            "priority": candidate.features.priority,
        }

    def payoff(fd: dict) -> float:
        return float(theta @ build_feature_vector_7(fd, dow))

    def utc_stamp() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    exploring = np.random.random() < EPSILON
    if exploring:
        chosen = req.candidates[np.random.randint(len(req.candidates))]
        feat_dict = features_for(chosen)
        best_score = payoff(feat_dict)
        best_id = chosen.id
    else:
        best_id, best_score, feat_dict = None, -float("inf"), {}
        for candidate in req.candidates:
            fd = features_for(candidate)
            s = payoff(fd)
            # Strict comparison: on ties the earliest candidate wins.
            if s > best_score:
                best_id, best_score, feat_dict = candidate.id, s, fd

    get_feature_history(req.user_id).append({
        "ts": utc_stamp(),
        "features": {**feat_dict, "day_of_week": dow, "exploring": exploring},
        "score": best_score,
        "tip_id": best_id,
        "policy": "egreedy-v1",
    })

    meta["pulls"] = meta.get("pulls", 0) + 1
    meta["explore_count"] = meta.get("explore_count", 0) + int(exploring)
    meta["last_updated"] = utc_stamp()
    # NOTE(review): A and b are written back unchanged from the load above; a
    # reward update saved in between would presumably be overwritten — confirm
    # whether requests for the same user can overlap.
    save_state7(req.user_id, A, b, meta)

    return ScoreResponse(tip_id=best_id, score=best_score, policy="egreedy-v1")
|
||||
|
||||
|
||||
@app.post("/reward/egreedy", response_model=RewardResponse)
def reward_egreedy(req: RewardRequest) -> RewardResponse:
    """Fold one observed reward into the user's ε-greedy ridge estimator.

    Rebuilds the d=7 feature vector from the request (including its
    ``day_of_week``), adds ``x xᵀ`` to ``A`` and ``reward · x`` to ``b``,
    updates the reward counters in ``meta`` and saves the state.
    """
    A, b, meta = load_state7(req.user_id)

    field_names = ("hour_of_day", "is_overdue", "task_age_days", "priority")
    feat_dict = {name: getattr(req.features, name) for name in field_names}
    x = build_feature_vector_7(feat_dict, day_of_week=req.day_of_week)

    A += np.outer(x, x)
    b += req.reward * x

    meta.update({
        "cumulative_reward": meta.get("cumulative_reward", 0.0) + req.reward,
        "reward_count": meta.get("reward_count", 0) + 1,
        "last_updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    })
    save_state7(req.user_id, A, b, meta)
    return RewardResponse(ok=True)
|
||||
|
||||
|
||||
@app.get("/stats/egreedy/{user_id}")
def stats_egreedy(user_id: str):
    """Report ε-greedy policy stats for a user.

    Returns a dict with the pull and reward counters, the cumulative and mean
    reward, the observed exploration rate, the θ vector with its feature
    labels, and the last update timestamp. Ratios are ``0.0`` when their
    denominator is zero; θ is all zeros when ``A`` cannot be inverted.
    """
    A, b, meta = load_state7(user_id)

    try:
        theta = (np.linalg.inv(A) @ b).tolist()
    except np.linalg.LinAlgError:
        theta = [0.0] * D7

    n_pulls = meta.get("pulls", 0)
    total_reward = meta.get("cumulative_reward", 0.0)
    n_rewards = meta.get("reward_count", 0)
    n_explored = meta.get("explore_count", 0)

    mean_reward = 0.0
    if n_rewards > 0:
        mean_reward = total_reward / n_rewards

    explore_rate = 0.0
    if n_pulls > 0:
        explore_rate = n_explored / n_pulls

    return {
        "user_id": user_id,
        "policy": "egreedy-v1",
        "pulls": n_pulls,
        "reward_count": n_rewards,
        "cumulative_reward": total_reward,
        "estimated_mean_reward": mean_reward,
        "exploration_rate": explore_rate,
        "theta": theta,
        "feature_labels": ["hour_sin", "hour_cos", "is_overdue", "task_age", "priority", "dow_sin", "dow_cos"],
        "last_updated": meta.get("last_updated"),
    }
|
||||
|
||||
|
||||
@app.post("/reset/{user_id}", response_model=RewardResponse)
def reset(user_id: str) -> RewardResponse:
    """Reset per-user bandit state (admin action).

    Deletes the user's LinUCB and ε-greedy state files if they exist and
    clears the in-memory feature history for the user when one is present.
    Always returns ``RewardResponse(ok=True)``.
    """
    for path in (state_path(user_id), state7_path(user_id)):
        # missing_ok avoids the exists()/unlink() race: a concurrent reset may
        # remove the file between the check and the delete.
        path.unlink(missing_ok=True)

    if user_id in _feature_history:
        _feature_history[user_id].clear()
    return RewardResponse(ok=True)
|
||||
|
||||
Reference in New Issue
Block a user