feat(ml): egreedy-v2 shadow policy — D=12 with profile features (#99)

Ship the scaffolding for #99 (phase B.3 of #81):

- ml/serving: add /score/egreedy/v2, /reward/egreedy/v2, /stats/egreedy/v2
  endpoints (D=12). New feature dims: completion/dismiss rates, mean dwell
  (clipped 10min), preferred-hour alignment (cosine, 1-dim), tip volume (log).
  Separate state file per user (_egreedy_v2.json). /reset clears v2 state too.
- ADR-0012: documents D=7→12 dimension change, normalization choices, shadow
  rollout protocol, and promotion gate (offline sim win per ADR-0002).
- recommender.ts: register egreedy-v2-shadow in shadow-policy map (disabled by
  default). When enabled, calls /score/egreedy/v2 fire-and-forget and publishes
  shadow:egreedy-v2-shadow serve signal. No reward to shadow — sim is the gate.
- sim runner/personas: personas carry synthetic profile_features per persona;
  _call_score/_call_reward thread profile_features through (None-safe for v1/linucb).
- 18 new Python tests; all 56 Python + 170 TS tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-25 10:00:38 +00:00
parent b8113d4bda
commit 2d7cf217a9
6 changed files with 629 additions and 20 deletions

View File

@@ -2,12 +2,17 @@
oO ML Serving — Phase 1: LinUCB contextual bandit.
Contract:
POST /score { user_id, candidates, context } → { tip_id, score, policy }
POST /reward { user_id, tip_id, reward, features } → { ok }
POST /reset/{user_id} → { ok }
GET /stats/{user_id} → { pulls, cumulative_reward, estimated_mean, last_updated }
GET /features/{user_id} → { history: [{ ts, features, score }] }
GET /health → { ok }
POST /score LinUCB d=5 (baseline, kept as shadow-eligible)
POST /score/egreedy ε-greedy v1, d=7 (active — ADR-0007)
POST /score/egreedy/v2 ε-greedy v2, d=12, profile features (shadow — ADR-0012)
POST /reward, /reward/egreedy, /reward/egreedy/v2
GET /stats/{user_id} LinUCB stats
GET /stats/egreedy/{user_id} ε-greedy v1 stats
GET /stats/egreedy/v2/{user_id} ε-greedy v2 stats
POST /reset/{user_id} clear all per-user bandit state
GET /features/{user_id} last 100 scored feature vectors
POST /generate LLM tip candidates via LiteLLM
GET /health { ok }
Features (d=5):
hour_sin, hour_cos — cyclical time-of-day encoding
@@ -43,7 +48,8 @@ STATE_DIR.mkdir(parents=True, exist_ok=True)
ALPHA = 1.0 # LinUCB exploration coefficient
D = 5 # LinUCB feature dimension
D7 = 7 # ε-greedy feature dimension (adds day-of-week cyclical encoding)
D7 = 7 # ε-greedy v1 feature dimension (adds day-of-week cyclical encoding)
D12 = 12 # ε-greedy v2 feature dimension (adds 5 profile features — ADR-0012)
EPSILON = 0.1 # ε-greedy exploration rate
FEATURE_HISTORY_SIZE = 100 # per-user ring buffer
@@ -126,6 +132,98 @@ def save_state7(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
p.write_text(json.dumps({"A": A.tolist(), "b": b.tolist(), "meta": meta}))
# ── ε-greedy v2 state helpers (d=12, profile features — ADR-0012) ─────────
#
# Normalization choices (see ADR-0012):
# completion_rate_30d — already in [0, 1], passthrough; null → 0
# dismiss_rate_30d — already in [0, 1], passthrough; null → 0
# mean_dwell_ms_30d — clipped to [0, 600_000 ms] (10 min), then /600_000
# preferred_hour — circular alignment with context hour:
# (cos(2π·(now - pref)/24) + 1) / 2 → [0, 1]
# captures "is the user's habitual peak near now?"
# null → 0.5 (neutral)
# tip_volume_30d — log1p(n) / log1p(100), clipped to [0, 1]
# Dwell times are divided by this value (ten minutes, in milliseconds) and
# the quotient is clamped to [0, 1] by _norm_dwell.
_DWELL_CLIP_MS = float(10 * 60 * 1000)
# Denominator for tip-volume scaling: a volume of 100 or more maps to 1.0.
_VOLUME_LOG_MAX = math.log1p(100.0)
def _profile_value(profile: Optional[dict], key: str) -> Optional[float]:
    """Return ``profile[key]`` as a finite float, or ``None`` when unusable.

    Args:
        profile: Raw profile-feature mapping supplied by the caller; may be
            ``None`` or empty.
        key: Name of the profile feature to read.

    Returns:
        The value converted with ``float()``, or ``None`` when the profile is
        missing/empty, the key is absent or null, the value cannot be
        converted, or the converted value is NaN or infinite.
    """
    if not profile:
        return None
    raw = profile.get(key)
    if raw is None:
        return None
    try:
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an integer too large to represent as a float.
        return None
    # NaN/inf must not reach the normalizers: NaN would flow through
    # math.cos into the feature vector (and from there into the persisted
    # ridge state), and math.cos(inf) raises ValueError.
    if not math.isfinite(value):
        return None
    return value
def _norm_rate(v: Optional[float]) -> float:
    """Clamp a rate to [0, 1]; a missing rate (``None``) counts as 0.0."""
    if v is None:
        return 0.0
    capped = min(1.0, v)
    return max(0.0, capped)
def _norm_dwell(v: Optional[float]) -> float:
    """Scale a dwell time by ``_DWELL_CLIP_MS`` and clamp the result to [0, 1].

    A missing dwell time (``None``) maps to 0.0.
    """
    if v is None:
        return 0.0
    fraction = v / _DWELL_CLIP_MS
    return max(0.0, min(1.0, fraction))
def _norm_volume(v: Optional[float]) -> float:
    """Log-scale a tip count: ``log1p(v) / _VOLUME_LOG_MAX``, capped at 1.0.

    Missing (``None``), zero, and negative volumes all map to 0.0.
    """
    if v is None:
        return 0.0
    if v <= 0:
        return 0.0
    ratio = math.log1p(float(v)) / _VOLUME_LOG_MAX
    return min(1.0, ratio)
def _norm_preferred_hour(pref: Optional[float], now_hour: int) -> float:
    """Score how close ``now_hour`` is to the preferred hour on a 24h circle.

    Returns ``(cos(angle) + 1) / 2`` where ``angle`` is the hour difference
    expressed in radians (24 hours = 2π): 1.0 when the hours coincide, 0.0
    when they are twelve hours apart. A missing preference yields the
    neutral value 0.5.
    """
    if pref is None:
        # No established peak yet: neither aligned nor misaligned.
        return 0.5
    radians_per_hour = 2.0 * math.pi / 24.0
    angle = (float(pref) - float(now_hour)) * radians_per_hour
    return (math.cos(angle) + 1.0) / 2.0
def build_feature_vector_12(
    features: dict,
    day_of_week: int = 0,
    profile: Optional[dict] = None,
) -> np.ndarray:
    """Build the d=12 ε-greedy v2 feature vector (ADR-0012).

    The result is the output of ``build_feature_vector_7`` followed by five
    normalized profile dimensions, in this order: completion rate, dismiss
    rate, mean dwell, preferred-hour alignment, tip volume.

    Args:
        features: Candidate/context features; ``hour_of_day`` (default 12)
            is used for the preferred-hour alignment.
        day_of_week: Forwarded to ``build_feature_vector_7``.
        profile: Optional raw profile features; missing values fall back to
            each normalizer's default.

    Returns:
        The base vector concatenated with the five float64 profile dims.
    """
    head = build_feature_vector_7(features, day_of_week)
    hour_now = int(features.get("hour_of_day", 12))

    completion = _norm_rate(_profile_value(profile, "completion_rate_30d"))
    dismissal = _norm_rate(_profile_value(profile, "dismiss_rate_30d"))
    dwell = _norm_dwell(_profile_value(profile, "mean_dwell_ms_30d"))
    alignment = _norm_preferred_hour(
        _profile_value(profile, "preferred_hour"), hour_now
    )
    volume = _norm_volume(_profile_value(profile, "tip_volume_30d"))

    tail = np.array(
        [completion, dismissal, dwell, alignment, volume],
        dtype=np.float64,
    )
    return np.concatenate([head, tail])
def state12_path(user_id: str) -> Path:
    """Return the ε-greedy v2 state file path for ``user_id``.

    Every non-alphanumeric character of the id is replaced with ``_`` before
    it is used as the file-name stem under ``STATE_DIR``.
    """
    sanitized = [ch if ch.isalnum() else "_" for ch in user_id]
    return STATE_DIR / ("".join(sanitized) + "_egreedy_v2.json")
def load_state12(user_id: str) -> tuple[np.ndarray, np.ndarray, dict]:
    """Load the ε-greedy v2 ridge state ``(A, b, meta)`` for ``user_id``.

    Returns:
        ``A`` and ``b`` as float64 arrays plus the ``meta`` dict read from
        the user's state file. When no state file exists, a fresh state is
        returned: ``A`` = D12×D12 identity, ``b`` = zeros(D12), ``meta`` = {}.

    Raises:
        json.JSONDecodeError / KeyError: if the state file exists but is
            malformed. This is deliberately not swallowed, so a bad read can
            never be mistaken for "no state" and overwritten by a later save.
    """
    path = state12_path(user_id)
    try:
        # Read directly instead of exists()+read: the file can be unlinked
        # (e.g. by /reset) between the two calls.
        text = path.read_text()
    except FileNotFoundError:
        return (
            np.identity(D12, dtype=np.float64),
            np.zeros(D12, dtype=np.float64),
            {},
        )

    raw = json.loads(text)
    A = np.array(raw["A"], dtype=np.float64)
    b = np.array(raw["b"], dtype=np.float64)
    # `or {}` also covers an explicit JSON null, which callers would
    # otherwise try to index as a dict.
    return A, b, raw.get("meta") or {}
def save_state12(user_id: str, A: np.ndarray, b: np.ndarray, meta: dict) -> None:
    """Persist the ε-greedy v2 ridge state for ``user_id`` as JSON.

    The payload is written to a uniquely named temporary file in the same
    directory and then moved over the real state file. A reader therefore
    sees either the previous complete file or the new complete file, never a
    truncated one, and an interrupted write leaves the old state intact.

    Args:
        user_id: User whose state file (see ``state12_path``) is written.
        A: Ridge design matrix; stored via ``tolist()``.
        b: Ridge response vector; stored via ``tolist()``.
        meta: JSON-serializable bookkeeping dict.
    """
    import uuid

    target = state12_path(user_id)
    payload = json.dumps({"A": A.tolist(), "b": b.tolist(), "meta": meta})
    # Same directory as the target so the final rename stays on one
    # filesystem; a random suffix keeps concurrent writers apart.
    scratch = target.with_name(f".{target.name}.{uuid.uuid4().hex}.tmp")
    try:
        scratch.write_text(payload)
        scratch.replace(target)
    except BaseException:
        # Do not leave stray temp files behind on failure.
        scratch.unlink(missing_ok=True)
        raise
# ── API models ─────────────────────────────────────────────────────────────
class CandidateFeatures(BaseModel):
@@ -171,6 +269,10 @@ class RewardRequest(BaseModel):
reward: float # +1 done, +0.5 helpful, 0 snooze, -0.5 not_helpful, -1 dismiss
features: CandidateFeatures
day_of_week: int = 0 # included so egreedy can train dow features correctly
# Profile features at the time the tip was served. Ignored by /reward and
# /reward/egreedy; consumed by /reward/egreedy/v2 so the ridge update uses
# the same feature vector as the matching /score/egreedy/v2 call.
profile_features: Optional[dict] = None
class RewardResponse(BaseModel):
@@ -472,6 +574,128 @@ def reward_egreedy(req: RewardRequest) -> RewardResponse:
return RewardResponse(ok=True)
@app.post("/score/egreedy/v2", response_model=ScoreResponse)
def score_egreedy_v2(req: ScoreRequest) -> ScoreResponse:
    """Pick a tip with the ε-greedy v2 policy (d=12, ADR-0012).

    With probability ``EPSILON`` a candidate is drawn uniformly at random;
    otherwise the candidate with the highest ``theta @ x`` wins, where
    ``theta = A⁻¹ b`` comes from the user's stored state and ``x`` is the
    12-dim vector built from the candidate, the request context and
    ``req.profile_features``. The decision is appended to the user's feature
    history, and the pull/exploration counters in ``meta`` are saved.

    Raises:
        HTTPException: 422 when the request carries no candidates.
    """
    if not req.candidates:
        raise HTTPException(status_code=422, detail="No candidates")

    A, b, meta = load_state12(req.user_id)
    try:
        A_inv = np.linalg.inv(A)
    except np.linalg.LinAlgError:
        # Singular design matrix: fall back to the identity.
        A_inv = np.identity(D12, dtype=np.float64)
    theta = A_inv @ b

    dow = req.context.day_of_week
    hour = req.context.hour_of_day

    def _evaluate(cand) -> tuple[dict, float]:
        """Return (raw feature dict, linear score) for one candidate."""
        raw = {
            "hour_of_day": hour,
            "is_overdue": cand.features.is_overdue,
            "task_age_days": cand.features.task_age_days,
            "priority": cand.features.priority,
        }
        vec = build_feature_vector_12(raw, dow, req.profile_features)
        return raw, float(theta @ vec)

    exploring = np.random.random() < EPSILON
    if exploring:
        pick = req.candidates[np.random.randint(len(req.candidates))]
        feat_dict, best_score = _evaluate(pick)
        best_id = pick.id
    else:
        best_id, best_score, feat_dict = None, -float("inf"), {}
        for cand in req.candidates:
            raw, value = _evaluate(cand)
            if value > best_score:
                best_id, best_score, feat_dict = cand.id, value, raw

    get_feature_history(req.user_id).append({
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "features": {**feat_dict, "day_of_week": dow, "exploring": exploring},
        "score": best_score,
        "tip_id": best_id,
        "policy": "egreedy-v2",
    })

    meta["pulls"] = meta.get("pulls", 0) + 1
    meta["explore_count"] = meta.get("explore_count", 0) + int(exploring)
    meta["last_updated"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    save_state12(req.user_id, A, b, meta)

    return ScoreResponse(tip_id=best_id, score=best_score, policy="egreedy-v2")
@app.post("/reward/egreedy/v2", response_model=RewardResponse)
def reward_egreedy_v2(req: RewardRequest) -> RewardResponse:
    """Apply one reward to the ε-greedy v2 ridge estimator (d=12).

    Rebuilds the 12-dim vector ``x`` from the request's candidate features,
    ``day_of_week`` and ``profile_features``, then performs the rank-one
    update ``A += x xᵀ`` and ``b += reward · x``, bumps the reward counters
    in ``meta`` and saves the state.
    """
    A, b, meta = load_state12(req.user_id)

    served = req.features
    x = build_feature_vector_12(
        {
            "hour_of_day": served.hour_of_day,
            "is_overdue": served.is_overdue,
            "task_age_days": served.task_age_days,
            "priority": served.priority,
        },
        req.day_of_week,
        req.profile_features,
    )

    A += np.outer(x, x)
    b += req.reward * x

    meta.update(
        cumulative_reward=meta.get("cumulative_reward", 0.0) + req.reward,
        reward_count=meta.get("reward_count", 0) + 1,
        last_updated=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    )
    save_state12(req.user_id, A, b, meta)
    return RewardResponse(ok=True)
@app.get("/stats/egreedy/v2/{user_id}")
def stats_egreedy_v2(user_id: str):
    """Report ε-greedy v2 stats for one user.

    Returns a dict with the pull and reward counters, the mean reward per
    rewarded pull, the observed exploration rate, the current ``theta``
    (``A⁻¹ b``, or all zeros when ``A`` is singular) and the label of each
    of the 12 feature dimensions.
    """
    A, b, meta = load_state12(user_id)
    try:
        weights = np.linalg.inv(A) @ b
    except np.linalg.LinAlgError:
        theta = [0.0] * D12
    else:
        theta = weights.tolist()

    n_pulls = meta.get("pulls", 0)
    total_reward = meta.get("cumulative_reward", 0.0)
    n_rewards = meta.get("reward_count", 0)
    n_explored = meta.get("explore_count", 0)

    # Guard both ratios against division by zero for users with no activity.
    mean_reward = total_reward / n_rewards if n_rewards > 0 else 0.0
    explore_rate = n_explored / n_pulls if n_pulls > 0 else 0.0

    labels = [
        "hour_sin", "hour_cos", "is_overdue", "task_age", "priority",
        "dow_sin", "dow_cos",
        "completion_rate_30d", "dismiss_rate_30d", "mean_dwell_norm",
        "preferred_hour_alignment", "tip_volume_norm",
    ]
    return {
        "user_id": user_id,
        "policy": "egreedy-v2",
        "pulls": n_pulls,
        "reward_count": n_rewards,
        "cumulative_reward": total_reward,
        "estimated_mean_reward": mean_reward,
        "exploration_rate": explore_rate,
        "theta": theta,
        "feature_labels": labels,
        "last_updated": meta.get("last_updated"),
    }
@app.get("/stats/egreedy/{user_id}")
def stats_egreedy(user_id: str):
"""ε-greedy policy stats — pulls, cumulative reward, θ vector."""
@@ -509,6 +733,9 @@ def reset(user_id: str) -> RewardResponse:
p7 = state7_path(user_id)
if p7.exists():
p7.unlink()
p12 = state12_path(user_id)
if p12.exists():
p12.unlink()
if user_id in _feature_history:
_feature_history[user_id].clear()
return RewardResponse(ok=True)