feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework

- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy replaces linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward): dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id} with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges, two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:44:37 +00:00
parent c5ea18ec6e
commit faf44c18fc
48 changed files with 6151 additions and 40 deletions
--- a/services/api/src/routes/recommender.ts
+++ b/services/api/src/routes/recommender.ts
@@ -2,7 +2,7 @@ import { type Router as ExpressRouter, Router, Response } from 'express';
 import { nanoid } from 'nanoid';
 import { db } from '../db/index.js';
 import { integrationTokens, tipFeedback, tipViews, tipScores } from '../db/schema.js';
-import { eq, and } from 'drizzle-orm';
+import { eq, and, desc } from 'drizzle-orm';
 import { requireAuth, AuthenticatedRequest } from '../middleware/session.js';
 import { config } from '../config.js';
 import { bus } from '../events/bus.js';
@@ -105,7 +105,7 @@ async function fetchTodoistTasks(userId: string, accessToken: string): Promise<C
 async function remotePolicy(
  userId: string,
  tasks: CachedTask[],
-): Promise<{ tipId: string; score: number } | null> {
+): Promise<{ tipId: string; score: number; policy: string } | null> {
  const hour = new Date().getHours();
  const dayOfWeek = new Date().getDay();

@@ -121,8 +121,9 @@ async function remotePolicy(
    context: { hour_of_day: hour, day_of_week: dayOfWeek },
  };

+  // Active policy: egreedy-v1 (selected over linucb-v1 after offline sim — ADR-0007)
  try {
-    const res = await fetch(`${config.ML_SERVING_URL}/score`, {
+    const res = await fetch(`${config.ML_SERVING_URL}/score/egreedy`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body),
@@ -130,7 +131,7 @@ async function remotePolicy(
    });
    if (!res.ok) return null;
    const data = (await res.json()) as { tip_id: string; score: number };
-    return { tipId: data.tip_id, score: data.score };
+    return { tipId: data.tip_id, score: data.score, policy: 'egreedy-v1' };
  } catch {
    return null;
  }
@@ -178,7 +179,7 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
    return;
  }

-  const policy = scored ? 'linucb-v1' : 'random';
+  const policy = scored ? scored.policy : 'random';
  const servedAt = new Date().toISOString();

  await db.insert(tipViews).values({ id: nanoid(), userId: req.userId!, tipId: tip.id, servedAt });
@@ -226,55 +227,85 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
  res.json({ tip });
 });

+// ---------------------------------------------------------------------------
+// Reward inference from action + dwell time
+// Feedback is now 3 signals only: done / snooze / dismiss.
+// "Helpfulness" is inferred from how long the user took to act on a tip:
+//   dismiss              → -1.0 (clear rejection)
+//   snooze               → +0.1 (tip noticed, timing off — mild positive)
+//   done, dwell unknown  → +0.5 (null, negative or NaN dwell — neutral positive)
+//   done < 15 s          → -0.3 (almost certainly a stale task, not magic)
+//   done 15 s – 2 min    → +1.0 (magic zone: user saw tip and acted)
+//   done 2 – 10 min      → +0.6 (good: user engaged, acted in same session)
+//   done ≥ 10 min        → +0.3 (eventually done; tip may have helped, unclear)
+// ---------------------------------------------------------------------------
+function inferReward(action: string, dwellMs: number | null): number {
+  if (action === 'dismiss') return -1.0;
+  if (action === 'snooze')  return 0.1;
+  // done — use dwell time; NaN (e.g. from an unparsable timestamp) fails every `<` test below
+  if (dwellMs === null || Number.isNaN(dwellMs) || dwellMs < 0) return 0.5; // unknown dwell
+  if (dwellMs < 15_000)   return -0.3; // stale / reflex
+  if (dwellMs < 120_000)  return 1.0;  // magic zone
+  if (dwellMs < 600_000)  return 0.6;  // good
+  return 0.3;                           // eventually
+}
+
 // ---------------------------------------------------------------------------
 // POST /api/tip/:id/feedback
 // ---------------------------------------------------------------------------
 router.post('/tip/:id/feedback', requireAuth, async (req: AuthenticatedRequest, res: Response) => {
  const { action } = req.body as { action: string };
  const tipId = String(req.params.id);
+  const now = new Date();

-  const validActions = ['done', 'dismiss', 'snooze', 'helpful', 'not_helpful'];
+  const validActions = ['done', 'dismiss', 'snooze'];
  if (!validActions.includes(action)) {
    res.status(400).json({ error: 'Invalid action' });
    return;
  }

+  // Compute dwell time from the most recent tipViews record for this user+tip
+  let dwellMs: number | null = null;
+  const [lastView] = await db
+    .select({ servedAt: tipViews.servedAt })
+    .from(tipViews)
+    .where(and(eq(tipViews.userId, req.userId!), eq(tipViews.tipId, tipId)))
+    .orderBy(desc(tipViews.servedAt))
+    .limit(1);
+
+  if (lastView?.servedAt) {
+    dwellMs = now.getTime() - new Date(lastView.servedAt).getTime();
+  }
+
+  const reward = inferReward(action, dwellMs);
+
  await db.insert(tipFeedback).values({
    id: nanoid(),
    userId: req.userId!,
    tipId,
    action,
    sourceId: tipId.startsWith('todoist:') ? tipId.slice(8) : null,
-    createdAt: new Date().toISOString(),
+    dwellMs: dwellMs !== null ? Math.round(dwellMs) : null,
+    rewardMilli: Math.round(reward * 1000),
+    createdAt: now.toISOString(),
  });

-  // Map action to reward (helpful/not_helpful supplement behavioural signals)
-  const rewardMap: Record<string, number> = {
-    done: 1.0,
-    helpful: 0.5,
-    snooze: 0.0,
-    not_helpful: -0.5,
-    dismiss: -1.0,
-  };
-  const reward = rewardMap[action] ?? 0.0;
-
  const task = taskCache.get(req.userId!)?.tasks.find((t) => t.id === tipId);

-  // Clear cache on behavioural actions (not on explicit helpful/not_helpful)
-  if (['done', 'dismiss', 'snooze'].includes(action)) {
-    taskCache.delete(req.userId!);
-  }
+  taskCache.delete(req.userId!);

  bus.publish('signals.tip.feedback', {
    userId: req.userId!,
    tipId,
-    action: action as 'done' | 'dismiss' | 'snooze' | 'helpful' | 'not_helpful',
+    action: action as 'done' | 'dismiss' | 'snooze',
    reward,
-    createdAt: new Date().toISOString(),
+    dwellMs,
+    createdAt: now.toISOString(),
  });

  if (task) {
-    fetch(`${config.ML_SERVING_URL}/reward`, {
+    // Send reward to egreedy-v1 (active policy — ADR-0007)
+    fetch(`${config.ML_SERVING_URL}/reward/egreedy`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
@@ -282,6 +313,7 @@ router.post('/tip/:id/feedback', requireAuth, async (req: AuthenticatedRequest,
        tip_id: tipId,
        reward,
        features: task.features,
+        day_of_week: new Date().getDay(),
      }),
    }).catch(() => {});
  }