feat(ml): prompt registry + per-request variant selection

Replaces the hardcoded "v1" label with a real prompt registry:

- ml/serving/prompts.py — keyed by version: v1 (baseline), v2-mentor (calm/specific persona), v3-few-shot (v1 persona + curated examples)
- ml/serving/main.py — POST /generate accepts optional prompt_version, returns 422 on an unknown version, and echoes the version actually used back in the response
- services/api/src/config.ts — TIP_PROMPT_VERSION: empty / single / comma-list (uniform random per request)
- services/api/src/routes/recommender.ts — pickPromptVersion() drives selection; the response's prompt_version (not a stale TS constant) is what lands in tip_scores, so the #92 reward-analytics dashboard shows real per-variant reaction rates

Closes #84.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-24 15:44:04 +00:00
parent aa4bdd8f09
commit 430804e9a5
9 changed files with 294 additions and 44 deletions
--- a/services/api/src/config.ts
+++ b/services/api/src/config.ts
@@ -43,4 +43,12 @@ export const config = {

  /** How often to proactively sync Todoist tasks in the background (ms) */
  TODOIST_SYNC_INTERVAL_MS: parseInt(optional('TODOIST_SYNC_INTERVAL_MS', String(15 * 60 * 1000)), 10),
+
+  /**
+   * Tip prompt version selection. Single value (e.g. "v2-mentor") pins one
+   * variant; comma-separated list (e.g. "v1,v2-mentor,v3-few-shot") is sampled
+   * uniformly at random per request so #92's reward-analytics dashboard
+   * accumulates comparable buckets. Empty → ml/serving's own default ("v1").
+   */
+  TIP_PROMPT_VERSION: optional('TIP_PROMPT_VERSION', ''),
 };
--- a/services/api/src/routes/tests/recommender.test.ts
+++ b/services/api/src/routes/tests/recommender.test.ts
@@ -134,6 +134,7 @@ describe('POST /recommend integration', () => {
          json: async () => ({
            candidates: [{ id: 'adv-1', content: 'Take a break.', rationale: 'You deserve it.' }],
            model: 'tip-generator',
+            prompt_version: 'v1',
          }),
        } as any);
      }
--- a/services/api/src/routes/tests/recommender.unit.test.ts
+++ b/services/api/src/routes/tests/recommender.unit.test.ts
@@ -2,8 +2,9 @@
 * Pure-function unit tests for recommender logic — no DB, no HTTP.
 * These can import directly from the module without any mocking.
 */
-import { describe, it, expect } from 'vitest';
-import { inferReward, dueAgeDays } from '../recommender.js';
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import { inferReward, dueAgeDays, pickPromptVersion } from '../recommender.js';
+import { config } from '../../config.js';

 describe('inferReward', () => {
  it('dismiss → -1', () => expect(inferReward('dismiss', null)).toBe(-1.0));
@@ -37,3 +38,45 @@ describe('dueAgeDays', () => {
    expect(dueAgeDays({ date: yesterday })).toBeGreaterThan(0);
  });
 });
+
+describe('pickPromptVersion', () => {
+  // Save + restore the original env-driven config field across tests.
+  let original: string;
+  beforeEach(() => { original = config.TIP_PROMPT_VERSION; });
+  afterEach(() => { (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = original; });
+
+  it('empty config → null (let ml/serving pick its default)', () => {
+    (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = '';
+    expect(pickPromptVersion()).toBeNull();
+  });
+
+  it('whitespace-only config → null', () => {
+    (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = '  ';
+    expect(pickPromptVersion()).toBeNull();
+  });
+
+  it('single value → that value', () => {
+    (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = 'v2-mentor';
+    expect(pickPromptVersion()).toBe('v2-mentor');
+  });
+
+  it('comma-separated → uniformly samples from the set', () => {
+    (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = 'v1,v2-mentor,v3-few-shot';
+    const seen = new Set<string>();
+    // With 100 trials, the chance of missing at least one of 3 buckets is ≤ 3·(2/3)^100 ≈ 7e-18 — test is reliable.
+    for (let i = 0; i < 100; i++) {
+      const picked = pickPromptVersion();
+      expect(picked).not.toBeNull();
+      seen.add(picked!);
+    }
+    expect(seen).toEqual(new Set(['v1', 'v2-mentor', 'v3-few-shot']));
+  });
+
+  it('trims whitespace around comma-separated entries', () => {
+    (config as { TIP_PROMPT_VERSION: string }).TIP_PROMPT_VERSION = ' v1 , v2-mentor ';
+    for (let i = 0; i < 20; i++) {
+      const picked = pickPromptVersion()!;
+      expect(['v1', 'v2-mentor']).toContain(picked);
+    }
+  });
+});
--- a/services/api/src/routes/recommender.ts
+++ b/services/api/src/routes/recommender.ts
@@ -13,7 +13,19 @@ import { SignalAggregator } from '../signals/aggregator.js';

 const router: ExpressRouter = Router();

-const PROMPT_VERSION = 'v1';
+/**
+ * Pick a prompt version for this request. `config.TIP_PROMPT_VERSION` is either
+ * empty (returns null so ml/serving picks its default), a single version, or a
+ * comma-separated list sampled uniformly at random per request so the #92
+ * dashboard accumulates comparable buckets per variant. Exported for testing.
+ */
+export function pickPromptVersion(): string | null {
+  const raw = config.TIP_PROMPT_VERSION.trim();
+  if (!raw) return null;
+  const versions = raw.split(',').map((v) => v.trim()).filter(Boolean);
+  if (!versions.length) return null;
+  return versions[Math.floor(Math.random() * versions.length)] ?? null;
+}

 // ---------------------------------------------------------------------------
 // Signal aggregator — register sources here as new integrations are added
@@ -117,12 +129,19 @@ interface LlmCandidate {
  rationale?: string;
 }

+interface LlmGenerateResult {
+  candidates: TipCandidate[];
+  promptVersion: string | null;
+  model: string | null;
+}
+
 async function fetchLlmCandidates(
  userId: string,
  signals: Signal[],
  hour: number,
  dayOfWeek: number,
-): Promise<TipCandidate[]> {
+  promptVersion: string | null,
+): Promise<LlmGenerateResult> {
  try {
    const tasks = signals.slice(0, 10).map((s) => ({
      content: s.content,
@@ -137,13 +156,18 @@ async function fetchLlmCandidates(
        user_id: userId,
        context: { tasks, hour_of_day: hour, day_of_week: dayOfWeek },
        n: 3,
+        ...(promptVersion ? { prompt_version: promptVersion } : {}),
      }),
      signal: AbortSignal.timeout(15_000),
    });
-    if (!res.ok) return [];
-    const data = (await res.json()) as { candidates: LlmCandidate[]; model?: string };
+    if (!res.ok) return { candidates: [], promptVersion: null, model: null };
+    const data = (await res.json()) as {
+      candidates: LlmCandidate[];
+      model?: string;
+      prompt_version?: string;
+    };
    const now = new Date().toISOString();
-    return data.candidates.map((c) => ({
+    const candidates: TipCandidate[] = data.candidates.map((c) => ({
      id: `llm:${c.id}`,
      content: c.content,
      source: 'llm' as const,
@@ -152,8 +176,13 @@ async function fetchLlmCandidates(
      createdAt: now,
      features: { is_overdue: false, task_age_days: 0, priority: 1 },
    }));
+    return {
+      candidates,
+      promptVersion: data.prompt_version ?? null,
+      model: data.model ?? null,
+    };
  } catch {
-    return [];
+    return { candidates: [], promptVersion: null, model: null };
  }
 }

@@ -181,9 +210,16 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
  const signals = await aggregator.fetchAll(req.userId!);

  const signalCandidates = signals.map(signalToCandidate);
-  const llmCandidates = await fetchLlmCandidates(req.userId!, signals, hour, dayOfWeek);
+  const requestedPromptVersion = pickPromptVersion();
+  const llmResult = await fetchLlmCandidates(
+    req.userId!,
+    signals,
+    hour,
+    dayOfWeek,
+    requestedPromptVersion,
+  );

-  const allCandidates: TipCandidate[] = [...signalCandidates, ...llmCandidates];
+  const allCandidates: TipCandidate[] = [...signalCandidates, ...llmResult.candidates];
  if (!allCandidates.length) {
    res.status(204).end();
    return;
@@ -227,8 +263,10 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
    candidateCount: allCandidates.length,
    latencyMs,
    servedAt,
-    promptVersion: isLlmTip ? PROMPT_VERSION : null,
-    llmModel: isLlmTip ? 'tip-generator' : null,
+    // Trust the version/model the generator reports. If /generate omits them, fall back to the
+    // requested version (so the bucket isn't mislabeled) and to 'tip-generator' for the model.
+    promptVersion: isLlmTip ? (llmResult.promptVersion ?? requestedPromptVersion ?? null) : null,
+    llmModel: isLlmTip ? (llmResult.model ?? 'tip-generator') : null,
    tipKind: tip.kind ?? null,
  });