feat(admin): LLM tip quality dashboard — per-model/prompt/kind breakdowns

/admin/reward-analytics now surfaces served count, reaction rate, and avg reward grouped by llm_model, prompt_version, and tip_kind — closing the loop so model/prompt iterations in M2 are legible next to the bandit policy view. Data comes from the tip_scores columns added in ffdf707 and tip_feedback.reward_milli; bandit-only tips show as "(bandit-only)".

Closes #92.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-24 15:24:52 +00:00
parent 75d0e89906
commit aa4bdd8f09
7 changed files with 227 additions and 9 deletions
--- a/services/api/src/routes/tests/admin.test.ts
+++ b/services/api/src/routes/tests/admin.test.ts
@@ -8,12 +8,12 @@ import { describe, it, expect, vi, beforeAll } from 'vitest';
 import express from 'express';
 import * as http from 'http';
 import { makeTestDb } from '../../test/db.js';
-import { users, integrationTokens, tipViews, tipFeedback } from '../../db/schema.js';
+import { users, integrationTokens, tipViews, tipFeedback, tipScores } from '../../db/schema.js';

 // ---- in-memory DB ----
 const testDb = makeTestDb();

-vi.mock('../../db/index.js', () => ({ db: testDb }));
+vi.mock('../../db/index.js', () => ({ db: testDb, rawSqlite: testDb.rawSqlite }));

 // Bypass auth — all requests arrive pre-authenticated as 'admin-1'
 vi.mock('../../middleware/session.js', () => ({
@@ -51,8 +51,20 @@ beforeAll(async () => {
    { id: 'tv-3', userId: 'user-2', tipId: 'tip:c', servedAt: NOW },
  ]);
  await testDb.insert(tipFeedback).values([
-    { id: 'tf-1', userId: 'user-1', tipId: 'tip:a', action: 'done', createdAt: DAY_AGO },
-    { id: 'tf-2', userId: 'user-1', tipId: 'tip:b', action: 'snooze', createdAt: NOW },
+    { id: 'tf-1', userId: 'user-1', tipId: 'tip:a', action: 'done',   dwellMs:  60_000, rewardMilli: 1000, createdAt: DAY_AGO },
+    { id: 'tf-2', userId: 'user-1', tipId: 'tip:b', action: 'snooze', dwellMs: null,    rewardMilli:  100, createdAt: NOW     },
+  ]);
+  // Seed tip_scores with two LLM models + two prompt_versions for #92.
+  // tip:a (done, r=1.0)  → qwen2.5 / v1 / task
+  // tip:b (snooze, r=.1) → qwen2.5 / v2 / advice
+  // tip:c (no feedback)  → llama3  / v1 / task
+  await testDb.insert(tipScores).values([
+    { id: 'ts-1', userId: 'user-1', tipId: 'tip:a', policy: 'egreedy', servedAt: DAY_AGO,
+      llmModel: 'qwen2.5:7b', promptVersion: 'v1', tipKind: 'task' },
+    { id: 'ts-2', userId: 'user-1', tipId: 'tip:b', policy: 'egreedy', servedAt: NOW,
+      llmModel: 'qwen2.5:7b', promptVersion: 'v2', tipKind: 'advice' },
+    { id: 'ts-3', userId: 'user-2', tipId: 'tip:c', policy: 'egreedy', servedAt: NOW,
+      llmModel: 'llama3:3b',  promptVersion: 'v1', tipKind: 'task' },
  ]);
 });

@@ -354,6 +366,73 @@ describe('GET /api/admin/users/:id — edge cases', () => {
  });
 });

+describe('GET /api/admin/reward-analytics — #92 quality breakdowns', () => {
+  type Row = {  // one aggregate row of a breakdown, as read by the assertions below
+    key: string | null;  // grouping value matched via r.key (model name, prompt version, or tip kind)
+    served: number;
+    done: number;
+    snooze: number;
+    dismiss: number;
+    avgRewardMilli: number | null;  // null is asserted below for a group with no feedback (llama3:3b)
+  };
+  type Body = { byModel: Row[]; byPromptVersion: Row[]; byKind: Row[] };  // the three breakdown arrays this suite inspects
+
+  it('groups tips by llm_model with reaction + reward aggregates', async () => {
+    const { server, call } = await startServer(buildApp());
+    try {
+      const { status, body } = await call('GET', '/api/admin/reward-analytics?days=30');
+      expect(status).toBe(200);
+      const b = body as Body;
+
+      const qwen = b.byModel.find((r) => r.key === 'qwen2.5:7b')!;
+      expect(qwen).toBeDefined();  // the `!` above only silences the type checker; this gives a clear failure if the row is missing
+      expect(qwen.served).toBe(2);        // tip:a + tip:b
+      expect(qwen.done).toBe(1);  // tip:a
+      expect(qwen.snooze).toBe(1);  // tip:b
+      // avg of reward_milli: (1000 + 100) / 2 = 550
+      expect(qwen.avgRewardMilli).toBeCloseTo(550, 0);
+
+      const llama = b.byModel.find((r) => r.key === 'llama3:3b')!;
+      expect(llama.served).toBe(1);  // tip:c only
+      expect(llama.done).toBe(0);  // tip:c has no seeded feedback row
+      expect(llama.avgRewardMilli).toBeNull();  // no reaction → no reward
+    } finally {
+      server.close();  // always release the per-test HTTP server, even when an assertion throws
+    }
+  });
+
+  it('groups by prompt_version', async () => {
+    const { server, call } = await startServer(buildApp());
+    try {
+      const { body } = await call('GET', '/api/admin/reward-analytics?days=30');
+      const b = body as Body;
+      const v1 = b.byPromptVersion.find((r) => r.key === 'v1')!;
+      expect(v1.served).toBe(2);  // tip:a + tip:c
+      expect(v1.done).toBe(1);  // tip:a
+      const v2 = b.byPromptVersion.find((r) => r.key === 'v2')!;
+      expect(v2.served).toBe(1);  // tip:b only
+      expect(v2.snooze).toBe(1);  // tip:b
+    } finally {
+      server.close();
+    }
+  });
+
+  it('groups by tip_kind', async () => {
+    const { server, call } = await startServer(buildApp());
+    try {
+      const { body } = await call('GET', '/api/admin/reward-analytics?days=30');
+      const b = body as Body;
+      const task = b.byKind.find((r) => r.key === 'task')!;
+      expect(task.served).toBe(2);  // tip:a + tip:c
+      const advice = b.byKind.find((r) => r.key === 'advice')!;
+      expect(advice.served).toBe(1);  // tip:b only
+      expect(advice.snooze).toBe(1);  // tip:b
+    } finally {
+      server.close();
+    }
+  });
+});
+
 describe('GET /api/admin/stats — field types', () => {
  it('reactionsLast7d has correct action counts', async () => {
    const { server, call } = await startServer(buildApp());