feat(admin): LLM tip quality dashboard — per-model/prompt/kind breakdowns
/admin/reward-analytics now surfaces served count, reaction rate, and avg
reward grouped by llm_model, prompt_version, and tip_kind — closing the
loop so model/prompt iterations in M2 are legible next to the bandit
policy view. Data comes from the tip_scores columns added in ffdf707 and
tip_feedback.reward_milli; bandit-only tips show as "(bandit-only)".
Closes #92.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -8,12 +8,12 @@ import { describe, it, expect, vi, beforeAll } from 'vitest';
|
||||
import express from 'express';
|
||||
import * as http from 'http';
|
||||
import { makeTestDb } from '../../test/db.js';
|
||||
import { users, integrationTokens, tipViews, tipFeedback } from '../../db/schema.js';
|
||||
import { users, integrationTokens, tipViews, tipFeedback, tipScores } from '../../db/schema.js';
|
||||
|
||||
// ---- in-memory DB ----
|
||||
const testDb = makeTestDb();
|
||||
|
||||
vi.mock('../../db/index.js', () => ({ db: testDb }));
|
||||
vi.mock('../../db/index.js', () => ({ db: testDb, rawSqlite: testDb.rawSqlite }));
|
||||
|
||||
// Bypass auth — all requests arrive pre-authenticated as 'admin-1'
|
||||
vi.mock('../../middleware/session.js', () => ({
|
||||
@@ -51,8 +51,20 @@ beforeAll(async () => {
|
||||
{ id: 'tv-3', userId: 'user-2', tipId: 'tip:c', servedAt: NOW },
|
||||
]);
|
||||
await testDb.insert(tipFeedback).values([
|
||||
{ id: 'tf-1', userId: 'user-1', tipId: 'tip:a', action: 'done', createdAt: DAY_AGO },
|
||||
{ id: 'tf-2', userId: 'user-1', tipId: 'tip:b', action: 'snooze', createdAt: NOW },
|
||||
{ id: 'tf-1', userId: 'user-1', tipId: 'tip:a', action: 'done', dwellMs: 60_000, rewardMilli: 1000, createdAt: DAY_AGO },
|
||||
{ id: 'tf-2', userId: 'user-1', tipId: 'tip:b', action: 'snooze', dwellMs: null, rewardMilli: 100, createdAt: NOW },
|
||||
]);
|
||||
// Seed tip_scores with two LLM models + two prompt_versions for #92.
|
||||
// tip:a (done, r=1.0) → qwen2.5 / v1 / task
|
||||
// tip:b (snooze, r=.1) → qwen2.5 / v2 / advice
|
||||
// tip:c (no feedback) → llama3 / v1 / task
|
||||
await testDb.insert(tipScores).values([
|
||||
{ id: 'ts-1', userId: 'user-1', tipId: 'tip:a', policy: 'egreedy', servedAt: DAY_AGO,
|
||||
llmModel: 'qwen2.5:7b', promptVersion: 'v1', tipKind: 'task' },
|
||||
{ id: 'ts-2', userId: 'user-1', tipId: 'tip:b', policy: 'egreedy', servedAt: NOW,
|
||||
llmModel: 'qwen2.5:7b', promptVersion: 'v2', tipKind: 'advice' },
|
||||
{ id: 'ts-3', userId: 'user-2', tipId: 'tip:c', policy: 'egreedy', servedAt: NOW,
|
||||
llmModel: 'llama3:3b', promptVersion: 'v1', tipKind: 'task' },
|
||||
]);
|
||||
});
|
||||
|
||||
@@ -354,6 +366,73 @@ describe('GET /api/admin/users/:id — edge cases', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Exercises the per-model / per-prompt-version / per-kind buckets (#92) that
// GET /api/admin/reward-analytics returns, against the rows seeded in beforeAll.
describe('GET /api/admin/reward-analytics — #92 quality breakdowns', () => {
  /** One aggregate bucket as read from the response body. */
  interface Row {
    key: string | null;
    served: number;
    done: number;
    snooze: number;
    dismiss: number;
    avgRewardMilli: number | null;
  }

  /** The subset of the response body these tests look at. */
  interface Body {
    byModel: Row[];
    byPromptVersion: Row[];
    byKind: Row[];
  }

  const ANALYTICS_URL = '/api/admin/reward-analytics?days=30';

  /** Looks up the bucket with the given key; a missing bucket surfaces as `undefined`. */
  const bucket = (rows: Row[], key: string): Row => rows.find((row) => row.key === key)!;

  /**
   * Starts a fresh server, issues the analytics request, and hands the parsed
   * body plus HTTP status to `check`. The server is closed afterwards whether
   * or not the assertions inside `check` throw.
   */
  const withAnalytics = async (check: (payload: Body, status: unknown) => void): Promise<void> => {
    const { server, call } = await startServer(buildApp());
    try {
      const response = await call('GET', ANALYTICS_URL);
      check(response.body as Body, response.status);
    } finally {
      server.close();
    }
  };

  it('groups tips by llm_model with reaction + reward aggregates', () =>
    withAnalytics((payload, status) => {
      expect(status).toBe(200);

      const qwen = bucket(payload.byModel, 'qwen2.5:7b');
      expect(qwen).toBeDefined();
      expect(qwen.served).toBe(2); // tip:a + tip:b
      expect(qwen.done).toBe(1);
      expect(qwen.snooze).toBe(1);
      // Mean of the two seeded reward_milli values: (1000 + 100) / 2 = 550.
      expect(qwen.avgRewardMilli).toBeCloseTo(550, 0);

      const llama = bucket(payload.byModel, 'llama3:3b');
      expect(llama.served).toBe(1);
      expect(llama.done).toBe(0);
      expect(llama.avgRewardMilli).toBeNull(); // served but never reacted to
    }));

  it('groups by prompt_version', () =>
    withAnalytics((payload) => {
      const v1 = bucket(payload.byPromptVersion, 'v1');
      expect(v1.served).toBe(2); // tip:a + tip:c
      expect(v1.done).toBe(1);

      const v2 = bucket(payload.byPromptVersion, 'v2');
      expect(v2.served).toBe(1);
      expect(v2.snooze).toBe(1);
    }));

  it('groups by tip_kind', () =>
    withAnalytics((payload) => {
      const task = bucket(payload.byKind, 'task');
      expect(task.served).toBe(2); // tip:a + tip:c

      const advice = bucket(payload.byKind, 'advice');
      expect(advice.served).toBe(1);
      expect(advice.snooze).toBe(1);
    }));
});
|
||||
|
||||
describe('GET /api/admin/stats — field types', () => {
|
||||
it('reactionsLast7d has correct action counts', async () => {
|
||||
const { server, call } = await startServer(buildApp());
|
||||
|
||||
@@ -375,10 +375,58 @@ router.get('/reward-analytics', async (req: AuthenticatedRequest, res: Response)
|
||||
.where(gte(tipFeedback.createdAt, since))
|
||||
.groupBy(tipFeedback.action);
|
||||
|
||||
// Quality breakdowns for LLM tips (#92). Each groups the tip_scores rows served
// in the window and left-joins tip_feedback, so a tip counts as served even when
// nobody reacted to it. avgRewardMilli is the mean inferred reward (×1000) among
// *reacted* tips: AVG() skips the NULL reward_milli of rows without feedback.
type QualityRow = {
  key: string | null;
  served: number;
  done: number;
  snooze: number;
  dismiss: number;
  helpful: number;
  not_helpful: number;
  avg_reward_milli: number | null;
};

// Runs the breakdown grouped by one tip_scores column and returns the raw
// snake_case rows. `column` is a hardcoded allowlist, so it is safe to
// interpolate; the window start (`since`) is passed as a bound parameter.
//
// Two guards against join fan-out:
//   * the join also matches on user_id, so one user's reaction is not credited
//     to every other user who was served the same tip_id;
//   * `served` counts distinct tip_scores rows, so a tip with several feedback
//     rows is still counted once.
// NOTE(review): assumes the columns behind `userId` / `id` in the schema are
// named `user_id` (both tables) and `id` (tip_scores) — confirm against db/schema.
const qualityRows = (column: 'llm_model' | 'prompt_version' | 'tip_kind'): QualityRow[] =>
  rawSqlite
    .prepare(`
      SELECT
        ts.${column} AS key,
        COUNT(DISTINCT ts.id) AS served,
        SUM(CASE WHEN tf.action = 'done' THEN 1 ELSE 0 END) AS done,
        SUM(CASE WHEN tf.action = 'snooze' THEN 1 ELSE 0 END) AS snooze,
        SUM(CASE WHEN tf.action = 'dismiss' THEN 1 ELSE 0 END) AS dismiss,
        SUM(CASE WHEN tf.action = 'helpful' THEN 1 ELSE 0 END) AS helpful,
        SUM(CASE WHEN tf.action = 'not_helpful' THEN 1 ELSE 0 END) AS not_helpful,
        AVG(tf.reward_milli) AS avg_reward_milli
      FROM tip_scores ts
      LEFT JOIN tip_feedback tf
        ON tf.tip_id = ts.tip_id
       AND tf.user_id = ts.user_id
      WHERE ts.served_at >= ?
      GROUP BY ts.${column}
    `)
    .all(since);
|
||||
|
||||
// Converts raw breakdown rows into the JSON shape sent to the client: every
// counter is coerced to a number (null/undefined become 0) and the snake_case
// avg_reward_milli is exposed as avgRewardMilli, staying null when there is
// no average to report.
const normalize = (rows: QualityRow[]) => {
  const toCount = (value: number | null | undefined): number => Number(value ?? 0);

  return rows.map(({ key, avg_reward_milli: avgReward, ...counts }) => ({
    key,
    served: toCount(counts.served),
    done: toCount(counts.done),
    snooze: toCount(counts.snooze),
    dismiss: toCount(counts.dismiss),
    helpful: toCount(counts.helpful),
    not_helpful: toCount(counts.not_helpful),
    avgRewardMilli: avgReward != null ? Number(avgReward) : null,
  }));
};
|
||||
|
||||
res.json({
|
||||
daily: dailyRows,
|
||||
byPolicy: policyRows,
|
||||
byHour: hourRows,
|
||||
byModel: normalize(qualityRows('llm_model')),
|
||||
byPromptVersion: normalize(qualityRows('prompt_version')),
|
||||
byKind: normalize(qualityRows('tip_kind')),
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -2,11 +2,13 @@
|
||||
* Creates an isolated in-memory SQLite DB with the full schema applied.
|
||||
* Use this in tests instead of the shared `db` singleton.
|
||||
*/
|
||||
import Database from 'better-sqlite3';
|
||||
import Database, { type Database as BetterSqlite3Database } from 'better-sqlite3';
|
||||
import { drizzle } from 'drizzle-orm/better-sqlite3';
|
||||
import * as schema from '../db/schema.js';
|
||||
|
||||
export function makeTestDb() {
|
||||
type DrizzleDb = ReturnType<typeof drizzle<typeof schema>>;
|
||||
|
||||
export function makeTestDb(): DrizzleDb & { rawSqlite: BetterSqlite3Database } {
|
||||
const sqlite = new Database(':memory:');
|
||||
sqlite.pragma('foreign_keys = ON');
|
||||
|
||||
@@ -138,7 +140,10 @@ export function makeTestDb() {
|
||||
);
|
||||
`);
|
||||
|
||||
return drizzle(sqlite, { schema });
|
||||
const db = drizzle(sqlite, { schema });
|
||||
// `sqlite` is exposed as `rawSqlite` so tests that mock `../db/index.js`
|
||||
// can provide the same `{ db, rawSqlite }` shape as the prod module.
|
||||
return Object.assign(db, { rawSqlite: sqlite });
|
||||
}
|
||||
|
||||
/** The value `makeTestDb()` returns: the drizzle handle with the underlying connection attached as `rawSqlite`. */
export type TestDb = ReturnType<typeof makeTestDb>;
|
||||
|
||||
Reference in New Issue
Block a user