feat(observability): structured logs, W3C trace IDs, Sentry hooks (#18)

- TS: pino + pino-http; every HTTP request log includes traceId from
  W3C traceparent header (generated if absent); forwarded to ml/serving
  on all /score, /generate, /reward, and /api/ml proxy calls
- Python: structlog JSON; FastAPI middleware binds trace_id via
  contextvars so every log line within a request carries it
- Sentry: optional SENTRY_DSN init in both runtimes (no-op if unset)
- Replace all console.* calls across services/api with pino logger
- Update tests to spy on logger instead of console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 03:37:28 +00:00
parent 7281af83a4
commit c4960d0601
18 changed files with 1041 additions and 64 deletions

View File

@@ -121,13 +121,14 @@ describe('connectNats — bridge bus → JetStream', () => {
it('swallows JetStream publish errors so the in-process bus keeps working', async () => {
const { connectNats } = await import('../nats.js');
const { logger } = await import('../../logger.js');
const { bus } = await import('../bus.js');
await connectNats('nats://test:4222');
// Force the next js.publish to reject.
lastJsPublish.mockRejectedValueOnce(new Error('jetstream down'));
const errSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
const errSpy = vi.spyOn(logger, 'error');
expect(() =>
bus.publish('signals.task.synced', { userId: 'u', source: 'todoist', count: 0, syncedAt: '' }),
@@ -142,12 +143,16 @@ describe('connectNats — bridge bus → JetStream', () => {
describe('connectNats — failure mode', () => {
it('logs a warning and stays silent when connect rejects', async () => {
const { connectNats } = await import('../nats.js');
const { logger } = await import('../../logger.js');
lastConnect.mockRejectedValueOnce(new Error('ECONNREFUSED'));
const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
const warnSpy = vi.spyOn(logger, 'warn');
await expect(connectNats('nats://nope:4222')).resolves.toBeUndefined();
expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('connection failed'));
expect(warnSpy).toHaveBeenCalledWith(
expect.objectContaining({ err: expect.anything() }),
expect.stringContaining('connection failed'),
);
});
});

View File

@@ -12,6 +12,7 @@
import type { NatsConnection, JetStreamClient, StreamConfig } from 'nats';
import { bus } from './bus.js';
import { logger } from '../logger.js';
let nc: NatsConnection | null = null;
let js: JetStreamClient | null = null;
@@ -67,13 +68,13 @@ export async function connectNats(natsUrl: string): Promise<void> {
if (!js) return;
const data = new TextEncoder().encode(JSON.stringify(payload));
js.publish(subject, data).catch((err: Error) =>
console.error(`[nats] publish failed for ${subject}: ${err.message}`),
logger.error({ err, subject }, 'nats publish failed'),
);
});
console.log(`[nats] connected to ${natsUrl}, streams: ${STREAMS.map((s) => s.name).join(', ')}`);
logger.info({ url: natsUrl, streams: STREAMS.map((s) => s.name) }, 'nats connected');
} catch (err: any) {
console.warn(`[nats] connection failed — running without JetStream: ${err.message}`);
logger.warn({ err }, 'nats connection failed — running without JetStream');
}
}

View File

@@ -1,7 +1,10 @@
import 'dotenv/config';
import { logger } from './logger.js';
import express from 'express';
import { pinoHttp } from 'pino-http';
import cookieParser from 'cookie-parser';
import cors from 'cors';
import { tracingMiddleware } from './middleware/tracing.js';
import { config } from './config.js';
import { db, runMigrations } from './db/index.js';
import { tipScores, tipFeedback } from './db/schema.js';
@@ -26,13 +29,11 @@ import { registerProfileSubscriptions } from './profile/subscriber.js';
await mkdir(dirname(config.DATABASE_PATH), { recursive: true });
runMigrations();
// Keep the API alive on stray async faults (e.g. a single bad admin route)
// rather than dropping the whole process.
process.on('unhandledRejection', (reason) => {
console.error('[api] unhandledRejection', reason);
logger.error({ err: reason }, 'unhandledRejection');
});
process.on('uncaughtException', (err) => {
console.error('[api] uncaughtException', err);
logger.fatal({ err }, 'uncaughtException');
});
const app = express();
@@ -43,6 +44,15 @@ app.use(
credentials: true,
}),
);
app.use(tracingMiddleware);
app.use(
pinoHttp({
logger,
genReqId: (req) => req.traceId,
customProps: (req) => ({ traceId: req.traceId }),
autoLogging: { ignore: (req) => req.url === '/health' },
}),
);
app.use(express.json());
app.use(cookieParser());
app.use(sessionMiddleware);
@@ -56,16 +66,13 @@ app.use('/api/user', userRouter);
app.use('/api/push', pushRouter);
app.use('/api/admin', adminRouter);
// Proxy ml/serving endpoints through the API (admin-only).
// Allows admin UI to call /api/ml/stats/:userId, /api/ml/features/:userId
// without needing direct access to the ml/serving port.
app.use('/api/ml', requireAuth as any, requireAdmin as any, async (req: Request, res: Response) => {
const mlUrl = config.ML_SERVING_URL;
const target = `${mlUrl}${req.path}`;
try {
const upstream = await fetch(target, {
method: req.method,
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', traceparent: req.traceparent },
body: req.method !== 'GET' ? JSON.stringify(req.body) : undefined,
signal: AbortSignal.timeout(5000),
});
@@ -82,7 +89,7 @@ async function purgeExpiredData() {
await db.delete(tipScores).where(lt(tipScores.servedAt, cutoff));
await db.delete(tipFeedback).where(lt(tipFeedback.createdAt, cutoff));
} catch (err: any) {
console.error(`[purge] retention cleanup failed: ${err.message}`);
logger.error({ err }, 'retention cleanup failed');
}
}
@@ -90,7 +97,7 @@ purgeExpiredData();
setInterval(purgeExpiredData, 24 * 60 * 60 * 1000);
app.listen(config.PORT, () => {
console.log(`oO API listening on http://localhost:${config.PORT}`);
logger.info({ port: config.PORT }, 'oO API listening');
});
if (config.NATS_URL) {

View File

@@ -0,0 +1,12 @@
import pino from 'pino';
import * as Sentry from '@sentry/node';
if (process.env['SENTRY_DSN']) {
Sentry.init({
dsn: process.env['SENTRY_DSN'],
environment: process.env['NODE_ENV'] ?? 'development',
});
}
export const logger = pino({ level: process.env['LOG_LEVEL'] ?? 'info' });
export { Sentry };

View File

@@ -0,0 +1,26 @@
import { randomBytes } from 'crypto';
import type { Request, Response, NextFunction } from 'express';
declare global {
namespace Express {
interface Request {
traceId: string;
traceparent: string;
}
}
}
export function tracingMiddleware(req: Request, _res: Response, next: NextFunction): void {
const incoming = req.headers['traceparent'] as string | undefined;
let traceId: string;
if (incoming) {
const parts = incoming.split('-');
traceId = parts.length === 4 && parts[1]?.length === 32 ? parts[1] : randomBytes(16).toString('hex');
} else {
traceId = randomBytes(16).toString('hex');
}
const parentId = randomBytes(8).toString('hex');
req.traceId = traceId;
req.traceparent = `00-${traceId}-${parentId}-01`;
next();
}

View File

@@ -1,4 +1,5 @@
import { type Router as ExpressRouter, Router, Response } from 'express';
import { logger } from '../logger.js';
import { db, rawSqlite } from '../db/index.js';
import {
users,
@@ -766,7 +767,7 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
// — e.g. in the alpine api container) would emit an unhandled 'error' event
// and crash the whole API process.
child.on('error', async (err) => {
console.error('[sim] spawn error', err);
logger.error({ err }, 'sim: spawn error');
_simProcesses.delete(id);
await db
.update(simRuns)

View File

@@ -5,6 +5,7 @@ import { db } from '../db/index.js';
import { users, sessions } from '../db/schema.js';
import { eq } from 'drizzle-orm';
import { config } from '../config.js';
import { logger } from '../logger.js';
const router: ExpressRouter = Router();
@@ -36,7 +37,7 @@ router.get('/login', async (req: Request, res: Response) => {
setTimeout(() => pendingStates.delete(state), 10 * 60 * 1000);
const redirectUri = `${config.API_BASE_URL}/api/auth/callback`;
console.log('[auth] redirect_uri sent to Google:', redirectUri);
logger.info({ redirectUri }, 'auth: redirect_uri');
const authUrl = client.buildAuthorizationUrl(cfg, {
redirect_uri: redirectUri,
scope: 'openid email profile',
@@ -72,7 +73,7 @@ router.get('/callback', async (req: Request, res: Response) => {
expectedState: state,
});
} catch (err) {
console.error('OAuth callback error', err);
logger.error({ err }, 'auth: OAuth callback error');
res.status(400).json({ error: 'OAuth error' });
return;
}

View File

@@ -1,5 +1,6 @@
import { type Router as ExpressRouter, Router, Response } from 'express';
import { nanoid } from 'nanoid';
import { logger } from '../logger.js';
import { db } from '../db/index.js';
import { integrationTokens, tipFeedback, tipViews, tipScores } from '../db/schema.js';
import { eq, and, desc } from 'drizzle-orm';
@@ -85,6 +86,7 @@ async function remotePolicy(
userId: string,
tasks: TipCandidate[],
profile: Profile,
traceparent?: string,
): Promise<{ tipId: string; score: number; policy: string } | null> {
const hour = new Date().getHours();
const dayOfWeek = new Date().getDay();
@@ -102,11 +104,10 @@ async function remotePolicy(
profile_features: profile,
};
// Active policy: egreedy-v2 (promoted from shadow after offline sim — ADR-0012)
try {
const res = await fetch(`${config.ML_SERVING_URL}/score/egreedy/v2`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', ...(traceparent ? { traceparent } : {}) },
body: JSON.stringify(body),
signal: AbortSignal.timeout(3000),
});
@@ -146,6 +147,7 @@ async function fetchLlmCandidates(
dayOfWeek: number,
promptVersion: string | null,
profile: Profile,
traceparent?: string,
): Promise<LlmGenerateResult> {
try {
const tasks = signals.slice(0, 10).map((s) => ({
@@ -156,7 +158,7 @@ async function fetchLlmCandidates(
}));
const res = await fetch(`${config.ML_SERVING_URL}/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', ...(traceparent ? { traceparent } : {}) },
body: JSON.stringify({
user_id: userId,
context: { tasks, hour_of_day: hour, day_of_week: dayOfWeek },
@@ -226,6 +228,7 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
dayOfWeek,
requestedPromptVersion,
profile,
req.traceparent,
);
const allCandidates: TipCandidate[] = [...signalCandidates, ...llmResult.candidates];
@@ -240,7 +243,7 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
const t0 = Date.now();
// Stage 2: score — egreedy bandit with random fallback
const scored = await remotePolicy(req.userId!, allCandidates, profile);
const scored = await remotePolicy(req.userId!, allCandidates, profile, req.traceparent);
const latencyMs = Date.now() - t0;
const tip = scored
? (allCandidates.find((t) => t.id === scored.tipId) ?? randomPolicy(allCandidates))
@@ -373,6 +376,7 @@ async function sendRewardWithRetry(
reward: number,
features: TipCandidate['features'],
profile: Profile,
traceparent?: string,
): Promise<void> {
const body = JSON.stringify({
user_id: userId,
@@ -387,7 +391,7 @@ async function sendRewardWithRetry(
try {
const res = await fetch(`${config.ML_SERVING_URL}/reward/egreedy/v2`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
headers: { 'Content-Type': 'application/json', ...(traceparent ? { traceparent } : {}) },
body,
signal: AbortSignal.timeout(3000),
});
@@ -395,7 +399,7 @@ async function sendRewardWithRetry(
throw new Error(`HTTP ${res.status}`);
} catch (err: any) {
if (attempt === 3) {
console.error(`[reward] failed after 3 attempts for tip ${tipId}: ${err.message}`);
logger.error({ tipId, err }, 'reward: failed after 3 attempts');
bus.publish('signals.tip.reward_failed', {
userId,
tipId,
@@ -468,7 +472,7 @@ router.post('/tip/:id/feedback', requireAuth, async (req: AuthenticatedRequest,
if (candidate) {
// Re-fetch profile for the v2 ridge update; TTL cache makes this near-instant.
const profile = await getProfile(req.userId!);
sendRewardWithRetry(req.userId!, tipId, reward, candidate.features, profile);
sendRewardWithRetry(req.userId!, tipId, reward, candidate.features, profile, req.traceparent);
}
// Delegate action to the owning signal source (e.g. mark done in Todoist)

View File

@@ -8,6 +8,11 @@
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
vi.mock('../../logger.js', () => ({
logger: { info: vi.fn(), warn: vi.fn(), error: vi.fn(), fatal: vi.fn() },
}));
import { logger } from '../../logger.js';
// ── mock the drizzle query chain: db.select(...).from(...).where(...) ────────
let users: { userId: string }[] = [];
const whereMock = vi.fn(async () => users);
@@ -35,6 +40,7 @@ beforeEach(() => {
whereMock.mockClear();
fromMock.mockClear();
selectMock.mockClear();
vi.clearAllMocks();
vi.useFakeTimers();
});
@@ -102,8 +108,6 @@ describe('startTodoistSyncScheduler', () => {
if (id === 'bad') throw new Error('todoist 401');
return [];
});
const errSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
const logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
startTodoistSyncScheduler(60_000);
await vi.advanceTimersByTimeAsync(10_001);
@@ -112,19 +116,27 @@ describe('startTodoistSyncScheduler', () => {
await Promise.resolve();
expect(fetchSignalsMock).toHaveBeenCalledTimes(3);
expect(errSpy).toHaveBeenCalledWith(expect.stringContaining('sync error'), expect.anything());
expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('2 ok, 1 failed'));
expect(logger.error).toHaveBeenCalledWith(
expect.objectContaining({ err: expect.anything() }),
'scheduler: sync error',
);
expect(logger.info).toHaveBeenCalledWith(
expect.objectContaining({ ok: 2, failed: 1 }),
'scheduler: todoist sync',
);
});
it('survives a db query failure — logs and skips the tick', async () => {
const { startTodoistSyncScheduler } = await import('../scheduler.js');
whereMock.mockRejectedValueOnce(new Error('sqlite locked'));
const errSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
startTodoistSyncScheduler(60_000);
await vi.advanceTimersByTimeAsync(10_001);
expect(fetchSignalsMock).not.toHaveBeenCalled();
expect(errSpy).toHaveBeenCalledWith(expect.stringContaining('failed to query users'));
expect(logger.error).toHaveBeenCalledWith(
expect.objectContaining({ err: expect.anything() }),
'scheduler: failed to query users',
);
});
});

View File

@@ -1,4 +1,5 @@
import type { Signal, SignalSource } from '@oo/shared-types';
import { logger } from '../logger.js';
/**
* Merges signals from all registered sources for a user.
@@ -24,7 +25,7 @@ export class SignalAggregator {
if (r.status === 'fulfilled') {
signals.push(...r.value);
} else {
console.error(`[aggregator] source '${this.sources[i].id}' failed:`, r.reason);
logger.error({ sourceId: this.sources[i]!.id, err: r.reason }, 'aggregator: source failed');
}
}
return signals;

View File

@@ -13,6 +13,7 @@ import { db } from '../db/index.js';
import { integrationTokens } from '../db/schema.js';
import { eq } from 'drizzle-orm';
import { todoistSource } from './todoist.js';
import { logger } from '../logger.js';
const DEFAULT_INTERVAL_MS = 15 * 60 * 1000;
@@ -25,7 +26,7 @@ export function startTodoistSyncScheduler(intervalMs = DEFAULT_INTERVAL_MS): Nod
.from(integrationTokens)
.where(eq(integrationTokens.tokenStatus, 'active'));
} catch (err: any) {
console.error(`[scheduler] failed to query users: ${err.message}`);
logger.error({ err }, 'scheduler: failed to query users');
return;
}
@@ -39,10 +40,10 @@ export function startTodoistSyncScheduler(intervalMs = DEFAULT_INTERVAL_MS): Nod
let failed = 0;
for (const r of results) {
if (r.status === 'fulfilled') ok++;
else { failed++; console.error(`[scheduler] sync error:`, r.reason); }
else { failed++; logger.error({ err: r.reason }, 'scheduler: sync error'); }
}
console.log(`[scheduler] todoist sync: ${ok} ok, ${failed} failed (${users.length} users)`);
logger.info({ ok, failed, total: users.length }, 'scheduler: todoist sync');
}
// Run once shortly after startup, then on interval

View File

@@ -3,6 +3,7 @@ import { db } from '../db/index.js';
import { integrationTokens } from '../db/schema.js';
import { eq, and } from 'drizzle-orm';
import { bus } from '../events/bus.js';
import { logger } from '../logger.js';
const CACHE_TTL_MS = 30_000;
@@ -46,7 +47,7 @@ export class TodoistSignalSource implements SignalSource {
if (!res.ok) {
if (res.status === 401) {
console.error(`[todoist] token expired for user ${userId}`);
logger.warn({ userId }, 'todoist: token expired');
bus.publish('signals.integration.token_expired', {
userId,
provider: 'todoist',