feat(observability): structured logs, W3C trace IDs, Sentry hooks (#18)

- TS: pino + pino-http; every HTTP request log includes traceId from
  W3C traceparent header (generated if absent); forwarded to ml/serving
  on all /score, /generate, /reward, and /api/ml proxy calls
- Python: structlog JSON; FastAPI middleware binds trace_id via
  contextvars so every log line within a request carries it
- Sentry: optional SENTRY_DSN init in both runtimes (no-op if unset)
- Replace all console.* calls across services/api with pino logger
- Update tests to spy on logger instead of console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-26 03:37:28 +00:00
parent 7281af83a4
commit c4960d0601
18 changed files with 1041 additions and 64 deletions

View File

@@ -0,0 +1,20 @@
"""Structlog JSON configuration — import once at process start."""
import logging
import structlog
def configure() -> None:
    """Configure structlog to emit one JSON object per log event.

    Processor chain, in order:

    1. merge request-scoped context variables (for example a bound
       ``trace_id``) into the event,
    2. add the log level,
    3. add an ISO-formatted timestamp,
    4. render stack info when a caller asks for it,
    5. render ``exc_info`` into a traceback string,
    6. serialise the event dict as JSON.

    Events below INFO are dropped by the filtering bound logger. Standard
    library logging is configured separately with a WARNING threshold.

    Call once at process start, before the first logger is used.
    """
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.stdlib.add_log_level,
            # structlog.stdlib.add_logger_name is deliberately NOT in this
            # chain: it reads ``logger.name``, and the PrintLogger created by
            # PrintLoggerFactory has no such attribute, so every log call
            # would fail with AttributeError.
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.StackInfoRenderer(),
            # Without this, ``exc_info=True`` / ``log.exception(...)`` would be
            # serialised as a bare ``true`` instead of the traceback text.
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
    )
    # Stdlib loggers (third-party libraries) only surface warnings and above.
    logging.basicConfig(level=logging.WARNING)

View File

@@ -34,12 +34,25 @@ from typing import Optional, Deque
import httpx
import numpy as np
from fastapi import FastAPI, HTTPException
import sentry_sdk
import structlog
import structlog.contextvars
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from starlette.middleware.base import BaseHTTPMiddleware
import logging_config
import nats_consumer
from prompts import get_prompt
# Install the structlog configuration before the first logger is requested.
logging_config.configure()

# Sentry is opt-in: when SENTRY_DSN is unset or empty the SDK is never
# initialised.
_SENTRY_DSN = os.environ.get("SENTRY_DSN")
if _SENTRY_DSN:
    sentry_sdk.init(
        dsn=_SENTRY_DSN,
        environment=os.environ.get("ENV", "development"),
    )

log = structlog.get_logger()
@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -50,6 +63,21 @@ async def lifespan(app: FastAPI):
app = FastAPI(title="oO ML Serving", version="1.0.0", lifespan=lifespan)
class _TracingMiddleware(BaseHTTPMiddleware):
    """Bind a trace ID to structlog's context variables for every request.

    The trace ID is taken from the incoming W3C ``traceparent`` header when
    that header carries a well-formed trace-id. Otherwise a fresh random ID
    is generated, so log lines emitted while handling the request always
    carry a ``trace_id`` field.
    """

    _HEX_DIGITS = frozenset("0123456789abcdef")
    _TRACE_ID_LEN = 32

    @classmethod
    def _trace_id_from(cls, traceparent: str) -> Optional[str]:
        """Return the trace-id field of *traceparent*, or ``None`` if invalid.

        Expects the four dash-separated fields
        ``version-traceid-parentid-flags``. The trace-id must be 32 hex
        digits and must not be all zeros (W3C Trace Context marks the
        all-zero ID as invalid). Upper-case hex is tolerated and normalised
        to lower case so that correlation with the caller is not lost.
        """
        fields = traceparent.split("-")
        if len(fields) != 4:
            return None
        candidate = fields[1].lower()
        if len(candidate) != cls._TRACE_ID_LEN:
            return None
        if not cls._HEX_DIGITS.issuperset(candidate):
            return None
        if candidate == "0" * cls._TRACE_ID_LEN:
            return None
        return candidate

    async def dispatch(self, request: Request, call_next):
        """Reset the logging context, bind ``trace_id``, then run the request."""
        # Drop anything a previous request may have left in this context.
        structlog.contextvars.clear_contextvars()
        trace_id = self._trace_id_from(request.headers.get("traceparent", ""))
        if trace_id is None:
            # Header absent or malformed: 16 random bytes give a 32-hex-digit ID.
            trace_id = os.urandom(16).hex()
        structlog.contextvars.bind_contextvars(trace_id=trace_id)
        return await call_next(request)
app.add_middleware(_TracingMiddleware)
LITELLM_URL = os.getenv("LITELLM_URL", "http://localhost:4000")
LITELLM_MASTER_KEY = os.getenv("LITELLM_MASTER_KEY", "sk-oo-dev")

View File

@@ -17,15 +17,15 @@ Config (env vars):
from __future__ import annotations
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional
import structlog
from schemas import TaskSyncedPayload, TipFeedbackPayload
logger = logging.getLogger(__name__)
log = structlog.get_logger(__name__)
NATS_URL = os.getenv("NATS_URL", "")
NATS_DURABLE_PREFIX = os.getenv("NATS_DURABLE_PREFIX", "feature-pipeline")
@@ -56,15 +56,12 @@ async def _handle(subject: str, payload: dict, state_dir: Path) -> None:
"last_sync_ts": msg.syncedAt,
"task_count": msg.count,
}))
logger.info("[nats] task_synced user=%s count=%s", msg.userId, msg.count)
log.info("nats: task_synced", user_id=msg.userId, count=msg.count)
elif subject == "signals.tip.feedback":
msg = TipFeedbackPayload.model_validate(payload)
logger.info(
"[nats] tip_feedback user=%s tip=%s action=%s reward=%s",
msg.userId, msg.tipId, msg.action, msg.reward,
)
log.info("nats: tip_feedback", user_id=msg.userId, tip_id=msg.tipId, action=msg.action, reward=msg.reward)
else:
logger.debug("[nats] unhandled subject=%s", subject)
log.debug("nats: unhandled subject", subject=subject)
# ── Consumer factory ───────────────────────────────────────────────────────
@@ -80,7 +77,7 @@ def _make_handler(key: str, state_dir: Path):
consumer_health[key]["processed"] += 1
except Exception as exc:
consumer_health[key]["errors"] += 1
logger.warning("[nats] processing error key=%s subject=%s: %s", key, msg.subject, exc)
log.warning("nats: processing error", key=key, subject=msg.subject, exc=str(exc))
await msg.nak()
return handler
@@ -91,7 +88,7 @@ async def start(state_dir: Path) -> None:
"""Connect to NATS and register durable push consumers. No-op if NATS_URL is unset."""
global _nc
if not NATS_URL:
logger.info("[nats] NATS_URL unset — JetStream consumers disabled")
log.info("nats: NATS_URL unset — JetStream consumers disabled")
return
try:
@@ -105,9 +102,9 @@ async def start(state_dir: Path) -> None:
max_reconnect_attempts=-1,
)
js = _nc.jetstream()
logger.info("[nats] connected to %s", NATS_URL)
log.info("nats: connected", url=NATS_URL)
except Exception as exc:
logger.warning("[nats] connection failed: %s — consumers disabled", exc)
log.warning("nats: connection failed — consumers disabled", exc=str(exc))
_nc = None
return
@@ -126,9 +123,9 @@ async def start(state_dir: Path) -> None:
config=config,
)
_subs.append(sub)
logger.info("[nats] subscribed subject=%s durable=%s", subject, durable)
log.info("nats: subscribed", subject=subject, durable=durable)
except Exception as exc:
logger.warning("[nats] subscribe failed key=%s: %s", key, exc)
log.warning("nats: subscribe failed", key=key, exc=str(exc))
async def stop() -> None:
@@ -146,4 +143,4 @@ async def stop() -> None:
except Exception:
pass
_nc = None
logger.info("[nats] disconnected")
log.info("nats: disconnected")

View File

@@ -5,3 +5,5 @@ numpy>=1.26.0
httpx>=0.27.0
anthropic>=0.40.0
nats-py>=2.9.0
structlog>=24.1.0
sentry-sdk>=2.0.0