feat(serving): add MLflow tracing to ml-serving for all agent calls

Logs one MLflow run per /recommend (params, token metrics, latency,
full prompt + tip as artifacts) and per /agents/{id}/compute and
/infer call (signals snapshot, inferred prefs, latency).

Tracing is a no-op when MLFLOW_TRACKING_URI is unset; ml-serving
starts and serves tips correctly without MLflow configured.

Refs #118 (M4: remove from production / move off critical path).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-06 10:30:24 +00:00
parent 488a764519
commit c43dbaf23d
3 changed files with 263 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ from __future__ import annotations
import json
import os
import sys
import time
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from pathlib import Path
@@ -29,6 +30,7 @@ from starlette.middleware.base import BaseHTTPMiddleware
import logging_config
import nats_consumer
from mlflow_client import MLflowClient
from prompts import get_prompt, build_orchestrator_messages
# Make ml.agents importable regardless of working directory.
@@ -79,6 +81,30 @@ LITELLM_URL = os.getenv("LITELLM_URL", "http://localhost:4000")
LITELLM_MASTER_KEY = os.getenv("LITELLM_MASTER_KEY", "sk-oo-dev")
STATE_DIR = Path(os.getenv("STATE_DIR", "/tmp/oo-serving-state"))
# ── MLflow tracing (optional) ───────────────────────────────────────────────
# Set MLFLOW_TRACKING_URI to enable. Logging is best-effort: _mlflow_run is
# invoked inline from the request handlers, and any error is logged at WARNING
# and never propagates to the caller.
# Tracking URI read from the environment; defaults to "" when the variable is unset.
_MLFLOW_URI = os.getenv("MLFLOW_TRACKING_URI", "")
# Module-level client, or None when the URI is empty; _mlflow_run returns early on None.
_mlflow: MLflowClient | None = MLflowClient(tracking_uri=_MLFLOW_URI) if _MLFLOW_URI else None
# Experiment name passed to get_or_create_experiment for every run logged by this module.
_MLFLOW_EXP = "oO/serving"
def _mlflow_run(run_name: str, params: dict, metrics: dict, tags: dict) -> None:
    """Create a finished MLflow run. Silently no-ops if MLflow is not configured.

    Args:
        run_name: Name given to the created run.
        params: Logged via ``log_params``; each value is stringified and
            truncated to 250 characters.
        metrics: Passed unchanged to ``log_metrics``.
        tags: Despite the name, each entry is written with ``log_text``:
            the stringified value is the text and the key is passed as the
            third argument (presumably the artifact file name -- confirm
            against ``MLflowClient.log_text``).

    Never raises: any exception from the client is logged at WARNING as
    ``mlflow_log_failed`` and swallowed so tracing cannot break a request.
    """
    if _mlflow is None:
        return
    try:
        exp_id = _mlflow.get_or_create_experiment(_MLFLOW_EXP)
        run_id = _mlflow.create_run(exp_id, run_name, tags={"source": "ml-serving"})
        try:
            _mlflow.log_params(run_id, {k: str(v)[:250] for k, v in params.items()})
            _mlflow.log_metrics(run_id, metrics)
            for k, v in tags.items():
                _mlflow.log_text(run_id, str(v), k)
        finally:
            # Once the run exists, always close it: a failure in any logging
            # call above must not leave the run open on the tracking server.
            _mlflow.end_run(run_id)
    except Exception as exc:  # noqa: BLE001
        log.warning("mlflow_log_failed", error=str(exc))
STATE_DIR.mkdir(parents=True, exist_ok=True)
@@ -251,6 +277,12 @@ async def compute_agent(agent_id: str, req: AgentComputeRequest) -> AgentCompute
raise HTTPException(status_code=500, detail=f"Agent compute failed: {exc}")
log.info("agent_computed", agent_id=agent_id, user_id=req.user_id, expires_at=output.expires_at)
_mlflow_run(
run_name=f"compute/{agent_id}",
params={"agent_id": agent_id, "user_id": req.user_id, "agent_version": output.agent_version},
metrics={"task_count": len(req.tasks), "feedback_count": len(req.feedback_history)},
tags={"prompt_text": output.prompt_text, "signals_snapshot": json.dumps(output.signals_snapshot)},
)
return AgentComputeResponse(
user_id=output.user_id,
agent_id=output.agent_id,
@@ -307,6 +339,12 @@ async def infer_agent(agent_id: str, req: AgentInferRequest) -> AgentInferRespon
history_len=len(events),
latency_ms=latency_ms,
)
_mlflow_run(
run_name=f"infer/{agent_id}",
params={"agent_id": agent_id, "user_id": req.user_id},
metrics={"latency_ms": latency_ms, "history_len": len(events), "n_params": len(inferred)},
tags={"inferred_prefs": json.dumps(inferred)},
)
return AgentInferResponse(user_id=req.user_id, agent_id=agent_id, inferred_prefs=inferred)
@@ -318,6 +356,7 @@ async def recommend(req: RecommendRequest) -> RecommendResponse:
the fresh rows from agent_outputs table (fetched by the TypeScript recommender
before calling this endpoint). Falls back to raw task context if empty.
"""
t0_recommend = time.monotonic()
messages = build_orchestrator_messages(
agent_outputs=[s.model_dump() for s in req.agent_outputs],
tasks=req.tasks,
@@ -376,12 +415,34 @@ async def recommend(req: RecommendRequest) -> RecommendResponse:
content=item.get("content", ""),
rationale=item.get("rationale"),
)
latency_ms_recommend = round((time.monotonic() - t0_recommend) * 1000, 1)
log.info(
"recommend_served",
user_id=req.user_id,
agent_count=len(req.agent_outputs),
tip_id=tip.id,
)
_mlflow_run(
run_name="recommend",
params={
"user_id": req.user_id,
"agent_ids": ",".join(s.agent_id for s in req.agent_outputs),
"model": model_used,
"hour_of_day": req.hour_of_day,
"day_of_week": req.day_of_week,
},
metrics={
"prompt_tokens": total_usage["prompt_tokens"],
"completion_tokens": total_usage["completion_tokens"],
"agent_count": len(req.agent_outputs),
"latency_ms": latency_ms_recommend,
},
tags={
"prompt_messages": json.dumps(messages),
"tip_content": tip.content,
"tip_rationale": tip.rationale or "",
},
)
return RecommendResponse(
tip=tip,
model=model_used,