chore: remove Airflow completely from the stack

Drop all four Airflow containers (db, init, webserver, scheduler) from the mlops compose profile, leaving MLflow as the sole mlops service. Remove AIRFLOW_* env vars, config fields, health-check entries, DAG trigger code in admin/bench routes, the airflow_dag_run_id schema column, Airflow nav links and DAG-run links in the admin UI, the two Airflow DAG files (bench_dag.py, sim_dag.py), and all related docs/ADR references. Simulations now run exclusively via the subprocess path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-03 16:38:46 +00:00
parent ce1c8bde57
commit f8d66aa01f
27 changed files with 663 additions and 719 deletions
--- a/ml/serving/main.py
+++ b/ml/serving/main.py
@@ -26,9 +26,11 @@ from __future__ import annotations
 import json
 import math
 import os
+import sys
 import time
 from collections import deque
 from contextlib import asynccontextmanager
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional, Deque

@@ -43,7 +45,17 @@ from starlette.middleware.base import BaseHTTPMiddleware

 import logging_config
 import nats_consumer
-from prompts import get_prompt
+from prompts import get_prompt, build_orchestrator_messages
+
+# Make ml.agents importable regardless of working directory.
+# In Docker (WORKDIR=/app/ml/serving, PYTHONPATH=/app): /app already on path.
+# In local dev (run from ml/serving/): repo root is two levels up.
+# The directory two levels above this file is prepended to sys.path only when
+# it is not already listed, so repeated imports of this module do not stack
+# duplicate entries.
+_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+if _repo_root not in sys.path:
+    sys.path.insert(0, _repo_root)
+
+# These imports must run after the sys.path adjustment above, which is why they
+# sit below executable code and carry E402 (import not at top) suppressions.
+from ml.agents.base import AgentInput  # noqa: E402
+from ml.agents.registry import get_agent, all_agents  # noqa: E402

 logging_config.configure()

@@ -350,12 +362,61 @@ class GenerateResponse(BaseModel):
    completion_tokens: int = 0


+# ── Multi-agent models ─────────────────────────────────────────────────────
+
+# Request body for POST /agents/{agent_id}/compute.
+# user_id, tasks, profile and feedback_history are handed unchanged to
+# AgentInput; the element/key schema of the dict payloads is not constrained here.
+class AgentComputeRequest(BaseModel):
+    user_id: str
+    tasks: list[dict] = []
+    profile: dict[str, Optional[float]] = {}
+    feedback_history: list[dict] = []
+    now_iso: Optional[str] = None  # ISO 8601 (a trailing "Z" is accepted); defaults to the current UTC time, naive values are treated as UTC
+
+
+# Response body of POST /agents/{agent_id}/compute.
+# Each field is copied from the same-named attribute of the object returned by
+# agent.compute(); nothing is derived or reformatted in this service.
+class AgentComputeResponse(BaseModel):
+    user_id: str
+    agent_id: str
+    prompt_text: str
+    signals_snapshot: dict
+    computed_at: str
+    expires_at: str
+    agent_version: str
+
+
+# One pre-computed agent result as supplied in RecommendRequest.agent_outputs.
+# /recommend serialises each snippet with model_dump() and passes the resulting
+# dicts to build_orchestrator_messages().
+class AgentOutputSnippet(BaseModel):
+    agent_id: str
+    prompt_text: str
+
+
+# Request body for POST /recommend.
+# agent_outputs, tasks, hour_of_day and day_of_week are forwarded to
+# build_orchestrator_messages(); no range validation happens in this model.
+class RecommendRequest(BaseModel):
+    user_id: str  # used for logging and, truncated to 8 chars, for the fallback tip id
+    agent_outputs: list[AgentOutputSnippet] = []  # pre-computed snippets; may be empty
+    tasks: list[dict] = []
+    hour_of_day: int = 12  # NOTE(review): expected range/timezone is not visible here — confirm with caller
+    day_of_week: int = 0  # NOTE(review): which weekday 0 denotes is not visible here — confirm with caller
+
+
+# A single tip returned to the caller of /recommend.
+# /recommend fills id, content and rationale from the JSON object produced by
+# the LLM and leaves source and kind at the defaults declared below.
+class TipResult(BaseModel):
+    id: str
+    content: str
+    source: str = "llm"
+    kind: str = "advice"
+    rationale: Optional[str] = None  # None when the LLM reply carries no "rationale" key
+
+
+# Response body of POST /recommend.
+class RecommendResponse(BaseModel):
+    tip: TipResult
+    model: str  # model name reported in the LiteLLM response, else "tip-generator"
+    prompt_tokens: int = 0  # summed over every LLM attempt, retries included
+    completion_tokens: int = 0  # summed over every LLM attempt, retries included
+
+
 # ── Endpoints ──────────────────────────────────────────────────────────────

@app.get("/health")
 def health():
    return {
        "ok": True,
+        "agents": [a.agent_id for a in all_agents()],
        "nats": {
            "enabled": bool(nats_consumer.NATS_URL),
            "consumers": nats_consumer.consumer_health,
@@ -368,6 +429,137 @@ _RETRY_SUFFIX = (
    "Reply ONLY with the JSON array — no prose, no markdown fences."
 )

+# Follow-up user message that recommend() appends to the conversation after an
+# LLM reply that failed to parse; the JSON-object counterpart of _RETRY_SUFFIX,
+# which asks for a JSON array instead.
+_RETRY_SUFFIX_OBJ = (
+    "\n\nYour previous response was not valid JSON. "
+    "Reply ONLY with the JSON object — no prose, no markdown fences."
+)
+
+
+@app.post("/agents/{agent_id}/compute", response_model=AgentComputeResponse)
+async def compute_agent(agent_id: str, req: AgentComputeRequest) -> AgentComputeResponse:
+    """Run a single sub-agent for a user and return its prompt snippet.
+
+    Called by the precompute pipeline for each (user_id, agent_id) pair.
+    The caller is responsible for persisting the result to agent_outputs via the
+    TypeScript API callback.
+
+    Raises:
+        HTTPException: 404 when ``agent_id`` is not registered, 422 when
+            ``req.now_iso`` is not a parseable ISO 8601 timestamp, 500 when
+            the agent's ``compute`` raises.
+    """
+    try:
+        agent = get_agent(agent_id)
+    except KeyError:
+        raise HTTPException(status_code=404, detail=f"Unknown agent: {agent_id!r}") from None
+
+    if req.now_iso:
+        try:
+            # fromisoformat() only understands a trailing "Z" from Python 3.11
+            # onwards, so rewrite it as an explicit UTC offset first.
+            now = datetime.fromisoformat(req.now_iso.replace("Z", "+00:00"))
+        except ValueError:
+            # A malformed timestamp is a client error, not a server fault.
+            raise HTTPException(
+                status_code=422,
+                detail=f"Invalid now_iso (expected ISO 8601): {req.now_iso!r}",
+            ) from None
+    else:
+        now = datetime.now(timezone.utc)
+    if now.tzinfo is None:
+        # Naive timestamps are interpreted as UTC.
+        now = now.replace(tzinfo=timezone.utc)
+
+    inp = AgentInput(
+        user_id=req.user_id,
+        tasks=req.tasks,
+        profile=req.profile,
+        feedback_history=req.feedback_history,
+        now=now,
+    )
+    try:
+        output = agent.compute(inp)
+    except Exception as exc:
+        # Boundary handler: any agent failure is logged and reported as a 500.
+        log.error("agent_compute_failed", agent_id=agent_id, user_id=req.user_id, error=str(exc))
+        raise HTTPException(status_code=500, detail=f"Agent compute failed: {exc}") from exc
+
+    log.info("agent_computed", agent_id=agent_id, user_id=req.user_id, expires_at=output.expires_at)
+    return AgentComputeResponse(
+        user_id=output.user_id,
+        agent_id=output.agent_id,
+        prompt_text=output.prompt_text,
+        signals_snapshot=output.signals_snapshot,
+        computed_at=output.computed_at,
+        expires_at=output.expires_at,
+        agent_version=output.agent_version,
+    )
+
+
+def _parse_recommend_item(raw: str) -> dict:
+    """Extract the tip object from a raw LLM reply.
+
+    Strips an optional markdown code fence, decodes the JSON, and unwraps a
+    top-level array to its first element.
+
+    Raises:
+        ValueError: when the text is not valid JSON (``json.JSONDecodeError``
+            is a subclass), the array is empty, or the result is not a JSON
+            object.
+    """
+    text = raw.strip()
+    if text.startswith("```"):
+        # "```json\n{...}\n```" splits into ["", "json\n{...}\n", ""]; index 1
+        # always exists because the text starts with the fence.
+        text = text.split("```")[1]
+        if text[:4].lower() == "json":
+            text = text[4:]
+    parsed = json.loads(text)
+    if isinstance(parsed, list):
+        if not parsed:
+            raise ValueError("expected a JSON object, got an empty array")
+        parsed = parsed[0]
+    if not isinstance(parsed, dict):
+        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
+    return parsed
+
+
+@app.post("/recommend", response_model=RecommendResponse)
+async def recommend(req: RecommendRequest) -> RecommendResponse:
+    """Orchestrator: combine pre-computed agent outputs into one tip via LLM.
+
+    Called in real time when a user requests a tip. agent_outputs should be
+    the fresh rows from agent_outputs table (fetched by the TypeScript recommender
+    before calling this endpoint). Falls back to raw task context if empty.
+
+    The LLM is asked again (up to ``_MAX_GENERATE_RETRIES`` extra attempts) when
+    its reply cannot be parsed into a JSON object; token usage is summed over
+    all attempts.
+
+    Raises:
+        HTTPException: 502 when LiteLLM answers with an error status, a
+            malformed body, or unparseable JSON on every attempt; 503 when
+            LiteLLM cannot be reached.
+    """
+    messages = build_orchestrator_messages(
+        agent_outputs=[s.model_dump() for s in req.agent_outputs],
+        tasks=req.tasks,
+        hour_of_day=req.hour_of_day,
+        day_of_week=req.day_of_week,
+    )
+    headers = {"Authorization": f"Bearer {LITELLM_MASTER_KEY}"}
+    last_raw = ""
+    last_parse_error = ""
+    total_usage: dict = {"prompt_tokens": 0, "completion_tokens": 0}
+    model_used = "tip-generator"
+
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        for _attempt in range(1 + _MAX_GENERATE_RETRIES):
+            payload = {"model": "tip-generator", "messages": messages, "temperature": 0.7}
+            try:
+                resp = await client.post(
+                    f"{LITELLM_URL}/chat/completions", json=payload, headers=headers
+                )
+                resp.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                raise HTTPException(status_code=502, detail=f"LiteLLM error: {e.response.text}") from e
+            except httpx.RequestError as e:
+                raise HTTPException(status_code=503, detail=f"LiteLLM unreachable: {e}") from e
+
+            try:
+                data = resp.json()
+                reply = data["choices"][0]["message"]["content"]
+            except (ValueError, KeyError, IndexError, TypeError) as e:
+                # A 2xx body that is not the expected completion shape is an
+                # upstream fault, not an internal server error.
+                raise HTTPException(
+                    status_code=502, detail=f"LiteLLM returned a malformed response: {e!r}"
+                ) from e
+
+            # "usage" and its counters may be absent or null; count those as zero.
+            usage = data.get("usage")
+            if isinstance(usage, dict):
+                total_usage["prompt_tokens"] += usage.get("prompt_tokens") or 0
+                total_usage["completion_tokens"] += usage.get("completion_tokens") or 0
+            model_used = data.get("model") or "tip-generator"
+            # A null/non-text reply is treated as an empty one so that it goes
+            # through the normal parse-failure retry path.
+            last_raw = reply if isinstance(reply, str) else ""
+
+            try:
+                item = _parse_recommend_item(last_raw)
+                break
+            except ValueError as exc:
+                last_parse_error = str(exc)
+                # Show the model its own reply and ask again for a bare object.
+                messages.append({"role": "assistant", "content": last_raw})
+                messages.append({"role": "user", "content": _RETRY_SUFFIX_OBJ})
+        else:
+            raise HTTPException(
+                status_code=502,
+                detail=f"LLM returned invalid JSON after {_MAX_GENERATE_RETRIES} retries: "
+                       f"{last_parse_error}\n{last_raw[:200]}",
+            )
+
+    # The LLM may emit numbers or nulls where strings are expected; coerce them
+    # so that model validation cannot turn a usable reply into a 500.
+    raw_id = item.get("id")
+    raw_content = item.get("content")
+    raw_rationale = item.get("rationale")
+    tip = TipResult(
+        id=f"tip-{req.user_id[:8]}" if raw_id is None else str(raw_id),
+        content="" if raw_content is None else str(raw_content),
+        rationale=None if raw_rationale is None else str(raw_rationale),
+    )
+    log.info(
+        "recommend_served",
+        user_id=req.user_id,
+        agent_count=len(req.agent_outputs),
+        tip_id=tip.id,
+    )
+    return RecommendResponse(
+        tip=tip,
+        model=model_used,
+        prompt_tokens=total_usage["prompt_tokens"],
+        completion_tokens=total_usage["completion_tokens"],
+    )
+
 _MAX_GENERATE_RETRIES = 2