chore: remove Airflow completely from the stack

Drop all four Airflow containers (db, init, webserver, scheduler) from the
mlops compose profile, leaving MLflow as the sole mlops service. Remove
AIRFLOW_* env vars, config fields, health-check entries, DAG trigger code
in admin/bench routes, the airflow_dag_run_id schema column, Airflow nav
links and DAG-run links in the admin UI, the two Airflow DAG files
(bench_dag.py, sim_dag.py), and all related docs/ADR references.
Simulations now run exclusively via the subprocess path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-03 16:38:46 +00:00
parent ce1c8bde57
commit f8d66aa01f
27 changed files with 663 additions and 719 deletions

View File

@@ -28,13 +28,20 @@ POST /api/push/subscribe
DELETE /api/push/subscribe
GET /api/admin/stats DAU/WAU, feedback breakdown
GET /api/admin/users
GET /api/admin/events recent event stream (ring buffer)
GET /api/admin/users user list with pagination
GET /api/user/:id user detail, consents, integrations
GET /api/admin/events recent event stream (ring buffer or NATS JetStream)
GET /api/admin/events/history historical event query (time range, filters)
GET /api/admin/sim/runs offline sim run list
POST /api/admin/sim/run launch offline sim
POST /api/admin/sim/run launch offline sim with policy/judge params
GET /api/admin/sim/runs/:id/output tail sim stdout
...
GET /api/admin/features/:userId per-user profile features + freshness
GET /api/admin/features/:userId/context context features for last score call
POST /api/admin/policies list shadow policies + active policy
POST /api/admin/policies/:name/toggle enable/disable shadow policy
POST /api/admin/users/:id/actions revoke-integration, reset-bandit, rebuild-profile
GET /api/admin/health system health: api, ml/serving, db, bus, mlflow
GET /api/admin/docs admin documentation index
GET /api/ml/* admin-only proxy to ml/serving
```

View File

@@ -35,11 +35,8 @@ export const config = {
LITELLM_URL: optional('LITELLM_URL', 'http://localhost:4000'),
MLFLOW_URL: optional('MLFLOW_URL', 'http://localhost:5000'),
AIRFLOW_URL: optional('AIRFLOW_URL', 'http://localhost:8080'),
AIRFLOW_API_USER: optional('AIRFLOW_API_USER', 'admin'),
AIRFLOW_API_PASSWORD: optional('AIRFLOW_API_PASSWORD', 'admin'),
/** Shared secret for internal Airflow→API callbacks. */
/** Shared secret for internal API callbacks. */
INTERNAL_API_TOKEN: optional('INTERNAL_API_TOKEN', ''),
/** Static token for automated/service access to the admin panel (e.g. Playwright tests). */

View File

@@ -143,6 +143,19 @@ export function runMigrations() {
day_of_week INTEGER NOT NULL,
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS agent_outputs (
id TEXT PRIMARY KEY,
user_id TEXT NOT NULL REFERENCES users(id),
agent_id TEXT NOT NULL,
prompt_text TEXT NOT NULL,
signals_snapshot TEXT,
computed_at TEXT NOT NULL,
expires_at TEXT NOT NULL,
agent_version TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_agent_outputs_user_agent_exp
ON agent_outputs(user_id, agent_id, expires_at DESC);
`);
// Additive column migrations — safe to run on existing DBs.
@@ -156,7 +169,6 @@ export function runMigrations() {
`ALTER TABLE tip_scores ADD COLUMN prompt_version TEXT`,
`ALTER TABLE tip_scores ADD COLUMN llm_model TEXT`,
`ALTER TABLE tip_scores ADD COLUMN tip_kind TEXT`,
`ALTER TABLE sim_runs ADD COLUMN airflow_dag_run_id TEXT`,
`ALTER TABLE sim_runs ADD COLUMN mlflow_run_id TEXT`,
`ALTER TABLE sim_runs ADD COLUMN judge_mode TEXT NOT NULL DEFAULT 'rule'`,
`ALTER TABLE sim_runs ADD COLUMN n_policies INTEGER NOT NULL DEFAULT 2`,

View File

@@ -117,7 +117,6 @@ export const simRuns = sqliteTable('sim_runs', {
summaryJson: text('summary_json'), // JSON: { [policy]: PolicySummary }
winner: text('winner'),
personaBreakdownJson: text('persona_breakdown_json'), // JSON: { [persona]: { [policy]: {reward,n} } }
airflowDagRunId: text('airflow_dag_run_id'),
mlflowRunId: text('mlflow_run_id'),
createdAt: text('created_at').notNull(),
finishedAt: text('finished_at'),
@@ -142,6 +141,20 @@ export const simEvents = sqliteTable('sim_events', {
createdAt: text('created_at').notNull(),
});
// ── Agent outputs (#multi-agent) ─────────────────────────────────────────────
// Pre-computed prompt snippets produced by individual agents for a user.
// One row per (userId, agentId) pre-compute run. The orchestrator reads the
// freshest non-expired row per agent when assembling the tip prompt.
// All timestamps are stored as ISO 8601 text, so expiry checks are string
// comparisons against another ISO 8601 value.
export const agentOutputs = sqliteTable('agent_outputs', {
id: text('id').primaryKey(),
userId: text('user_id').notNull().references(() => users.id),
agentId: text('agent_id').notNull(), // e.g. 'overdue-task'
promptText: text('prompt_text').notNull(), // snippet for orchestrator prompt
signalsSnapshot: text('signals_snapshot'), // JSON: inputs the agent consumed
computedAt: text('computed_at').notNull(), // ISO 8601
expiresAt: text('expires_at').notNull(), // ISO 8601 = computedAt + TTL
agentVersion: text('agent_version').notNull(), // bump to invalidate on logic changes
});
// Admin saved SQL queries.
export const savedQueries = sqliteTable('saved_queries', {
id: text('id').primaryKey(),

View File

@@ -389,7 +389,7 @@ describe('GET /api/admin/events', () => {
// Health endpoint — mock fetch so tests don't depend on running services.
// ---------------------------------------------------------------------------
describe('GET /api/admin/health', () => {
const EXPECTED_HTTP_SERVICES = ['api', 'ml-serving', 'mlflow', 'airflow'] as const;
const EXPECTED_HTTP_SERVICES = ['api', 'ml-serving', 'mlflow'] as const;
const EXPECTED_INTERNAL = ['sqlite', 'event-bus'] as const;
const VALID_STATUSES = new Set(['ok', 'degraded', 'down']);
@@ -404,7 +404,6 @@ describe('GET /api/admin/health', () => {
let name: string;
if (s.includes(':8000')) name = 'ml-serving';
else if (s.includes(':5000')) name = 'mlflow';
else if (s.includes(':8080')) name = 'airflow';
else name = 'api';
if (!upServices.has(name)) throw new Error(`ECONNREFUSED ${name}`);
@@ -415,7 +414,7 @@ describe('GET /api/admin/health', () => {
afterEach(() => vi.unstubAllGlobals());
it('shape: 200, typed fields, all expected services present', async () => {
mockFetch(new Set(['api', 'ml-serving', 'mlflow', 'airflow']));
mockFetch(new Set(['api', 'ml-serving', 'mlflow']));
const { server, call } = await startServer(buildApp());
try {
const { status, body } = await call('GET', '/api/admin/health');
@@ -440,7 +439,7 @@ describe('GET /api/admin/health', () => {
});
it('ok=true when all HTTP services respond 200', async () => {
mockFetch(new Set(['api', 'ml-serving', 'mlflow', 'airflow']));
mockFetch(new Set(['api', 'ml-serving', 'mlflow']));
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');
@@ -456,7 +455,7 @@ describe('GET /api/admin/health', () => {
});
it('ml-serving=down and ok=false when ml-serving is unreachable', async () => {
mockFetch(new Set(['api', 'mlflow', 'airflow'])); // ml-serving absent
mockFetch(new Set(['api', 'mlflow'])); // ml-serving absent
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');
@@ -469,22 +468,8 @@ describe('GET /api/admin/health', () => {
}
});
it('airflow=down and ok=false when airflow is unreachable', async () => {
mockFetch(new Set(['api', 'ml-serving', 'mlflow'])); // airflow absent
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');
const b = body as HealthBody;
const svc = b.services.find((s) => s.name === 'airflow');
expect(svc?.status).toBe('down');
expect(b.ok).toBe(false);
} finally {
server.close();
}
});
it('mlflow=down and ok=false when mlflow is unreachable', async () => {
mockFetch(new Set(['api', 'ml-serving', 'airflow'])); // mlflow absent
mockFetch(new Set(['api', 'ml-serving'])); // mlflow absent
const { server, call } = await startServer(buildApp());
try {
const { body } = await call('GET', '/api/admin/health');

View File

@@ -524,14 +524,10 @@ router.get('/data-quality', async (req: AuthenticatedRequest, res: Response) =>
// Fan-out to all subsystem /health endpoints.
// ---------------------------------------------------------------------------
router.get('/health', async (_req: AuthenticatedRequest, res: Response) => {
const airflowAuth = Buffer.from(`${config.AIRFLOW_API_USER}:${config.AIRFLOW_API_PASSWORD}`).toString('base64');
const checks: Array<{ name: string; url: string; headers?: Record<string, string> }> = [
{ name: 'api', url: `http://localhost:${config.PORT}/health` },
{ name: 'ml-serving', url: `${config.ML_SERVING_URL}/health` },
{ name: 'mlflow', url: `${config.MLFLOW_URL}/health` },
{ name: 'airflow', url: `${config.AIRFLOW_URL}/api/v1/health`,
headers: { Authorization: `Basic ${airflowAuth}` } },
];
const results = await Promise.allSettled(
@@ -705,8 +701,7 @@ router.delete('/saved-queries/:id', async (req: AuthenticatedRequest, res: Respo
// ---------------------------------------------------------------------------
// POST /api/admin/simulate/start
// Trigger an Airflow DAG run (bandit_sim). Falls back to a local subprocess
// when AIRFLOW_URL is not reachable, so local dev still works.
// Trigger a bandit_sim run via local subprocess.
// ---------------------------------------------------------------------------
router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response) => {
const {
@@ -745,56 +740,7 @@ router.post('/simulate/start', async (req: AuthenticatedRequest, res: Response)
createdAt: now,
});
// ── Try Airflow first ────────────────────────────────────────────────────
if (config.AIRFLOW_URL && config.INTERNAL_API_TOKEN) {
try {
const airflowAuth = Buffer.from(
`${config.AIRFLOW_API_USER}:${config.AIRFLOW_API_PASSWORD}`,
).toString('base64');
const dagRes = await fetch(
`${config.AIRFLOW_URL}/api/v1/dags/bandit_sim/dagRuns`,
{
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Basic ${airflowAuth}`,
},
body: JSON.stringify({
conf: {
sim_run_id: id,
n_users: nUsers,
n_rounds: nRounds,
tasks_per_round: tasksPerRound,
policies,
judge_mode: judgeMode,
ml_url: config.ML_SERVING_URL,
mlflow_url: config.MLFLOW_URL,
callback_url: `${config.API_BASE_URL}/api/admin/simulate/${id}/complete`,
internal_token: config.INTERNAL_API_TOKEN,
},
}),
signal: AbortSignal.timeout(5000),
},
);
if (dagRes.ok) {
const dagBody = await dagRes.json() as { dag_run_id: string };
await db
.update(simRuns)
.set({ airflowDagRunId: dagBody.dag_run_id })
.where(eq(simRuns.id, id));
res.json({ id, status: 'running', airflow_dag_run_id: dagBody.dag_run_id });
return;
}
logger.warn({ status: dagRes.status }, 'sim: Airflow trigger failed, falling back to subprocess');
} catch (err) {
logger.warn({ err }, 'sim: Airflow unreachable, falling back to subprocess');
}
}
// ── Subprocess fallback (local dev / Airflow not configured) ────────────
// ── Subprocess ───────────────────────────────────────────────────────────
const runnerPath = resolve(__dirname, '../../../../ml/experiments/sim/runner.py');
const venvPython = resolve(__dirname, '../../../../ml/serving/.venv/bin/python');
const pythonBin = existsSync(venvPython) ? venvPython : 'python3';

View File

@@ -0,0 +1,220 @@
import { timingSafeEqual } from 'node:crypto';
import { Router } from 'express';
import type { Request, Response } from 'express';
import { nanoid } from 'nanoid';
import { eq, and, gt, lt } from 'drizzle-orm';
import { db } from '../db/index.js';
import { agentOutputs, tipFeedback, tipViews } from '../db/schema.js';
import { config } from '../config.js';
import { getProfile } from '../profile/builder.js';
import { todoistSource } from '../signals/todoist.js';
import { SignalAggregator } from '../signals/aggregator.js';
// Express router carrying the /api/agents/* endpoints registered below.
const router = Router();
// Separate aggregator instance — avoids circular dep with recommender.ts.
// Only the Todoist signal source is registered on this instance.
const _agentAggregator = new SignalAggregator().register(todoistSource);
// ── Internal auth helper ──────────────────────────────────────────────────────
/**
 * Guards internal-only routes with the shared `INTERNAL_API_TOKEN` secret.
 *
 * Reads the `x-internal-token` request header and compares it with
 * `config.INTERNAL_API_TOKEN`. When the configured token is empty, the header
 * is missing, or the values differ, a 401 JSON response is written here.
 *
 * The comparison uses `timingSafeEqual` so the response time does not reveal
 * how many leading characters of a guessed token were correct.
 *
 * @param req - Incoming request carrying the `x-internal-token` header.
 * @param res - Response used to send the 401 on failure.
 * @returns `true` when the caller is authorised; `false` after a 401 has been
 *   sent, in which case the calling handler must return immediately.
 */
function checkInternalToken(req: Request, res: Response): boolean {
  const expected = config.INTERNAL_API_TOKEN;
  const provided = req.headers['x-internal-token'];

  let authorised = false;
  // An empty configured token never authorises anyone.
  if (expected && typeof provided === 'string') {
    const providedBytes = Buffer.from(provided, 'utf8');
    const expectedBytes = Buffer.from(expected, 'utf8');
    // timingSafeEqual throws on differing lengths, so check length first.
    authorised =
      providedBytes.length === expectedBytes.length &&
      timingSafeEqual(providedBytes, expectedBytes);
  }

  if (!authorised) {
    res.status(401).json({ error: 'Unauthorized' });
    return false;
  }
  return true;
}
// ── DB helpers ────────────────────────────────────────────────────────────────
/**
 * Loads the agent outputs for one user that have not expired yet.
 *
 * "Not expired" means the stored `expiresAt` text compares greater than the
 * current time rendered with `Date.prototype.toISOString()`.
 * NOTE(review): this is a string comparison — it presumably relies on writers
 * storing `expires_at` in the same UTC ISO 8601 shape; confirm against the
 * producer of these rows.
 *
 * @param userId - ID of the user whose outputs are requested.
 * @returns The matching `agent_outputs` rows.
 */
export async function getActiveAgentOutputs(userId: string) {
  const nowIso = new Date().toISOString();
  const belongsToUser = eq(agentOutputs.userId, userId);
  const stillValid = gt(agentOutputs.expiresAt, nowIso);
  const activeRows = await db
    .select()
    .from(agentOutputs)
    .where(and(belongsToUser, stillValid));
  return activeRows;
}
/**
 * Replaces the stored output for a (user, agent) pair with a new one.
 *
 * Any existing rows for the same `user_id` + `agent_id` are deleted first,
 * then a single fresh row with a newly generated ID is inserted.
 * NOTE(review): the delete and the insert are two separate statements, so a
 * failed insert leaves the pair without any stored output — consider a
 * transaction once the DB driver's transaction API is confirmed.
 *
 * @param output - Snake-cased agent result. `signals_snapshot` is serialised
 *   with `JSON.stringify` when truthy and stored as `null` otherwise.
 */
async function storeAgentOutput(output: {
  user_id: string;
  agent_id: string;
  prompt_text: string;
  signals_snapshot?: unknown;
  computed_at: string;
  expires_at: string;
  agent_version: string;
}) {
  const { user_id: ownerId, agent_id: producerId } = output;

  // Drop whatever this agent previously stored for this user.
  const samePair = and(eq(agentOutputs.userId, ownerId), eq(agentOutputs.agentId, producerId));
  await db.delete(agentOutputs).where(samePair);

  const rowId = nanoid();
  let snapshotJson: string | null = null;
  if (output.signals_snapshot) {
    snapshotJson = JSON.stringify(output.signals_snapshot);
  }

  await db.insert(agentOutputs).values({
    id: rowId,
    userId: ownerId,
    agentId: producerId,
    promptText: output.prompt_text,
    signalsSnapshot: snapshotJson,
    computedAt: output.computed_at,
    expiresAt: output.expires_at,
    agentVersion: output.agent_version,
  });
}
// ── GET /api/agents/active-users ──────────────────────────────────────────────
// Internal-token protected.
// Responds with { user_ids: string[] }: the distinct user IDs that have a tip
// view served within the last 48 hours. Intended as the fan-out list for
// per-user precompute tasks.
router.get('/active-users', async (req: Request, res: Response) => {
  if (!checkInternalToken(req, res)) return;

  const cutoff = new Date(Date.now() - 48 * 60 * 60 * 1000).toISOString();
  try {
    const rows = await db
      .selectDistinct({ userId: tipViews.userId })
      .from(tipViews)
      .where(gt(tipViews.servedAt, cutoff));
    res.json({ user_ids: rows.map((r) => r.userId) });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances; always produce
    // a string so the `error` field is never dropped from the JSON body.
    const message = err instanceof Error ? err.message : String(err);
    res.status(500).json({ error: message });
  }
});
// ── POST /api/agents/:agentId/compute ─────────────────────────────────────────
// Internal-token protected. Orchestrating endpoint for one (user, agent)
// compute task:
//   1. gathers task signals, profile features and the last 7 days of feedback,
//   2. POSTs them to ml/serving /agents/{agentId}/compute,
//   3. stores the returned output via storeAgentOutput().
// Body: { user_id: string }
// Responses: 422 when user_id is missing, 502 when ml/serving answers with a
// non-2xx status or a malformed payload, 500 on any other failure.
router.post('/:agentId/compute', async (req: Request, res: Response) => {
  if (!checkInternalToken(req, res)) return;

  const { agentId } = req.params as { agentId: string };
  // req.body can be undefined (no body / no parser); guard so the destructure
  // cannot throw outside the try block below.
  const { user_id } = (req.body ?? {}) as { user_id?: unknown };
  if (typeof user_id !== 'string' || !user_id) {
    res.status(422).json({ error: 'Missing user_id' });
    return;
  }

  try {
    // Fetch tasks via Todoist integration (gracefully empty if not connected).
    let tasks: object[] = [];
    try {
      const signals = await _agentAggregator.fetchAll(user_id);
      tasks = signals.map((s) => ({
        id: s.id,
        content: s.content,
        priority: (s.features.priority as number) ?? 1,
        is_overdue: Boolean(s.features.is_overdue),
        task_age_days: (s.features.task_age_days as number) ?? 0,
        project_id: (s.metadata as Record<string, unknown>).project_id ?? null,
      }));
    } catch {
      // No integration or fetch error — agents that need tasks will report "no tasks"
    }

    // Fetch profile features (lazy-refreshed from DB).
    let profile: Record<string, number | null> = {};
    try {
      profile = await getProfile(user_id);
    } catch {
      // Best-effort: a missing profile is sent to the agent as an empty object.
    }

    // Fetch last 7 days of feedback for RecentPatternsAgent.
    const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000).toISOString();
    const feedbackRows = await db
      .select({ action: tipFeedback.action, dwellMs: tipFeedback.dwellMs, createdAt: tipFeedback.createdAt })
      .from(tipFeedback)
      .where(and(eq(tipFeedback.userId, user_id), gt(tipFeedback.createdAt, sevenDaysAgo)));
    const feedbackHistory = feedbackRows.map((f) => ({
      action: f.action,
      dwell_ms: f.dwellMs,
      created_at: f.createdAt,
    }));

    // Call ml/serving to run the agent. The agent ID comes from the request
    // path, so encode it to keep it confined to a single URL path segment.
    const mlResp = await fetch(
      `${config.ML_SERVING_URL}/agents/${encodeURIComponent(agentId)}/compute`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ user_id, tasks, profile, feedback_history: feedbackHistory }),
        signal: AbortSignal.timeout(15_000),
      },
    );
    if (!mlResp.ok) {
      const detail = await mlResp.text().catch(() => '');
      res.status(502).json({ error: `ml/serving returned ${mlResp.status}`, detail });
      return;
    }

    // Validate before storing: storeAgentOutput() deletes the previous row
    // first, so a malformed payload must be rejected up front rather than
    // failing halfway through the replace.
    const parsed: unknown = await mlResp.json();
    const candidate = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as Record<string, unknown>;
    const requiredKeys = ['user_id', 'agent_id', 'prompt_text', 'computed_at', 'expires_at', 'agent_version'] as const;
    const missing = requiredKeys.filter((k) => typeof candidate[k] !== 'string' || candidate[k] === '');
    if (missing.length > 0) {
      res.status(502).json({
        error: 'ml/serving returned a malformed agent output',
        detail: `missing or invalid fields: ${missing.join(', ')}`,
      });
      return;
    }
    const output = candidate as {
      user_id: string; agent_id: string; prompt_text: string;
      signals_snapshot: unknown; computed_at: string; expires_at: string; agent_version: string;
    };

    await storeAgentOutput(output);
    res.json({ ok: true, agent_id: output.agent_id, user_id: output.user_id, expires_at: output.expires_at });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    res.status(500).json({ error: message });
  }
});
// ── POST /api/agents/outputs ──────────────────────────────────────────────────
// Internal-token protected. Stores a pre-computed agent output directly, for
// callers that ran the agent against ml/serving themselves and push the
// result separately.
// Body: { user_id, agent_id, prompt_text, computed_at, expires_at,
//         agent_version, signals_snapshot? } — all but signals_snapshot must
// be non-empty strings, otherwise the request is rejected with 422.
router.post('/outputs', async (req: Request, res: Response) => {
  if (!checkInternalToken(req, res)) return;

  const isNonEmptyString = (v: unknown): v is string => typeof v === 'string' && v.length > 0;

  // req.body can be undefined (no body / no parser); fall back to an empty
  // object so validation answers 422 instead of the destructure throwing.
  const { user_id, agent_id, prompt_text, signals_snapshot, computed_at, expires_at, agent_version } =
    (req.body ?? {}) as Record<string, unknown>;

  if (
    !isNonEmptyString(user_id) ||
    !isNonEmptyString(agent_id) ||
    !isNonEmptyString(prompt_text) ||
    !isNonEmptyString(computed_at) ||
    !isNonEmptyString(expires_at) ||
    !isNonEmptyString(agent_version)
  ) {
    res.status(422).json({
      error: 'Missing required fields: user_id, agent_id, prompt_text, computed_at, expires_at, agent_version',
    });
    return;
  }

  try {
    await storeAgentOutput({ user_id, agent_id, prompt_text, signals_snapshot, computed_at, expires_at, agent_version });
    res.json({ ok: true });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    res.status(500).json({ error: message });
  }
});
// ── DELETE /api/agents/outputs/expired ───────────────────────────────────────
// Internal-token protected. Purges agent_outputs rows whose expires_at lies
// more than 24 hours in the past; rows that expired more recently are kept.
router.delete('/outputs/expired', async (req: Request, res: Response) => {
  if (!checkInternalToken(req, res)) return;

  const DAY_MS = 24 * 60 * 60 * 1000;
  const purgeBefore = new Date(Date.now() - DAY_MS).toISOString();
  try {
    await db.delete(agentOutputs).where(lt(agentOutputs.expiresAt, purgeBefore));
    res.json({ ok: true });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    res.status(500).json({ error: message });
  }
});
// ── GET /api/agents/:userId/outputs ──────────────────────────────────────────
// Returns non-expired agent outputs. Admin observability; recommender calls
// getActiveAgentOutputs() directly (no HTTP hop).
// The response omits the row id and the signals snapshot.
// NOTE(review): unlike the other routes in this file, this handler does not
// call checkInternalToken() — presumably auth is applied where the router is
// mounted; confirm before exposing it.
router.get('/:userId/outputs', async (req: Request, res: Response) => {
  const { userId } = req.params as { userId: string };
  try {
    const activeRows = await getActiveAgentOutputs(userId);
    const outputs = activeRows.map((row) => ({
      agent_id: row.agentId,
      prompt_text: row.promptText,
      computed_at: row.computedAt,
      expires_at: row.expiresAt,
      agent_version: row.agentVersion,
    }));
    res.json({ user_id: userId, outputs });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    res.status(500).json({ error: message });
  }
});
export default router;

View File

@@ -18,10 +18,6 @@ const MLFLOW_URL = process.env.MLFLOW_URL || "http://mlflow:5000";
const MLFLOW_USER = process.env.MLFLOW_TRACKING_USERNAME || "admin";
const MLFLOW_PASS = process.env.MLFLOW_TRACKING_PASSWORD || "password";
const AIRFLOW_URL = process.env.AIRFLOW_URL || "http://airflow-webserver:8080";
const AIRFLOW_USER = process.env.AIRFLOW_API_USER || "admin";
const AIRFLOW_PASS = process.env.AIRFLOW_API_PASSWORD || "admin";
// Wrapper for MLflow REST calls with Host header fix
async function mlflowFetch(
path: string,
@@ -65,44 +61,6 @@ router.get("/experiments", async (req: Request, res: Response) => {
}
});
// POST /api/bench/run — trigger benchmark DAG
router.post("/run", async (req: Request, res: Response) => {
try {
const config = req.body || {};
const experiment = config.experiment || "tip-bench-admin";
const dagRunUrl = new URL("/api/v1/dags/bench_collect/dagRuns", AIRFLOW_URL);
const auth = Buffer.from(`${AIRFLOW_USER}:${AIRFLOW_PASS}`).toString(
"base64"
);
const response = await fetch(dagRunUrl.toString(), {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Basic ${auth}`,
},
body: JSON.stringify({
conf: config,
dag_run_id: `bench-${Date.now()}`,
}),
});
if (!response.ok) {
throw new Error(`Airflow ${response.status}: ${response.statusText}`);
}
const result = await response.json();
res.json({
status: "triggered",
dag_run_id: result.dag_run_id,
experiment,
});
} catch (err) {
res.status(500).json({ error: String(err) });
}
});
// GET /api/bench/runs/:experiment — list runs in an experiment
router.get("/runs/:experiment", async (req: Request, res: Response) => {
try {