feat: MLOps external services, AI stack planning, admin MLOps hub

Infrastructure:
- Add `mlops` compose profile: MLflow (basic-auth, /mlflow path) + Airflow (LocalExecutor, /airflow path) + airflow-db
- infra/mlflow/basic_auth.ini for MLflow auth config
- Caddy routes /mlflow* and /airflow* inside existing o.alogins.net block (see agap_git)
- Dockerfile.admin: NEXT_PUBLIC_MLFLOW_URL / NEXT_PUBLIC_AIRFLOW_URL build args (default /mlflow, /airflow)

Admin panel:
- /admin/models: replace MLflow iframe with external link cards
- /admin/experiments: replace LinUCB stats with MLOps hub (links to MLflow experiments/models + Airflow DAGs/datasets)
- AdminShell: external nav links for MLflow ↗ and Airflow ↗ under MLOps section

Docs & planning:
- README: new AI stack section (Ollama/LiteLLM/OpenWebUI three-tier, tip generation pipeline, model aliases)
- README: Phase 2 expanded with AI infra issues (#86-#93) and granular pipeline breakdown
- README: Phase 4 expanded with LLM MLOps items (#94-#97)
- CLAUDE.md: AI stack section, updated current phase (M1 shipped / M2 in progress), compose profiles, updated What NOT to do
- docs/architecture/overview.md: AI stack section, updated decision flow diagram for Phase 2 LLM pipeline
- ADR-0006: updated to reflect external services (path-based, not embedded)
- Gitea issues #86-#97 created (M2: AI infra + pipeline; M4: LLM MLOps)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 08:20:44 +00:00
parent faf44c18fc
commit 85367aeaa0
25 changed files with 695 additions and 222 deletions

View File

@@ -65,7 +65,17 @@ async function fetchTodoistTasks(userId: string, accessToken: string): Promise<C
headers: { Authorization: `Bearer ${accessToken}` },
});
if (!res.ok) return cached?.tasks ?? [];
if (!res.ok) {
if (res.status === 401) {
console.error(`[todoist] token expired for user ${userId}`);
bus.publish('signals.integration.token_expired', {
userId,
provider: 'todoist',
detectedAt: new Date().toISOString(),
});
}
return cached?.tasks ?? [];
}
const body = (await res.json()) as {
results: Array<{
@@ -230,18 +240,20 @@ router.post('/recommend', requireAuth, async (req: AuthenticatedRequest, res: Re
// ---------------------------------------------------------------------------
// Reward inference from action + dwell time
//
// Feedback is one of 5 signals: done / snooze / dismiss / helpful / not_helpful.
// "Helpfulness" is inferred from how long the user took to act on a tip:
// dismiss → -1.0 (clear rejection)
// snooze → +0.1 (tip noticed, timing off — mild positive)
// helpful → +0.5 (explicit positive signal)
// not_helpful → -0.5 (explicit negative signal)
// done < 15 s → -0.3 (almost certainly a stale task, not magic)
// done 15 s – 2 min → +1.0 (magic zone: user saw tip and acted)
// done 2 – 10 min → +0.6 (good: user engaged, acted in same session)
// done > 10 min → +0.3 (eventually done; tip may have helped, unclear)
// ---------------------------------------------------------------------------
function inferReward(action: string, dwellMs: number | null): number {
if (action === 'dismiss') return -1.0;
if (action === 'snooze') return 0.1;
if (action === 'dismiss') return -1.0;
if (action === 'snooze') return 0.1;
if (action === 'helpful') return 0.5;
if (action === 'not_helpful') return -0.5;
// done — use dwell time
if (dwellMs === null || dwellMs < 0) return 0.5; // unknown dwell: neutral positive
if (dwellMs < 15_000) return -0.3; // stale / reflex
@@ -250,6 +262,51 @@ function inferReward(action: string, dwellMs: number | null): number {
return 0.3; // eventually
}
// ---------------------------------------------------------------------------
// Reward delivery with retry (bug #75 — was fire-and-forget)
// ---------------------------------------------------------------------------
/**
 * POSTs a reward observation to `${config.ML_SERVING_URL}/reward/egreedy`,
 * retrying when the request fails.
 *
 * Up to three attempts are made. Any non-2xx response, network error, or
 * per-request timeout (3 s) counts as a failure. Failed attempts are followed
 * by an exponential backoff of 500 ms, then 1000 ms. When the final attempt
 * fails, the error is logged and a `signals.tip.reward_failed` event is
 * published on the bus; delivery failures are reported that way rather than
 * by rejecting the returned promise.
 *
 * @param userId   User the reward is attributed to (sent as `user_id`).
 * @param tipId    Tip the feedback refers to (sent as `tip_id`).
 * @param reward   Numeric reward value forwarded unchanged.
 * @param features Task features forwarded unchanged in the request body.
 * @returns A promise that resolves once the reward was accepted or all
 *          attempts have been exhausted.
 */
async function sendRewardWithRetry(
  userId: string,
  tipId: string,
  reward: number,
  features: TaskFeatures,
): Promise<void> {
  const maxAttempts = 3;
  const requestTimeoutMs = 3000;
  const baseBackoffMs = 250;

  // Serialised once and reused for every attempt, so `day_of_week`
  // (Date#getDay: 0 = Sunday, process-local time) reflects the moment the
  // feedback was received, not the moment a retry happens to succeed.
  const body = JSON.stringify({
    user_id: userId,
    tip_id: tipId,
    reward,
    features,
    day_of_week: new Date().getDay(),
  });

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const res = await fetch(`${config.ML_SERVING_URL}/reward/egreedy`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body,
        signal: AbortSignal.timeout(requestTimeoutMs),
      });
      if (res.ok) return;
      // Route HTTP-level failures through the same retry path as network errors.
      throw new Error(`HTTP ${res.status}`);
    } catch (err: unknown) {
      // Narrow before reading `.message`: a thrown value is not guaranteed to
      // be an Error, and this function is called without `await`, so a
      // TypeError raised here would surface as an unhandled rejection.
      const message = err instanceof Error ? err.message : String(err);

      if (attempt === maxAttempts) {
        console.error(`[reward] failed after ${maxAttempts} attempts for tip ${tipId}: ${message}`);
        bus.publish('signals.tip.reward_failed', {
          userId,
          tipId,
          reward,
          attempts: maxAttempts,
          error: message,
          failedAt: new Date().toISOString(),
        });
        return;
      }

      // Exponential backoff: 250 * 2^attempt → 500 ms, then 1000 ms.
      await new Promise<void>((resolve) => setTimeout(resolve, baseBackoffMs * 2 ** attempt));
    }
  }
}
// ---------------------------------------------------------------------------
// POST /api/tip/:id/feedback
// ---------------------------------------------------------------------------
@@ -258,7 +315,7 @@ router.post('/tip/:id/feedback', requireAuth, async (req: AuthenticatedRequest,
const tipId = String(req.params.id);
const now = new Date();
const validActions = ['done', 'dismiss', 'snooze'];
const validActions = ['done', 'dismiss', 'snooze', 'helpful', 'not_helpful'];
if (!validActions.includes(action)) {
res.status(400).json({ error: 'Invalid action' });
return;
@@ -297,25 +354,14 @@ router.post('/tip/:id/feedback', requireAuth, async (req: AuthenticatedRequest,
bus.publish('signals.tip.feedback', {
userId: req.userId!,
tipId,
action: action as 'done' | 'dismiss' | 'snooze',
action: action as 'done' | 'dismiss' | 'snooze' | 'helpful' | 'not_helpful',
reward,
dwellMs,
createdAt: now.toISOString(),
});
if (task) {
// Send reward to egreedy-v1 (active policy — ADR-0007)
fetch(`${config.ML_SERVING_URL}/reward/egreedy`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
user_id: req.userId!,
tip_id: tipId,
reward,
features: task.features,
day_of_week: new Date().getDay(),
}),
}).catch(() => {});
sendRewardWithRetry(req.userId!, tipId, reward, task.features);
}
// Mark complete in Todoist if done