feat: ε-greedy v1 as active policy; dwell-time reward inference; offline sim framework

- Promote egreedy-v1 to active serving policy (ADR-0007): /score/egreedy + /reward/egreedy replace the linucb-v1 endpoints after offline sim shows +10.7% mean reward (−0.548 vs −0.606)
- Replace explicit helpful/not_helpful feedback with dwell-time inferred reward (inferReward): dismiss=−1.0, snooze=+0.1, done<15s=−0.3, done 15s–2min=+1.0, done 2–10min=+0.6, done>10min=+0.3
- Add ml/serving ε-greedy endpoints: /score/egreedy, /reward/egreedy, /stats/egreedy/{user_id} with d=7 feature vector (base 5 + sin/cos day-of-week encoding)
- Add offline simulation framework (ml/experiments/sim): rule/LLM/claude-code judges, two-phase score+reward, synthetic personas, task generator; results stored in sim_runs/sim_events
- Add /admin/simulations page: start runs, live-poll status, reward curve SVG, action/persona tables
- Fix egreedy day_of_week training skew: reward endpoint now uses actual dow instead of hardcoded 0
- Fix runner.py proxy bypass: httpx.Client(trust_env=False) for localhost ML calls
- Add dwellMs to TipFeedbackEvent contract and bus.test.ts fixture
- Schema: sim_runs, sim_events tables; tip_feedback gains dwell_ms, reward_milli columns
- ADR-0006: admin console framework; ADR-0007: egreedy-v1 policy selection rationale

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:44:37 +00:00
parent c5ea18ec6e
commit faf44c18fc
48 changed files with 6151 additions and 40 deletions
--- a/ml/serving/tests/test_score.py
+++ b/ml/serving/tests/test_score.py
@@ -0,0 +1,261 @@
+"""
+Unit tests for ml/serving — feature building and scoring contract.
+Run with: pytest ml/serving/tests/
+"""
+import math
+import pytest
+from httpx import AsyncClient, ASGITransport
+
+from main import app, build_feature_vector
+
+
+class TestFeatureVector:
+    """Pin down the layout and scaling of ``build_feature_vector``.
+
+    Slots exercised here: 0 = sin(hour), 1 = cos(hour), 2 = overdue flag,
+    3 = task age scaled against a 30-day cap, 4 = priority mapped from 1..4
+    onto 0..1.
+    """
+
+    # Absolute tolerance used for the trigonometric hour encoding.
+    TOL = 1e-10
+
+    def test_shape(self):
+        """A fully specified feature dict yields a 5-element vector."""
+        features = {"hour_of_day": 8, "is_overdue": True, "task_age_days": 3, "priority": 3}
+        vec = build_feature_vector(features)
+        assert vec.shape == (5,)
+
+    def test_hour_encoding_noon(self):
+        """Hour 12 sits at angle π: sine ≈ 0 and cosine = -1."""
+        vec = build_feature_vector({"hour_of_day": 12})
+        assert abs(vec[0]) < self.TOL
+        assert abs(vec[1] + 1.0) < self.TOL
+
+    def test_hour_encoding_midnight(self):
+        """Hour 0 sits at angle 0: sine = 0 and cosine = 1."""
+        vec = build_feature_vector({"hour_of_day": 0})
+        assert abs(vec[0]) < self.TOL
+        assert abs(vec[1] - 1.0) < self.TOL
+
+    def test_hour_encoding_6am(self):
+        """Hour 6 sits at angle π/2: sine = 1 and cosine ≈ 0."""
+        vec = build_feature_vector({"hour_of_day": 6})
+        assert abs(vec[0] - 1.0) < self.TOL
+        assert abs(vec[1]) < self.TOL
+
+    def test_age_clipped_at_30(self):
+        """Ages beyond 30 days saturate at the same value as exactly 30 days."""
+        beyond_cap = build_feature_vector({"task_age_days": 100})[3]
+        at_cap = build_feature_vector({"task_age_days": 30})[3]
+        assert beyond_cap == at_cap
+        assert at_cap == 1.0
+
+    def test_age_zero(self):
+        """A brand-new task has a normalised age of 0."""
+        age_slot = build_feature_vector({"task_age_days": 0})[3]
+        assert age_slot == pytest.approx(0.0)
+
+    def test_age_15_days_normalised(self):
+        """Fifteen days is half of the 30-day cap."""
+        age_slot = build_feature_vector({"task_age_days": 15})[3]
+        assert age_slot == pytest.approx(0.5)
+
+    def test_priority_normalised(self):
+        """Priority 1 maps to 0 and priority 4 maps to 1."""
+        vectors = {level: build_feature_vector({"priority": level}) for level in (1, 4)}
+        assert vectors[1][4] == pytest.approx(0.0)
+        assert vectors[4][4] == pytest.approx(1.0)
+
+    def test_priority_2_and_3(self):
+        """Intermediate priorities land on thirds of the unit interval."""
+        vectors = {level: build_feature_vector({"priority": level}) for level in (2, 3)}
+        assert vectors[2][4] == pytest.approx(1 / 3)
+        assert vectors[3][4] == pytest.approx(2 / 3)
+
+    def test_is_overdue_true(self):
+        """An overdue task sets the flag slot to 1."""
+        flag = build_feature_vector({"is_overdue": True})[2]
+        assert flag == 1.0
+
+    def test_is_overdue_false(self):
+        """A task that is not overdue leaves the flag slot at 0."""
+        flag = build_feature_vector({"is_overdue": False})[2]
+        assert flag == 0.0
+
+    def test_defaults_when_no_keys(self):
+        """An empty dict falls back to hour=12, not overdue, age 0, priority 1."""
+        vec = build_feature_vector({})
+        # Default hour is noon, so the encoding is (sin π, cos π) ≈ (0, -1).
+        assert abs(vec[0]) < self.TOL
+        assert abs(vec[1] + 1.0) < self.TOL
+        # Remaining slots: overdue flag off, zero age, lowest priority → (1-1)/3.
+        assert vec[2] == 0.0
+        assert vec[3] == 0.0
+        assert vec[4] == 0.0
+
+
+@pytest.mark.asyncio
+async def test_health():
+    """GET /health answers 200 with a JSON body whose ``ok`` field is ``True``."""
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.get("/health")
+    assert response.status_code == 200
+    health = response.json()
+    assert health["ok"] is True
+
+
+@pytest.mark.asyncio
+async def test_score_returns_a_candidate():
+    """POST /score with two candidates returns one of them, tagged linucb-v1, with a float score."""
+    overdue_task = {
+        "id": "t:1", "content": "Task A", "source": "todoist", "source_id": "1",
+        "features": {"is_overdue": True, "task_age_days": 2, "priority": 3},
+    }
+    fresh_task = {
+        "id": "t:2", "content": "Task B", "source": "todoist", "source_id": "2",
+        "features": {"is_overdue": False, "task_age_days": 0, "priority": 1},
+    }
+    request_body = {
+        "user_id": "test-user",
+        "candidates": [overdue_task, fresh_task],
+        "context": {"hour_of_day": 9, "day_of_week": 1},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.post("/score", json=request_body)
+    assert response.status_code == 200
+    result = response.json()
+    assert result["tip_id"] in {"t:1", "t:2"}
+    assert "policy" in result
+    assert result["policy"] == "linucb-v1"
+    assert isinstance(result["score"], float)
+
+
+@pytest.mark.asyncio
+async def test_score_single_candidate_always_selected():
+    """A lone candidate leaves the policy nothing to choose between, so it is the tip."""
+    sole_candidate = {
+        "id": "only:1", "content": "Only task", "source": "todoist",
+        "features": {"is_overdue": False, "task_age_days": 0, "priority": 1},
+    }
+    request_body = {
+        "user_id": "solo-user",
+        "candidates": [sole_candidate],
+        "context": {"hour_of_day": 10, "day_of_week": 0},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.post("/score", json=request_body)
+    assert response.status_code == 200
+    chosen = response.json()["tip_id"]
+    assert chosen == "only:1"
+
+
+@pytest.mark.asyncio
+async def test_score_empty_candidates_returns_422():
+    """POST /score with an empty candidate list is rejected with HTTP 422."""
+    request_body = {
+        "user_id": "u",
+        "candidates": [],
+        "context": {"hour_of_day": 9, "day_of_week": 1},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.post("/score", json=request_body)
+    assert response.status_code == 422
+
+
+@pytest.mark.asyncio
+async def test_reward_accepted():
+    """POST /reward with a well-formed payload answers 200 and ``ok: true``."""
+    tip_features = {"hour_of_day": 9, "is_overdue": True, "task_age_days": 2, "priority": 3}
+    request_body = {
+        "user_id": "reward-user",
+        "tip_id": "t:1",
+        "reward": 1.0,
+        "features": tip_features,
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.post("/reward", json=request_body)
+    assert response.status_code == 200
+    acknowledgement = response.json()
+    assert acknowledgement["ok"] is True
+
+
+@pytest.mark.asyncio
+async def test_reward_updates_stats():
+    """A +1.0 reward raises the user's ``cumulative_reward`` in /stats by exactly 1.0."""
+    user_id = "reward-stats-user"
+    stats_url = f"/stats/{user_id}"
+    reward_body = {
+        "user_id": user_id,
+        "tip_id": "tip:x",
+        "reward": 1.0,
+        "features": {"hour_of_day": 8, "is_overdue": False, "task_age_days": 0, "priority": 2},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        # Read the baseline first so the check is relative to whatever state
+        # this user already has.
+        baseline = (await http.get(stats_url)).json()["cumulative_reward"]
+        await http.post("/reward", json=reward_body)
+        after = await http.get(stats_url)
+    assert after.json()["cumulative_reward"] == pytest.approx(baseline + 1.0)
+
+
+@pytest.mark.asyncio
+async def test_score_increments_pulls():
+    """Two /score calls raise the user's ``pulls`` counter in /stats by two."""
+    user_id = "pull-counter-user"
+    stats_url = f"/stats/{user_id}"
+    request_body = {
+        "user_id": user_id,
+        "candidates": [
+            {
+                "id": "t:p1", "content": "Pull task", "source": "todoist",
+                "features": {"is_overdue": False, "task_age_days": 1, "priority": 2},
+            },
+        ],
+        "context": {"hour_of_day": 10, "day_of_week": 2},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        starting_pulls = (await http.get(stats_url)).json()["pulls"]
+
+        for _ in range(2):
+            await http.post("/score", json=request_body)
+
+        after = await http.get(stats_url)
+    assert after.json()["pulls"] == starting_pulls + 2
+
+
+@pytest.mark.asyncio
+async def test_reset_clears_state():
+    """POST /reset/{user_id} returns the user's pull counter to zero.
+
+    The test first confirms that the setup /score call succeeded and left a
+    non-zero pull count. Without that check a failed /score would leave the
+    counter at its fresh-user value of 0, and the final assertion would pass
+    without /reset having done anything.
+    """
+    user_id = "reset-user"
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        # Score once to build state
+        r_score = await client.post("/score", json={
+            "user_id": user_id,
+            "candidates": [
+                {"id": "t:r", "content": "Reset task", "source": "todoist",
+                 "features": {"is_overdue": True, "task_age_days": 5, "priority": 4}},
+            ],
+            "context": {"hour_of_day": 14, "day_of_week": 3},
+        })
+        assert r_score.status_code == 200
+
+        # Guard against a vacuous pass: there must be state for /reset to clear.
+        r_before = await client.get(f"/stats/{user_id}")
+        assert r_before.json()["pulls"] >= 1
+
+        r_reset = await client.post(f"/reset/{user_id}")
+        assert r_reset.json()["ok"] is True
+
+        r_stats = await client.get(f"/stats/{user_id}")
+    assert r_stats.json()["pulls"] == 0
+
+
+@pytest.mark.asyncio
+async def test_features_endpoint_returns_history():
+    """After one /score call, /features/{user_id} exposes a history entry with ts, score and tip_id."""
+    user_id = "features-user"
+    request_body = {
+        "user_id": user_id,
+        "candidates": [
+            {
+                "id": "t:f1", "content": "Feature task", "source": "todoist",
+                "features": {"is_overdue": False, "task_age_days": 0, "priority": 1},
+            },
+        ],
+        "context": {"hour_of_day": 7, "day_of_week": 0},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        await http.post("/score", json=request_body)
+        response = await http.get(f"/features/{user_id}")
+    listing = response.json()
+    assert response.status_code == 200
+    assert "history" in listing
+    assert len(listing["history"]) >= 1
+    latest = listing["history"][-1]
+    for field in ("ts", "score", "tip_id"):
+        assert field in latest
+
+
+@pytest.mark.asyncio
+async def test_stats_for_fresh_user():
+    """/stats for a never-seen user answers 200 with zeroed counters instead of an error."""
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        response = await http.get("/stats/brand-new-user-xyz-abc")
+    stats = response.json()
+    assert response.status_code == 200
+    assert stats["pulls"] == 0
+    assert stats["cumulative_reward"] == 0.0
+    assert stats["estimated_mean_reward"] == 0.0
+
+
+@pytest.mark.asyncio
+async def test_reward_negative_value():
+    """A -1.0 reward (a dismissed tip) lowers ``cumulative_reward`` by exactly 1.0."""
+    user_id = "dismiss-user-neg"
+    stats_url = f"/stats/{user_id}"
+    reward_body = {
+        "user_id": user_id,
+        "tip_id": "t:neg",
+        "reward": -1.0,
+        "features": {"hour_of_day": 20, "is_overdue": False, "task_age_days": 0, "priority": 1},
+    }
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://test") as http:
+        baseline = (await http.get(stats_url)).json()["cumulative_reward"]
+        await http.post("/reward", json=reward_body)
+        after = await http.get(stats_url)
+    assert after.json()["cumulative_reward"] == pytest.approx(baseline - 1.0)