feat(ml): egreedy-v2 shadow policy — D=12 with profile features (#99)

Ship the scaffolding for #99 (phase B.3 of #81):

- ml/serving: add /score/egreedy/v2, /reward/egreedy/v2 and /stats/egreedy/v2 endpoints (D=12). New feature dims: completion/dismiss rates, mean dwell (clipped at 10 min), preferred-hour alignment (cosine, 1-dim), tip volume (log). Separate state file per user (_egreedy_v2.json). /reset clears v2 state too.
- ADR-0012: documents the D=7→12 dimension change, normalization choices, shadow rollout protocol, and promotion gate (offline sim win per ADR-0002).
- recommender.ts: register egreedy-v2-shadow in the shadow-policy map (disabled by default). When enabled, it calls /score/egreedy/v2 fire-and-forget and publishes a shadow:egreedy-v2-shadow serve signal. No reward is sent to the shadow policy — the sim is the gate.
- sim runner/personas: each persona carries synthetic profile_features; _call_score/_call_reward thread profile_features through (None-safe for v1/linucb).
- 18 new Python tests; all 56 Python + 170 TS tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-25 10:00:38 +00:00
parent b8113d4bda
commit 2d7cf217a9
6 changed files with 629 additions and 20 deletions
--- a/ml/serving/tests/test_score.py
+++ b/ml/serving/tests/test_score.py
@@ -6,7 +6,15 @@ import math
 import pytest
 from httpx import AsyncClient, ASGITransport

-from main import app, build_feature_vector
+from main import (
+    app,
+    build_feature_vector,
+    build_feature_vector_12,
+    _norm_dwell,
+    _norm_preferred_hour,
+    _norm_rate,
+    _norm_volume,
+)


 class TestFeatureVector:
@@ -243,6 +251,176 @@ async def test_stats_for_fresh_user():
    assert body["estimated_mean_reward"] == 0.0


+class TestV2Normalization:  # unit tests for the v2 profile-feature normalizers imported from main
+    def test_rate_passthrough(self):  # in-range rates come back unchanged
+        assert _norm_rate(0.0) == 0.0
+        assert _norm_rate(0.42) == 0.42
+        assert _norm_rate(1.0) == 1.0
+
+    def test_rate_none_zero(self):  # a missing rate normalizes to 0.0
+        assert _norm_rate(None) == 0.0
+
+    def test_rate_clipped(self):  # out-of-range rates are clamped into [0, 1]
+        assert _norm_rate(1.5) == 1.0
+        assert _norm_rate(-0.1) == 0.0
+
+    def test_dwell_none_zero(self):  # a missing dwell time normalizes to 0.0
+        assert _norm_dwell(None) == 0.0
+
+    def test_dwell_scales_to_0_1(self):  # dwell is given in milliseconds
+        assert _norm_dwell(0) == 0.0
+        # 600_000 ms (10 min) is the clip ceiling
+        assert _norm_dwell(600_000) == 1.0
+        assert _norm_dwell(1_200_000) == 1.0  # anything above the ceiling saturates at 1.0
+        assert _norm_dwell(60_000) == pytest.approx(0.1)  # one tenth of the ceiling -> 0.1
+
+    def test_volume_monotonic_and_clipped(self):
+        assert _norm_volume(None) == 0.0
+        assert _norm_volume(0) == 0.0
+        assert _norm_volume(10) < _norm_volume(100)  # more tips -> larger normalized value
+        # 100 tips ≈ full saturation
+        assert _norm_volume(100) == pytest.approx(1.0)
+        assert _norm_volume(10_000) == 1.0  # clipped once past the saturation point
+
+    def test_preferred_hour_alignment(self):  # first arg is the preferred hour (see the None case below)
+        # Exact match → 1.0
+        assert _norm_preferred_hour(9, 9) == pytest.approx(1.0)
+        # 12h opposite → 0.0
+        assert _norm_preferred_hour(21, 9) == pytest.approx(0.0, abs=1e-10)
+        # 6h off → 0.5 (cos(π/2) = 0, scaled to 0.5)
+        assert _norm_preferred_hour(15, 9) == pytest.approx(0.5, abs=1e-10)
+
+    def test_preferred_hour_null_neutral(self):
+        # Null preference → neutral 0.5 rather than misleading "alignment at 0"
+        assert _norm_preferred_hour(None, 9) == 0.5
+
+
+class TestFeatureVector12:  # tests for the 12-dim (v2) feature-vector builder
+    def test_shape(self):  # a fully populated input yields a flat 12-element vector
+        v = build_feature_vector_12(
+            {"hour_of_day": 9, "is_overdue": True, "task_age_days": 2, "priority": 3},
+            day_of_week=2,
+            profile={
+                "completion_rate_30d": 0.5,
+                "dismiss_rate_30d": 0.1,
+                "mean_dwell_ms_30d": 60_000,
+                "preferred_hour": 9,
+                "tip_volume_30d": 20,
+            },
+        )
+        assert v.shape == (12,)
+
+    def test_first_seven_match_v1(self):
+        """v2 must reduce to v1-style features on the first 7 dims so rollout
+        behaviour is predictable when profile is absent."""
+        from main import build_feature_vector_7  # imported locally; only this test uses the 7-dim builder
+        feat = {"hour_of_day": 14, "is_overdue": True, "task_age_days": 5, "priority": 2}
+        v1 = build_feature_vector_7(feat, day_of_week=3)
+        v2 = build_feature_vector_12(feat, day_of_week=3, profile=None)
+        assert (v1 == v2[:7]).all()  # element-wise equality on the shared 7-dim prefix
+
+    def test_missing_profile_defaults(self):  # indices 7..11 are the profile-derived dims
+        v = build_feature_vector_12({"hour_of_day": 9}, day_of_week=0, profile=None)
+        # completion, dismiss, dwell, volume → 0; preferred_hour → 0.5 neutral
+        assert v[7] == 0.0  # completion rate
+        assert v[8] == 0.0  # dismiss rate
+        assert v[9] == 0.0  # mean dwell
+        assert v[10] == pytest.approx(0.5)  # preferred-hour alignment (neutral when unknown)
+        assert v[11] == 0.0  # tip volume
+
+
+@pytest.mark.asyncio
+async def test_score_egreedy_v2_returns_candidate():  # happy path: two candidates plus full profile_features
+    payload = {
+        "user_id": "v2-user",
+        "candidates": [
+            {"id": "t:a", "content": "A", "source": "todoist",
+             "features": {"is_overdue": True, "task_age_days": 2, "priority": 3}},
+            {"id": "t:b", "content": "B", "source": "todoist",
+             "features": {"is_overdue": False, "task_age_days": 0, "priority": 1}},
+        ],
+        "context": {"hour_of_day": 9, "day_of_week": 1},
+        "profile_features": {
+            "completion_rate_30d": 0.4,
+            "dismiss_rate_30d": 0.1,
+            "mean_dwell_ms_30d": 45_000,
+            "preferred_hour": 9,
+            "tip_volume_30d": 8,
+        },
+    }
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:  # in-process ASGI call
+        r = await client.post("/score/egreedy/v2", json=payload)
+    assert r.status_code == 200
+    body = r.json()
+    assert body["tip_id"] in {"t:a", "t:b"}  # the chosen tip must be one of the submitted candidates
+    assert body["policy"] == "egreedy-v2"  # response identifies which policy produced the score
+
+
+@pytest.mark.asyncio
+async def test_score_egreedy_v2_accepts_missing_profile():  # profile_features is optional on the score request
+    payload = {
+        "user_id": "v2-no-profile",
+        "candidates": [
+            {"id": "t:solo", "content": "Solo", "source": "todoist",
+             "features": {"is_overdue": False, "task_age_days": 0, "priority": 1}},
+        ],
+        "context": {"hour_of_day": 10, "day_of_week": 0},
+    }  # deliberately no "profile_features" key
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        r = await client.post("/score/egreedy/v2", json=payload)
+    assert r.status_code == 200
+    assert r.json()["tip_id"] == "t:solo"  # only one candidate was offered, so it must be the pick
+
+
+@pytest.mark.asyncio
+async def test_reward_egreedy_v2_updates_stats():  # a posted reward must show up in the v2 stats endpoint
+    user_id = "v2-reward-stats"
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        r0 = await client.get(f"/stats/egreedy/v2/{user_id}")
+        before = r0.json()["cumulative_reward"]  # baseline, so the check below is relative rather than assuming 0
+
+        await client.post("/reward/egreedy/v2", json={  # NOTE(review): response status is not asserted here
+            "user_id": user_id,
+            "tip_id": "t:r",
+            "reward": 1.0,
+            "features": {"hour_of_day": 9, "is_overdue": True, "task_age_days": 2, "priority": 3},
+            "day_of_week": 1,
+            "profile_features": {
+                "completion_rate_30d": 0.3,
+                "dismiss_rate_30d": 0.2,
+                "mean_dwell_ms_30d": 30_000,
+                "preferred_hour": 9,
+                "tip_volume_30d": 5,
+            },
+        })
+        r1 = await client.get(f"/stats/egreedy/v2/{user_id}")
+    body = r1.json()
+    assert body["cumulative_reward"] == pytest.approx(before + 1.0)  # exactly the posted reward was added
+    assert body["policy"] == "egreedy-v2"
+    assert len(body["theta"]) == 12  # one weight per feature dimension (D=12)
+    assert len(body["feature_labels"]) == 12  # and one label per dimension
+
+
+@pytest.mark.asyncio
+async def test_reset_clears_v2_state():  # /reset/{user_id} must also wipe the v2 policy state
+    user_id = "v2-reset"
+    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
+        await client.post("/score/egreedy/v2", json={  # one score call to create some v2 state
+            "user_id": user_id,
+            "candidates": [
+                {"id": "t:v2r", "content": "x", "source": "todoist",
+                 "features": {"is_overdue": False, "task_age_days": 0, "priority": 1}},
+            ],
+            "context": {"hour_of_day": 10, "day_of_week": 0},
+        })
+        r0 = await client.get(f"/stats/egreedy/v2/{user_id}")
+        assert r0.json()["pulls"] >= 1  # the score call above is expected to register at least one pull
+
+        await client.post(f"/reset/{user_id}")  # shared reset route, not a v2-specific one
+        r1 = await client.get(f"/stats/egreedy/v2/{user_id}")
+    assert r1.json()["pulls"] == 0  # v2 counters are back to zero after the reset
+
+
@pytest.mark.asyncio
 async def test_reward_negative_value():
    """Dismissing a tip should decrease cumulative_reward."""