feat(agents): p50-lateness tolerance + per-project realness for overdue-task (#115)

Replaces snooze-rate heuristic with p50 of actual task lateness (completedAt − dueAt).
Adds project_realness inference: projects with chronic lateness get realness < 1 and
the agent softens its snippet language from "overdue" to "past target date".

- TaskCompletion added to UserHistory with lateness_days computed property
- _infer_lateness_tolerance: p50 (median) of lateness_days across task_completions, clipped to a minimum of 0, returned as a float
- _infer_project_realness: per-project median lateness normalised by global median
- Both InferredParams use a 7d TTL; cold-start values are 0.0 (lateness_tolerance_days) and {} (project_realness)
- AgentInferRequest accepts task_completions; endpoint wires them through
- 12 new tests covering punctual/chronic/mixed users and language softening
- Agent bumped to v1.2.0

Closes #115

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-06 05:14:04 +00:00
parent 35257b7756
commit 04212ff318
5 changed files with 210 additions and 60 deletions

View File

@@ -8,7 +8,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from datetime import datetime, timezone
import pytest
from ml.agents.inference.history import FeedbackEvent, UserHistory
from ml.agents.inference.history import FeedbackEvent, TaskCompletion, UserHistory
from ml.agents.inference.framework import run_inference
from ml.agents.momentum import MomentumAgent, MANIFEST as MOMENTUM_MANIFEST
from ml.agents.overdue_task import OverdueTaskAgent, MANIFEST as OVERDUE_MANIFEST
@@ -32,8 +32,20 @@ def _event(action: str, days_ago: float = 1.0) -> FeedbackEvent:
return FeedbackEvent(action=action, dwell_ms=dwell, created_at=ts)
def _history(*events: FeedbackEvent) -> UserHistory:
return UserHistory(user_id="u1", events=list(events))
def _history(*events: FeedbackEvent, completions: list[TaskCompletion] | None = None) -> UserHistory:
    """Assemble a UserHistory for user ``u1`` from feedback events.

    Args:
        *events: Feedback events, stored in the order given.
        completions: Optional task completions; a missing or empty value
            is stored as an empty list.
    """
    task_completions = completions if completions else []
    return UserHistory(
        user_id="u1",
        events=[*events],
        task_completions=task_completions,
    )
def _completion(project_id: str | None, lateness_days: float) -> TaskCompletion:
    """Create a TaskCompletion finished ``lateness_days`` after its due date.

    The due date is pinned to 30 days before ``_NOW``. A negative
    ``lateness_days`` therefore produces a completion that precedes the due
    date. Both timestamps are handed over as ISO-8601 strings.
    """
    from datetime import timedelta

    due_at = _NOW - timedelta(days=30)
    offset = timedelta(days=lateness_days)
    completed_at = due_at + offset
    return TaskCompletion(
        project_id=project_id,
        completed_at=completed_at.isoformat(),
        due_at=due_at.isoformat(),
    )
# ── momentum: engagement_trend ───────────────────────────────────────────────
@@ -82,49 +94,94 @@ class TestMomentumInference:
assert MOMENTUM_MANIFEST.version == "1.1.0"
# ── overdue-task: lateness_tolerance_days ────────────────────────────────────
# ── overdue-task: lateness_tolerance_days + project_realness (#115) ──────────
class TestOverdueTaskInference:
def test_cold_start_returns_zero(self):
history = _history(*[_event("done") for _ in range(5)])
result = run_inference(OVERDUE_MANIFEST, history)
assert result["lateness_tolerance_days"] == 0
# -- lateness_tolerance_days inference --
def test_high_snooze_rate_returns_two(self):
events = [_event("snooze")] * 8 + [_event("done")] * 2
history = _history(*events)
def test_cold_start_returns_zero_when_few_completions(self):
# Below min_history=10 task completions → cold start
cs = [_completion("p1", 2.0) for _ in range(5)]
history = _history(*[_event("done")] * 5, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["lateness_tolerance_days"] == 2
assert result["lateness_tolerance_days"] == 0.0
def test_moderate_snooze_returns_one(self):
events = [_event("snooze")] * 3 + [_event("done")] * 7
history = _history(*events)
def test_punctual_user_zero_tolerance(self):
# User always finishes early or on time (negative lateness) → tolerance 0
cs = [_completion("p1", -1.0) for _ in range(12)]
history = _history(*[_event("done")] * 12, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["lateness_tolerance_days"] == 1
assert result["lateness_tolerance_days"] == 0.0
def test_low_snooze_returns_zero(self):
events = [_event("done")] * 9 + [_event("snooze")] * 1
history = _history(*events)
def test_chronic_late_user_positive_tolerance(self):
# User consistently finishes 5 days late → p50 = 5
cs = [_completion("p1", 5.0) for _ in range(12)]
history = _history(*[_event("done")] * 12, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["lateness_tolerance_days"] == 0
assert result["lateness_tolerance_days"] == pytest.approx(5.0)
def test_mixed_lateness_uses_median(self):
# 6 tasks at +1d, 6 tasks at +3d → median = 2
cs = [_completion("p1", 1.0)] * 6 + [_completion("p1", 3.0)] * 6
history = _history(*[_event("done")] * 12, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["lateness_tolerance_days"] == pytest.approx(2.0)
# -- project_realness inference --
def test_project_realness_cold_start_empty(self):
cs = [_completion("p1", 1.0) for _ in range(5)] # below min_history
history = _history(*[_event("done")] * 5, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["project_realness"] == {}
def test_project_realness_punctual_project_scores_high(self):
# p1 always on time (0d late), p2 always 10d late → p1 should be realness ≈ 1
cs = [_completion("p1", 0.0)] * 6 + [_completion("p2", 10.0)] * 6
history = _history(*[_event("done")] * 12, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
assert result["project_realness"]["p1"] > result["project_realness"]["p2"]
def test_project_realness_values_clipped_01(self):
cs = [_completion("p1", 0.0)] * 6 + [_completion("p2", 100.0)] * 6
history = _history(*[_event("done")] * 12, completions=cs)
result = run_inference(OVERDUE_MANIFEST, history)
for v in result["project_realness"].values():
assert 0.0 <= v <= 1.0
# -- compute() reads inferred prefs --
def test_tolerance_filters_tasks(self):
tasks = [
{"content": "Fresh overdue", "is_overdue": True, "task_age_days": 0.5},
{"content": "Old overdue", "is_overdue": True, "task_age_days": 3.0},
]
# tolerance=2 → only the 3-day task should count
out = OverdueTaskAgent().compute(_inp(tasks=tasks, agent_prefs={"lateness_tolerance_days": 2}))
assert "1 overdue task" in out.prompt_text
assert "Old overdue" in out.prompt_text
def test_snapshot_includes_tolerance(self):
tasks = [{"content": "T", "is_overdue": True, "task_age_days": 1.0}]
out = OverdueTaskAgent().compute(_inp(tasks=tasks, agent_prefs={"lateness_tolerance_days": 0}))
assert "lateness_tolerance_days" in out.signals_snapshot
def test_low_realness_softens_language(self):
tasks = [{"content": "Wishlist", "is_overdue": True, "task_age_days": 3.0,
"project_id": "aspirational"}]
prefs = {"lateness_tolerance_days": 0, "project_realness": {"aspirational": 0.2}}
out = OverdueTaskAgent().compute(_inp(tasks=tasks, agent_prefs=prefs))
assert "target date" in out.prompt_text
def test_high_realness_uses_overdue_language(self):
tasks = [{"content": "Critical", "is_overdue": True, "task_age_days": 3.0,
"project_id": "work"}]
prefs = {"lateness_tolerance_days": 0, "project_realness": {"work": 0.9}}
out = OverdueTaskAgent().compute(_inp(tasks=tasks, agent_prefs=prefs))
assert "overdue" in out.prompt_text
def test_snapshot_includes_realness(self):
tasks = [{"content": "T", "is_overdue": True, "task_age_days": 1.0, "project_id": "p1"}]
prefs = {"lateness_tolerance_days": 0, "project_realness": {"p1": 0.8}}
out = OverdueTaskAgent().compute(_inp(tasks=tasks, agent_prefs=prefs))
assert "realness" in out.signals_snapshot["top_overdue"][0]
def test_version_bumped(self):
assert OVERDUE_MANIFEST.version == "1.1.0"
assert OVERDUE_MANIFEST.version == "1.2.0"
# ── recent-patterns: window_days ─────────────────────────────────────────────