Files
oO/ml/agents/tests/test_per_agent_inference.py
alvis afb0e9b0cb feat(agents): per-agent inference — momentum, overdue-task, recent-patterns, focus-area (ADR-0014 step 7)
All four agents bumped to v1.1.0.

momentum (#114): infers engagement_trend ('up'|'stable'|'down') by comparing
done-rate in the last 7 days vs the prior 7 days. Agent surfaces the trend
in its snippet ("trending up — build on the momentum").

overdue-task (#115): infers lateness_tolerance_days (0/1/2) from snooze rate.
Agent now filters tasks against the tolerance so low-urgency users aren't
nagged about tasks that are only hours overdue.

recent-patterns (#116): infers window_days (7/14/30) from feedback event
density — sparse users get a wider window so the snippet isn't always empty.

focus-area (#113): no inferred params (project-level feedback linkage needed,
tracked under #78). preferred_areas pref was declared but ignored; agent now
honours it as a tiebreaker and mentions it in the snippet.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 11:21:10 +00:00

214 lines
9.4 KiB
Python

"""Per-agent inference tests: momentum (#114), overdue-task (#115), recent-patterns (#116),
and focus-area (#113) preferred_areas wiring."""
from __future__ import annotations
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
from datetime import datetime, timezone
import pytest
from ml.agents.inference.history import FeedbackEvent, UserHistory
from ml.agents.inference.framework import run_inference
from ml.agents.momentum import MomentumAgent, MANIFEST as MOMENTUM_MANIFEST
from ml.agents.overdue_task import OverdueTaskAgent, MANIFEST as OVERDUE_MANIFEST
from ml.agents.recent_patterns import RecentPatternsAgent, MANIFEST as RECENT_MANIFEST
from ml.agents.focus_area import FocusAreaAgent
from ml.agents.base import AgentInput
# Fixed, timezone-aware (UTC) reference instant. Fixture timestamps are computed
# relative to it and it is the default ``now`` passed to agents by ``_inp``.
_NOW = datetime(2026, 5, 8, 14, 0, 0, tzinfo=timezone.utc)
def _inp(**kwargs) -> AgentInput:
    """Build an ``AgentInput`` for user ``u1``; keyword arguments override the defaults.

    Defaults are: no tasks, an empty profile, empty ``agent_prefs`` and
    ``now`` pinned to ``_NOW``. Any extra keyword is forwarded unchanged.
    """
    fields = {
        "user_id": "u1",
        "tasks": [],
        "profile": {},
        "now": _NOW,
        "agent_prefs": {},
        **kwargs,  # caller-supplied values win over the defaults above
    }
    return AgentInput(**fields)
def _event(action: str, days_ago: float = 1.0) -> FeedbackEvent:
    """Create a ``FeedbackEvent`` timestamped ``days_ago`` days before ``_NOW``.

    ``"done"`` events get a dwell time of 60 000 ms; every other action gets
    500 ms. ``created_at`` is the ISO-8601 string of the computed instant.
    """
    from datetime import timedelta

    if action == "done":
        dwell_ms = 60_000
    else:
        dwell_ms = 500
    happened_at = _NOW - timedelta(days=days_ago)
    return FeedbackEvent(
        action=action,
        dwell_ms=dwell_ms,
        created_at=happened_at.isoformat(),
    )
def _history(*events: FeedbackEvent) -> UserHistory:
    """Wrap the given events, in the order passed, in a ``UserHistory`` for user ``u1``."""
    return UserHistory(user_id="u1", events=[*events])
# ── momentum: engagement_trend ───────────────────────────────────────────────
class TestMomentumInference:
    """Momentum agent (#114): inferred ``engagement_trend`` and how the agent reports it.

    The inference tests feed a synthetic ``UserHistory`` to ``run_inference``
    with the momentum manifest; the agent tests pass the trend directly via
    ``agent_prefs`` and inspect the computed output.
    """

    def test_cold_start_below_min_history(self):
        # Only five events in the history: the cold-start default is expected.
        history = _history(*[_event("done", days_ago=i) for i in range(5)])
        result = run_inference(MOMENTUM_MANIFEST, history)
        assert result["engagement_trend"] == "stable"  # cold_start_default

    def test_trend_up_when_recent_done_rate_higher(self):
        # Days 1-7: seven "done" events. Days 8-14: one "done" and six
        # "dismiss" events. The recent range stops at day 7 so that no event
        # spills into the prior week or duplicates its day-8 timestamp.
        recent = [_event("done", days_ago=i) for i in range(1, 8)]
        older = [_event("dismiss", days_ago=i) for i in range(8, 15)]
        older[0] = _event("done", days_ago=8)  # one done in older window
        history = _history(*recent, *older)
        result = run_inference(MOMENTUM_MANIFEST, history)
        assert result["engagement_trend"] == "up"

    def test_trend_down_when_recent_done_rate_lower(self):
        # Days 1-7 are all dismissals, days 8-14 are all completions.
        recent = [_event("dismiss", days_ago=i) for i in range(1, 8)]
        older = [_event("done", days_ago=i) for i in range(8, 15)]
        history = _history(*recent, *older)
        result = run_inference(MOMENTUM_MANIFEST, history)
        assert result["engagement_trend"] == "down"

    def test_trend_stable_when_similar(self):
        # Alternate done/dismiss by day parity across days 1-14.
        events = [
            _event("done" if i % 2 == 0 else "dismiss", days_ago=i)
            for i in range(1, 15)
        ]
        history = _history(*events)
        result = run_inference(MOMENTUM_MANIFEST, history)
        assert result["engagement_trend"] == "stable"

    def test_agent_uses_trend_in_snippet(self):
        out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "up"}))
        assert "trending up" in out.prompt_text

    def test_agent_uses_down_trend_in_snippet(self):
        out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "down"}))
        assert "trending down" in out.prompt_text

    def test_snapshot_includes_trend(self):
        out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "stable"}))
        assert "engagement_trend" in out.signals_snapshot

    def test_version_bumped(self):
        assert MOMENTUM_MANIFEST.version == "1.1.0"
# ── overdue-task: lateness_tolerance_days ────────────────────────────────────
class TestOverdueTaskInference:
    """Overdue-task agent (#115): inferred ``lateness_tolerance_days`` and task filtering.

    Histories are built from distinct ``FeedbackEvent`` instances (one
    ``_event`` call per entry) rather than by repeating a single shared object.
    """

    def test_cold_start_returns_zero(self):
        # Five events only: the result expected here is 0.
        history = _history(*[_event("done") for _ in range(5)])
        result = run_inference(OVERDUE_MANIFEST, history)
        assert result["lateness_tolerance_days"] == 0

    def test_high_snooze_rate_returns_two(self):
        # 8 of 10 events are snoozes.
        events = [_event("snooze") for _ in range(8)] + [_event("done") for _ in range(2)]
        history = _history(*events)
        result = run_inference(OVERDUE_MANIFEST, history)
        assert result["lateness_tolerance_days"] == 2

    def test_moderate_snooze_returns_one(self):
        # 3 of 10 events are snoozes.
        events = [_event("snooze") for _ in range(3)] + [_event("done") for _ in range(7)]
        history = _history(*events)
        result = run_inference(OVERDUE_MANIFEST, history)
        assert result["lateness_tolerance_days"] == 1

    def test_low_snooze_returns_zero(self):
        # 1 of 10 events is a snooze.
        events = [_event("done") for _ in range(9)] + [_event("snooze") for _ in range(1)]
        history = _history(*events)
        result = run_inference(OVERDUE_MANIFEST, history)
        assert result["lateness_tolerance_days"] == 0

    def test_tolerance_filters_tasks(self):
        tasks = [
            {"content": "Fresh overdue", "is_overdue": True, "task_age_days": 0.5},
            {"content": "Old overdue", "is_overdue": True, "task_age_days": 3.0},
        ]
        # tolerance=2 → only the 3-day task should count
        out = OverdueTaskAgent().compute(
            _inp(tasks=tasks, agent_prefs={"lateness_tolerance_days": 2})
        )
        assert "1 overdue task" in out.prompt_text
        assert "Old overdue" in out.prompt_text

    def test_snapshot_includes_tolerance(self):
        tasks = [{"content": "T", "is_overdue": True, "task_age_days": 1.0}]
        out = OverdueTaskAgent().compute(
            _inp(tasks=tasks, agent_prefs={"lateness_tolerance_days": 0})
        )
        assert "lateness_tolerance_days" in out.signals_snapshot

    def test_version_bumped(self):
        assert OVERDUE_MANIFEST.version == "1.1.0"
# ── recent-patterns: window_days ─────────────────────────────────────────────
class TestRecentPatternsInference:
    """Recent-patterns agent (#116): inferred ``window_days`` and its effect on the snippet.

    The inference tests vary only the number of events in the history; the
    agent tests pass ``window_days`` directly via ``agent_prefs``.
    """

    def test_cold_start_default_7(self):
        history = _history(*[_event("done") for _ in range(3)])  # below min_history=5
        result = run_inference(RECENT_MANIFEST, history)
        assert result["window_days"] == 7  # cold_start_default

    def test_sparse_history_widens_window(self):
        history = _history(*[_event("done") for _ in range(5)])  # 5 events, n < 7 → 30 days
        result = run_inference(RECENT_MANIFEST, history)
        assert result["window_days"] == 30

    def test_moderate_history_14_days(self):
        history = _history(*[_event("done") for _ in range(10)])  # 7 ≤ n < 14 → 14 days
        result = run_inference(RECENT_MANIFEST, history)
        assert result["window_days"] == 14

    def test_dense_history_stays_7(self):
        history = _history(*[_event("done") for _ in range(20)])  # 20+ → 7 days
        result = run_inference(RECENT_MANIFEST, history)
        assert result["window_days"] == 7

    def test_agent_uses_window_days_pref(self):
        from datetime import timedelta

        # 5 feedback events, all within 14 days but older than 7 days.
        # Each entry is its own dict so no record is aliased to another.
        ten_days_ago = (_NOW - timedelta(days=10)).isoformat()
        feedback = [
            {"action": "done", "dwell_ms": 60000, "created_at": ten_days_ago}
            for _ in range(5)
        ]
        # With window_days=7 → 0 events seen; with window_days=14 → 5 events
        out_narrow = RecentPatternsAgent().compute(
            _inp(feedback_history=feedback, agent_prefs={"window_days": 7})
        )
        out_wide = RecentPatternsAgent().compute(
            _inp(feedback_history=feedback, agent_prefs={"window_days": 14})
        )
        assert "No tip reactions" in out_narrow.prompt_text
        assert "5 tip reactions" in out_wide.prompt_text

    def test_snapshot_includes_window_days(self):
        out = RecentPatternsAgent().compute(_inp(agent_prefs={"window_days": 14}))
        assert out.signals_snapshot["window_days"] == 14

    def test_version_bumped(self):
        assert RECENT_MANIFEST.version == "1.1.0"
# ── focus-area: preferred_areas wiring ───────────────────────────────────────
class TestFocusAreaPreferredAreas:
    """Focus-area agent (#113): ``preferred_areas`` wiring in the snippet and snapshot."""

    # One agent instance, created at class definition and shared by all tests here.
    agent = FocusAreaAgent()

    def _task(self, content: str, project_id: str, is_overdue: bool = False) -> dict:
        """Return a task dict for ``project_id``; id, age and priority are fixed values."""
        return {
            "id": "t1",
            "content": content,
            "is_overdue": is_overdue,
            "task_age_days": 2.0,
            "priority": 1,
            "project_id": project_id,
        }

    def test_preferred_area_wins_tie(self):
        # One task per project, with "work" as the stated preference.
        work_task = self._task("Work thing", "work")
        home_task = self._task("Home thing", "home")
        prefs = {"preferred_areas": ["work"]}
        out = self.agent.compute(_inp(tasks=[work_task, home_task], agent_prefs=prefs))
        snippet = out.prompt_text
        assert "work" in snippet
        assert "matches the user's stated focus preferences" in snippet

    def test_no_preferred_areas_uses_congestion_score(self):
        specs = (("W1", "work"), ("H1", "home"), ("H2", "home"))
        tasks = [self._task(content, project) for content, project in specs]
        out = self.agent.compute(_inp(tasks=tasks))
        # home has more tasks → wins without any preference
        assert "home" in out.prompt_text

    def test_snapshot_includes_preferred_areas(self):
        prefs = {"preferred_areas": ["work"]}
        out = self.agent.compute(_inp(tasks=[self._task("T", "work")], agent_prefs=prefs))
        assert out.signals_snapshot["preferred_areas"] == ["work"]

    def test_version_bumped(self):
        from ml.agents.focus_area import MANIFEST as FA_MANIFEST

        expected_version = "1.1.0"
        assert FA_MANIFEST.version == expected_version