feat(agents): per-user baseline + stdev inference for momentum agent (#114)

Adds two InferredParams (TTL=7d) computed from 28-day rolling daily done counts:
- baseline_completions_per_day: mean done events/day over the window
- stdev: stdev of daily counts (floored at 0.1 to avoid division by zero)

MomentumAgent.compute() now calculates a z-score from recent done events in
inp.feedback_history vs the inferred baseline. Snippet language switches to
z-score framing ("above your usual pace", "slowing down") when |z| >= 1.0,
falling back to engagement_trend labels when in the normal range.

- engagement_trend InferredParam preserved for backward compatibility
- momentum_window pref added (default 7, user-overridable)
- 14 new tests covering power user, casual user, returning-from-break, and
  relative stdev comparison; engagement_trend tests updated for z-score priority
- Agent bumped to v1.2.0

Closes #114

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-06 05:18:29 +00:00
parent 04212ff318
commit 4cade4868b
2 changed files with 245 additions and 24 deletions

View File

@@ -48,19 +48,31 @@ def _completion(project_id: str | None, lateness_days: float) -> TaskCompletion:
)
# ── momentum: engagement_trend ───────────────────────────────────────────────
# ── momentum helpers ─────────────────────────────────────────────────────────
class TestMomentumInference:
def _neutral_prefs(**extra) -> dict:
"""Prefs that put z-score in the normal range so trend label can show."""
return {"baseline_completions_per_day": 0.0, "stdev": 1.0, "momentum_window": 7, **extra}
def _feedback_done(n: int, days_ago: float = 1.0) -> list[dict]:
from datetime import timedelta
ts = (_NOW - timedelta(days=days_ago)).isoformat()
return [{"action": "done", "dwell_ms": 60_000, "created_at": ts}] * n
# ── momentum: engagement_trend inference ─────────────────────────────────────
class TestMomentumTrendInference:
def test_cold_start_below_min_history(self):
history = _history(*[_event("done", days_ago=i) for i in range(5)])
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["engagement_trend"] == "stable" # cold_start_default
def test_trend_up_when_recent_done_rate_higher(self):
# 8 done in last 7 days, 1 done in prior 7 days → trending up
recent = [_event("done", days_ago=i) for i in range(1, 9)]
older = [_event("dismiss", days_ago=i) for i in range(8, 15)]
older[0] = _event("done", days_ago=8) # one done in older window
older[0] = _event("done", days_ago=8)
history = _history(*recent, *older)
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["engagement_trend"] == "up"
@@ -78,20 +90,115 @@ class TestMomentumInference:
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["engagement_trend"] == "stable"
def test_agent_uses_trend_in_snippet(self):
out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "up"}))
def test_trend_shown_when_z_score_normal(self):
# baseline=0 so z≈0 → no z label → trend label falls through
out = MomentumAgent().compute(_inp(agent_prefs=_neutral_prefs(engagement_trend="up")))
assert "trending up" in out.prompt_text
def test_agent_uses_down_trend_in_snippet(self):
out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "down"}))
def test_trend_down_shown_when_z_score_normal(self):
out = MomentumAgent().compute(_inp(agent_prefs=_neutral_prefs(engagement_trend="down")))
assert "trending down" in out.prompt_text
def test_snapshot_includes_trend(self):
out = MomentumAgent().compute(_inp(agent_prefs={"engagement_trend": "stable"}))
out = MomentumAgent().compute(_inp(agent_prefs=_neutral_prefs(engagement_trend="stable")))
assert "engagement_trend" in out.signals_snapshot
# ── momentum: baseline + stdev inference (#114) ───────────────────────────────
class TestMomentumBaselineInference:
def _events_n_per_day(self, done_per_day: int, n_days: int) -> list[FeedbackEvent]:
"""Generate done events spread across n_days."""
events = []
for d in range(n_days):
for _ in range(done_per_day):
events.append(_event("done", days_ago=d + 0.5))
return events
def test_cold_start_when_few_events(self):
history = _history(*[_event("done", days_ago=i) for i in range(5)])
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["baseline_completions_per_day"] == 1.0
assert result["stdev"] == 1.0
def test_power_user_baseline_high(self):
# 5 done events per day for 20 days → baseline ≈ 5/day (over 28d window, zeros fill rest)
events = self._events_n_per_day(5, 20)
history = _history(*events)
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["baseline_completions_per_day"] > 2.0
def test_casual_user_baseline_low(self):
# 1 done every 3 days + dismiss filler to clear min_history=14 → baseline ≈ 0.33/day
done_events = [_event("done", days_ago=d * 3 + 0.5) for d in range(7)]
filler = [_event("dismiss", days_ago=d + 0.5) for d in range(10)]
history = _history(*done_events, *filler)
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["baseline_completions_per_day"] < 0.5
def test_stdev_reflects_variability(self):
# Alternating 0 and 4 done events → high stdev
events = []
for d in range(14):
if d % 2 == 0:
for _ in range(4):
events.append(_event("done", days_ago=d + 0.5))
history = _history(*events)
result = run_inference(MOMENTUM_MANIFEST, history)
assert result["stdev"] > 1.0
def test_consistent_user_lower_stdev_than_variable(self):
# Consistent 2/day for 28 days has lower stdev than alternating 0/4
consistent = self._events_n_per_day(2, 28)
variable = []
for d in range(14):
if d % 2 == 0:
for _ in range(4):
variable.append(_event("done", days_ago=d + 0.5))
else:
variable.append(_event("dismiss", days_ago=d + 0.5))
r_consistent = run_inference(MOMENTUM_MANIFEST, _history(*consistent))
r_variable = run_inference(MOMENTUM_MANIFEST, _history(*variable))
assert r_consistent["stdev"] < r_variable["stdev"]
# ── momentum: z-score snippet language ───────────────────────────────────────
class TestMomentumZScore:
def _prefs(self, baseline: float, stdev: float = 1.0) -> dict:
return {"baseline_completions_per_day": baseline, "stdev": stdev,
"momentum_window": 7, "engagement_trend": "stable"}
def test_power_user_above_baseline_says_above_usual(self):
# baseline=3/day, stdev=1.0, window=7 → expected rate=3; user did 35 → rate=5, z=2
prefs = self._prefs(baseline=3.0, stdev=1.0)
feedback = _feedback_done(35, days_ago=1.0)
out = MomentumAgent().compute(_inp(feedback_history=feedback, agent_prefs=prefs))
assert "above your usual" in out.prompt_text
def test_casual_user_slowing_down(self):
# baseline=1/day, user did 0 in 7d → z = (0 - 1) / 1 = -1 → below usual
prefs = self._prefs(baseline=1.0, stdev=1.0)
out = MomentumAgent().compute(_inp(feedback_history=[], agent_prefs=prefs))
assert "below your usual" in out.prompt_text
def test_returning_from_break_at_normal_rate(self):
# User just came back: 1 done, baseline=1/day, window=7 → z=(1/7-1)/1≈-0.86, within normal
prefs = self._prefs(baseline=1.0, stdev=1.0)
feedback = _feedback_done(1, days_ago=0.5)
out = MomentumAgent().compute(_inp(feedback_history=feedback, agent_prefs=prefs))
# z ≈ -0.86 → no z label, falls back to trend (stable → no extra sentence)
assert "above your usual" not in out.prompt_text
assert "below your usual" not in out.prompt_text
def test_snapshot_includes_z_score(self):
prefs = self._prefs(baseline=1.0)
out = MomentumAgent().compute(_inp(agent_prefs=prefs))
assert "z_score" in out.signals_snapshot
assert "recent_done_count" in out.signals_snapshot
def test_version_bumped(self):
assert MOMENTUM_MANIFEST.version == "1.1.0"
assert MOMENTUM_MANIFEST.version == "1.2.0"
# ── overdue-task: lateness_tolerance_days + project_realness (#115) ──────────