feat(agents): semantic task clustering + focus-area inferred preferred_areas (#97, #113)

- New ml/agents/clustering.py: embed task content via nomic-embed-text
  (Ollama), greedy cosine clustering (threshold 0.72, max 6 clusters),
  graceful fallback to project-id grouping when Ollama is unreachable
- focus_area v2.0.0: compute() uses semantic clusters as focus areas;
  adds preferred_areas InferredParam inferred from top-2 projects by
  task_completion count
- 135 tests, all passing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-06 06:54:46 +00:00
parent 336644a90a
commit 26fc67776f
5 changed files with 404 additions and 41 deletions

View File

@@ -1,16 +1,27 @@
from __future__ import annotations
from collections import defaultdict
from collections import Counter
from typing import ClassVar
from .base import BaseAgent, AgentInput, AgentOutput
from .manifest import AgentManifest
from .clustering import cluster_tasks
from .inference.history import UserHistory
from .manifest import AgentManifest, InferredParam
def _infer_preferred_areas(history: UserHistory) -> list[str]:
    """Return the two project IDs with the most completed tasks.

    Counts every entry in ``history.task_completions`` that carries a truthy
    ``project_id`` and ranks projects by that count. Projects with equal
    counts keep first-seen order (``Counter.most_common`` behaviour).

    NOTE(review): no time window is applied here. The earlier docstring
    mentioned "last 90 days worth of data"; if such a limit exists it must be
    enforced by whatever builds ``history`` -- confirm against the caller.

    Args:
        history: Object exposing ``task_completions``, an iterable of records
            that each have a ``project_id`` attribute.

    Returns:
        Up to two project IDs, most frequently completed first; an empty list
        when no completion carries a project ID.
    """
    # Completions without a project (None / empty string) are ignored so they
    # can never surface as a "preferred area".
    counts: Counter[str] = Counter(
        tc.project_id for tc in history.task_completions if tc.project_id
    )
    return [pid for pid, _ in counts.most_common(2)]
MANIFEST = AgentManifest(
id="focus-area",
version="1.1.0", # bumped: preferred_areas pref is now honoured in compute (#113)
description="Identifies the most congested project/area in the user's task list.",
version="2.0.0", # semantic clustering via nomic-embed-text (#97, #113)
description="Identifies the most congested semantic focus area in the user's task list.",
pref_schema={
"type": "object",
"additionalProperties": False,
@@ -19,7 +30,7 @@ MANIFEST = AgentManifest(
"type": "array",
"items": {"type": "string"},
"default": [],
"description": "Project / label names to prioritise when multiple areas tie.",
"description": "Project IDs or label names to prioritise when multiple areas tie.",
},
},
},
@@ -27,59 +38,75 @@ MANIFEST = AgentManifest(
required_consents=["data:core", "data:todoist", "agent:focus-area"],
output_contract={"type": "snippet", "format": "free_text"},
ttl_sec=43_200,
# No inferred_params: preferred_areas requires project-level feedback linkage
# that isn't available in feedback_history alone. Revisit with #78 (signal
# abstraction) once per-task reactions can be traced back to a project.
inferred_params=[
InferredParam(
key="preferred_areas",
ttl_sec=86_400,
cold_start_default=[],
min_history=0, # use task_completions, not feedback events; handle empty inside
infer=_infer_preferred_areas,
),
],
)
class FocusAreaAgent(BaseAgent):
"""Identifies the most congested project/area in the user's task list."""
"""Identifies the most congested semantic focus area in the user's task list."""
agent_id: ClassVar[str] = MANIFEST.id
ttl_seconds: ClassVar[int] = MANIFEST.ttl_sec
version: ClassVar[str] = MANIFEST.version
def compute(self, inp: AgentInput) -> AgentOutput:
preferred: list[str] = inp.agent_prefs.get("preferred_areas", [])
by_project: dict[str, list[dict]] = defaultdict(list)
for task in inp.tasks:
project = task.get("project_id") or task.get("project") or "default"
by_project[project].append(task)
if not by_project:
prompt = "No tasks available to identify a focus area."
return self._make_output(inp, prompt, {"project_count": 0})
if not inp.tasks:
return self._make_output(
inp,
"No tasks available to identify a focus area.",
{"cluster_count": 0, "strategy": "none"},
)
def score(project: str, tasks: list[dict]) -> tuple[float, bool]:
base = sum(2.0 if t.get("is_overdue") else 1.0 for t in tasks)
# Boost preferred areas to break ties in their favour
boosted = project in preferred or any(p in project for p in preferred)
return (base + (0.5 if boosted else 0.0), boosted)
clusters = cluster_tasks(inp.tasks)
top_project, top_tasks = max(
by_project.items(),
key=lambda kv: score(kv[0], kv[1]),
)
overdue_in_top = sum(1 for t in top_tasks if t.get("is_overdue"))
label = "the default project" if top_project == "default" else f'"{top_project}"'
n = len(top_tasks)
boosted = top_project in preferred or any(p in top_project for p in preferred)
if not clusters:
return self._make_output(
inp,
"No tasks available to identify a focus area.",
{"cluster_count": 0, "strategy": "none"},
)
strategy = "semantic" if len(clusters) > 1 or len(inp.tasks) > 1 else "fallback"
def score(cluster) -> float:
base = sum(2.0 if t.get("is_overdue") else 1.0 for t in cluster.tasks)
boosted = any(p in cluster.label for p in preferred) if preferred else False
return base + (0.5 if boosted else 0.0)
top = max(clusters, key=score)
boosted = bool(preferred) and any(p in top.label for p in preferred)
parts = [
f"The user's most congested area is {label} "
f"({n} task{'s' if n != 1 else ''}, {overdue_in_top} overdue)."
f'The user\'s most active focus area is "{top.label}" '
f"({top.task_count} task{'s' if top.task_count != 1 else ''}, "
f"{top.overdue_count} overdue)."
]
if boosted:
parts.append("This area matches the user's stated focus preferences.")
if overdue_in_top >= 3:
if top.overdue_count >= 3:
parts.append("Consider surfacing an action from this area.")
if len(clusters) > 1:
other_total = sum(c.task_count for c in clusters if c is not top)
parts.append(
f"{len(clusters) - 1} other area{'s' if len(clusters) > 2 else ''} "
f"contain {other_total} task{'s' if other_total != 1 else ''}."
)
prompt = " ".join(parts)
snapshot = {
"top_project": top_project,
"top_task_count": n,
"top_overdue_count": overdue_in_top,
"project_count": len(by_project),
"top_cluster_label": top.label,
"top_task_count": top.task_count,
"top_overdue_count": top.overdue_count,
"cluster_count": len(clusters),
"strategy": strategy,
"preferred_areas": preferred,
}
return self._make_output(inp, prompt, snapshot)
return self._make_output(inp, " ".join(parts), snapshot)