feat(agents): semantic task clustering + focus-area inferred preferred_areas (#97, #113)

- New ml/agents/clustering.py: embed task content via nomic-embed-text (Ollama), greedy cosine clustering (threshold 0.72, max 6 clusters), graceful fallback to project-id grouping when Ollama is unreachable
- focus_area v2.0.0: compute() uses semantic clusters as focus areas; adds preferred_areas InferredParam inferred from top-2 projects by task_completion count
- 135 tests, all passing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 06:54:46 +00:00
parent 336644a90a
commit 26fc67776f
5 changed files with 404 additions and 41 deletions
--- a/ml/agents/focus_area.py
+++ b/ml/agents/focus_area.py
@@ -1,16 +1,27 @@
 from __future__ import annotations

-from collections import defaultdict
+from collections import Counter
 from typing import ClassVar

 from .base import BaseAgent, AgentInput, AgentOutput
-from .manifest import AgentManifest
+from .clustering import cluster_tasks
+from .inference.history import UserHistory
+from .manifest import AgentManifest, InferredParam
+
+
+def _infer_preferred_areas(history: UserHistory) -> list[str]:
+    """Top-2 project IDs by completed task count (last 90 days worth of data)."""
+    counts: Counter[str] = Counter()
+    for tc in history.task_completions:
+        if tc.project_id:
+            counts[tc.project_id] += 1
+    return [pid for pid, _ in counts.most_common(2)]


 MANIFEST = AgentManifest(
    id="focus-area",
-    version="1.1.0",  # bumped: preferred_areas pref is now honoured in compute (#113)
-    description="Identifies the most congested project/area in the user's task list.",
+    version="2.0.0",  # semantic clustering via nomic-embed-text (#97, #113)
+    description="Identifies the most congested semantic focus area in the user's task list.",
    pref_schema={
        "type": "object",
        "additionalProperties": False,
@@ -19,7 +30,7 @@ MANIFEST = AgentManifest(
                "type": "array",
                "items": {"type": "string"},
                "default": [],
-                "description": "Project / label names to prioritise when multiple areas tie.",
+                "description": "Project IDs or label names to prioritise when multiple areas tie.",
            },
        },
    },
@@ -27,59 +38,75 @@ MANIFEST = AgentManifest(
    required_consents=["data:core", "data:todoist", "agent:focus-area"],
    output_contract={"type": "snippet", "format": "free_text"},
    ttl_sec=43_200,
-    # No inferred_params: preferred_areas requires project-level feedback linkage
-    # that isn't available in feedback_history alone. Revisit with #78 (signal
-    # abstraction) once per-task reactions can be traced back to a project.
+    inferred_params=[
+        InferredParam(
+            key="preferred_areas",
+            ttl_sec=86_400,
+            cold_start_default=[],
+            min_history=0,   # use task_completions, not feedback events; handle empty inside
+            infer=_infer_preferred_areas,
+        ),
+    ],
 )


 class FocusAreaAgent(BaseAgent):
-    """Identifies the most congested project/area in the user's task list."""
+    """Identifies the most congested semantic focus area in the user's task list."""
    agent_id: ClassVar[str] = MANIFEST.id
    ttl_seconds: ClassVar[int] = MANIFEST.ttl_sec
    version: ClassVar[str] = MANIFEST.version

    def compute(self, inp: AgentInput) -> AgentOutput:
        preferred: list[str] = inp.agent_prefs.get("preferred_areas", [])
-        by_project: dict[str, list[dict]] = defaultdict(list)
-        for task in inp.tasks:
-            project = task.get("project_id") or task.get("project") or "default"
-            by_project[project].append(task)

-        if not by_project:
-            prompt = "No tasks available to identify a focus area."
-            return self._make_output(inp, prompt, {"project_count": 0})
+        if not inp.tasks:
+            return self._make_output(
+                inp,
+                "No tasks available to identify a focus area.",
+                {"cluster_count": 0, "strategy": "none"},
+            )

-        def score(project: str, tasks: list[dict]) -> tuple[float, bool]:
-            base = sum(2.0 if t.get("is_overdue") else 1.0 for t in tasks)
-            # Boost preferred areas to break ties in their favour
-            boosted = project in preferred or any(p in project for p in preferred)
-            return (base + (0.5 if boosted else 0.0), boosted)
+        clusters = cluster_tasks(inp.tasks)

-        top_project, top_tasks = max(
-            by_project.items(),
-            key=lambda kv: score(kv[0], kv[1]),
-        )
-        overdue_in_top = sum(1 for t in top_tasks if t.get("is_overdue"))
-        label = "the default project" if top_project == "default" else f'"{top_project}"'
-        n = len(top_tasks)
-        boosted = top_project in preferred or any(p in top_project for p in preferred)
+        if not clusters:
+            return self._make_output(
+                inp,
+                "No tasks available to identify a focus area.",
+                {"cluster_count": 0, "strategy": "none"},
+            )
+
+        strategy = "semantic" if len(clusters) > 1 or len(inp.tasks) > 1 else "fallback"
+
+        def score(cluster) -> float:
+            base = sum(2.0 if t.get("is_overdue") else 1.0 for t in cluster.tasks)
+            boosted = any(p in cluster.label for p in preferred) if preferred else False
+            return base + (0.5 if boosted else 0.0)
+
+        top = max(clusters, key=score)
+        boosted = bool(preferred) and any(p in top.label for p in preferred)

        parts = [
-            f"The user's most congested area is {label} "
-            f"({n} task{'s' if n != 1 else ''}, {overdue_in_top} overdue)."
+            f'The user\'s most active focus area is "{top.label}" '
+            f"({top.task_count} task{'s' if top.task_count != 1 else ''}, "
+            f"{top.overdue_count} overdue)."
        ]
        if boosted:
            parts.append("This area matches the user's stated focus preferences.")
-        if overdue_in_top >= 3:
+        if top.overdue_count >= 3:
            parts.append("Consider surfacing an action from this area.")
+        if len(clusters) > 1:
+            other_total = sum(c.task_count for c in clusters if c is not top)
+            parts.append(
+                f"{len(clusters) - 1} other area{'s' if len(clusters) > 2 else ''} "
+                f"contain {other_total} task{'s' if other_total != 1 else ''}."
+            )

-        prompt = " ".join(parts)
        snapshot = {
-            "top_project": top_project,
-            "top_task_count": n,
-            "top_overdue_count": overdue_in_top,
-            "project_count": len(by_project),
+            "top_cluster_label": top.label,
+            "top_task_count": top.task_count,
+            "top_overdue_count": top.overdue_count,
+            "cluster_count": len(clusters),
+            "strategy": strategy,
            "preferred_areas": preferred,
        }
-        return self._make_output(inp, prompt, snapshot)
+        return self._make_output(inp, " ".join(parts), snapshot)