Embed Crawl4AI at all tiers, restore qwen3:4b medium, update docs

- Pre-routing URL fetch: any message with URLs gets content fetched
  async (httpx.AsyncClient) before routing via _fetch_urls_from_message()
- URL context and memories gathered concurrently with asyncio.gather
- Light tier upgraded to medium when URL content is present
- url_context injected into system prompt for medium and complex agents
- Complex agent retains web_search/fetch_url tools + receives pre-fetched content
- Medium model restored to qwen3:4b (was temporarily qwen2.5:1.5b)
- Unit tests added for _extract_urls
- ARCHITECTURE.md: added Tool Handling, Crawl4AI Integration, Memory Pipeline sections
- CLAUDE.md: updated request flow and Crawl4AI integration docs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Alvis
2026-03-12 15:49:34 +00:00
parent f9618a9bbf
commit 50097d6092
8 changed files with 183 additions and 31 deletions

0
tests/__init__.py Normal file
View File

View File

0
tests/unit/__init__.py Normal file
View File

View File

@@ -13,7 +13,7 @@ import pytest
# The FastAPI app is instantiated at module level in agent.py —
# with the mocked fastapi, that just creates a MagicMock() object
# and the route decorators are no-ops.
from agent import _strip_think, _extract_final_text
from agent import _strip_think, _extract_final_text, _extract_urls
# ── _strip_think ───────────────────────────────────────────────────────────────
@@ -159,3 +159,40 @@ class TestExtractFinalText:
]
}
assert _extract_final_text(result) == "## Report\n\nSome content."
# ── _extract_urls ──────────────────────────────────────────────────────────────
class TestExtractUrls:
    """Behavioral tests for _extract_urls: pulling http(s) URLs out of free-form text."""

    def test_single_url(self):
        message = "check this out https://example.com please"
        assert _extract_urls(message) == ["https://example.com"]

    def test_multiple_urls(self):
        found = _extract_urls("see https://foo.com and https://bar.org/path?q=1")
        assert found == ["https://foo.com", "https://bar.org/path?q=1"]

    def test_no_urls(self):
        assert _extract_urls("no links here at all") == []

    def test_http_and_https(self):
        # Both schemes must be recognized, regardless of order of appearance.
        found = _extract_urls("http://old.site and https://new.site")
        assert "http://old.site" in found
        assert "https://new.site" in found

    def test_url_at_start_of_message(self):
        assert _extract_urls("https://example.com is interesting") == ["https://example.com"]

    def test_url_only(self):
        assert _extract_urls("https://example.com/page") == ["https://example.com/page"]

    def test_url_with_path_and_query(self):
        target = "https://example.com/articles/123?ref=home&page=2"
        assert _extract_urls(target) == [target]

    def test_empty_string(self):
        assert _extract_urls("") == []

    def test_does_not_include_surrounding_quotes(self):
        # A quoted URL must come back without the quote characters attached.
        found = _extract_urls('visit "https://example.com" today')
        assert found == ["https://example.com"]