Split monolithic test_pipeline.py into focused integration test scripts
Split the monolithic test_pipeline.py into focused integration test scripts:

- common.py: shared config, URL constants, benchmark questions, and all helpers (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy-only/--medium-only/--hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
273
tests/integration/common.py
Normal file
273
tests/integration/common.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Shared config, helpers, and utilities for Adolf integration tests.
|
||||
"""
|
||||
|
||||
import http.client
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
# ── config ────────────────────────────────────────────────────────────────────
|
||||
DEEPAGENTS = "http://localhost:8000"
|
||||
BIFROST = "http://localhost:8080"
|
||||
OPENMEMORY = "http://localhost:8765"
|
||||
GRAMMY_HOST = "localhost"
|
||||
GRAMMY_PORT = 3001
|
||||
OLLAMA_GPU = "http://localhost:11436"
|
||||
OLLAMA_CPU = "http://localhost:11435"
|
||||
QDRANT = "http://localhost:6333"
|
||||
SEARXNG = "http://localhost:11437"
|
||||
COMPOSE_FILE = "/home/alvis/adolf/docker-compose.yml"
|
||||
DEFAULT_CHAT_ID = "346967270"
|
||||
|
||||
NAMES = [
|
||||
"Maximilian", "Cornelius", "Zephyr", "Archibald", "Balthazar",
|
||||
"Ignatius", "Lysander", "Octavian", "Reginald", "Sylvester",
|
||||
]
|
||||
|
||||
BENCHMARK = {
|
||||
"easy": [
|
||||
"hi",
|
||||
"what is 2+2?",
|
||||
"what is the capital of France?",
|
||||
"tell me a short joke",
|
||||
"how are you doing today?",
|
||||
"thanks!",
|
||||
"what day comes after Wednesday?",
|
||||
"name the three primary colors",
|
||||
"is the sky blue?",
|
||||
"what does CPU stand for?",
|
||||
],
|
||||
"medium": [
|
||||
"what is the current weather in Berlin?",
|
||||
"find the latest news about artificial intelligence",
|
||||
"what is the current price of Bitcoin?",
|
||||
"search for a good pasta carbonara recipe",
|
||||
"what movies are in theaters this week?",
|
||||
"find Python tutorials for beginners",
|
||||
"who won the last FIFA World Cup?",
|
||||
"do you remember what we talked about before?",
|
||||
"search for the best coffee shops in Tokyo",
|
||||
"what is happening in the tech industry this week?",
|
||||
"what's the weather like today?",
|
||||
],
|
||||
"hard": [
|
||||
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
|
||||
"/think research the history of artificial intelligence and create a timeline of key milestones",
|
||||
"/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
|
||||
"/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
|
||||
"/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
|
||||
"/think research quantum computing: explain the key concepts and how it differs from classical computing",
|
||||
"/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
|
||||
"/think create a comprehensive Docker deployment guide covering best practices for production",
|
||||
"/think research climate change: summarize the latest IPCC findings and key data points",
|
||||
"/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
|
||||
],
|
||||
}
|
||||
|
||||
# ── terminal colours ──────────────────────────────────────────────────────────
|
||||
PASS = "\033[32mPASS\033[0m"
|
||||
FAIL = "\033[31mFAIL\033[0m"
|
||||
INFO = "\033[36mINFO\033[0m"
|
||||
WARN = "\033[33mWARN\033[0m"
|
||||
|
||||
|
||||
# ── result helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def report(results: list, name: str, ok: bool, detail: str = ""):
    """Print a coloured PASS/FAIL line for *name* and record it in *results*."""
    if ok:
        tag = PASS
    else:
        tag = FAIL
    suffix = f" — {detail}" if detail else ""
    print(f" [{tag}] {name}" + suffix)
    results.append((name, ok))
|
||||
|
||||
|
||||
def print_summary(results: list):
    """Print a pass/fail tally for *results*, listing the names of failed checks."""
    print(f"\n{'─'*55}")
    failures = [name for name, ok in results if not ok]
    total = len(results)
    passed = total - len(failures)
    print(f"Results: {passed}/{total} passed", end="")
    if failures:
        print(f" ({len(failures)} failed)\n")
        print("Failed checks:")
        for name in failures:
            print(f" - {name}")
    else:
        print(" — all good")
    print()
|
||||
|
||||
|
||||
def tf(v):
    """Format a timing value in seconds; fixed-width, 'n/a' when v is None."""
    if v is None:
        return " n/a"
    return f"{v:6.2f}s"
|
||||
|
||||
|
||||
# ── HTTP helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def get(url, timeout=5):
    """HTTP GET *url*; return (status_code, decoded response body)."""
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = resp.read()
        return resp.status, body.decode()
|
||||
|
||||
|
||||
def post_json(url, payload, timeout=10):
    """POST *payload* as JSON to *url*; return (status_code, parsed JSON reply)."""
    body = json.dumps(payload).encode()
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read().decode()
        return resp.status, json.loads(raw)
|
||||
|
||||
|
||||
def check_sse(host, port, path):
    """Probe an SSE endpoint; return (ok, detail) without reading the stream."""
    try:
        conn = http.client.HTTPConnection(host, port, timeout=5)
        conn.request("GET", path, headers={"Accept": "text/event-stream"})
        status = conn.getresponse().status
        conn.close()
    except Exception as exc:
        return False, str(exc)
    return status == 200, f"HTTP {status}"
|
||||
|
||||
|
||||
def qdrant_count():
    """Return the point count of the adolf_memories collection (0 on any error)."""
    try:
        _, raw = get(f"{QDRANT}/collections/adolf_memories")
        result = json.loads(raw).get("result", {})
        return result.get("points_count", 0)
    except Exception:
        # best-effort: treat an unreachable/empty Qdrant as zero points
        return 0
|
||||
|
||||
|
||||
# ── log helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
def fetch_logs(since_s=600):
    """Return deepagents log lines from the last *since_s* seconds ([] on error)."""
    cmd = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
    except Exception:
        # docker missing / timeout — callers treat [] as "no logs yet"
        return []
    return proc.stdout.splitlines()
|
||||
|
||||
|
||||
def fetch_bifrost_logs(since_s=120):
    """Return bifrost container log lines from the last *since_s* seconds ([] on error)."""
    cmd = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    except Exception:
        # docker missing / timeout — callers treat [] as "no logs yet"
        return []
    return proc.stdout.splitlines()
|
||||
|
||||
|
||||
def parse_run_block(lines, msg_prefix):
    """
    Find the LAST '[agent] running: <msg_prefix>' block in *lines* and extract
    reply timing, routing tier, and memory timing for that run.

    Returns None while the reply has not yet appeared in the logs.
    On success returns a dict with keys:
        reply_total, llm, send, tier, reply_text — from "[agent] replied in ..."
        memory_s     — from "[memory] stored in ..."
        memory_error — True if "[memory] error" was seen first
    """
    # Match on a truncated prefix — log lines may themselves be truncated.
    needle = msg_prefix[:50]
    last_start = None
    for idx, line in enumerate(lines):
        # keep overwriting so we end up on the LAST matching run
        if "[agent] running:" in line and needle in line:
            last_start = idx
    if last_start is None:
        return None

    block = lines[last_start:]
    ai_text = None
    info = None

    for pos, line in enumerate(block):
        # remember the latest plain AIMessage (skip tool-call arrows)
        if "AIMessage:" in line and "→" not in line:
            candidate = line.split("AIMessage:", 1)[-1].strip()
            if candidate:
                ai_text = candidate

        m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
        if m:
            tier_match = re.search(r"\btier=(\w+)", line)
            info = {
                "reply_total": float(m.group(1)),
                "llm": float(m.group(2)),
                "send": float(m.group(3)),
                "tier": tier_match.group(1) if tier_match else "unknown",
                "reply_text": ai_text,
                "memory_s": None,
                "memory_error": False,
                "_j": pos,  # index of the "replied in" line within block
            }
            break

    if info is None:
        return None

    # An explicit "[agent] reply_text:" line within the next two lines
    # overrides whatever AIMessage text we collected above.
    for line in block[info["_j"] + 1: info["_j"] + 3]:
        if line.startswith("[agent] reply_text:"):
            info["reply_text"] = line[len("[agent] reply_text:"):].strip()
            break

    # Scan forward for the memory outcome; first hit wins.
    for line in block[info["_j"] + 1:]:
        stored = re.search(r"\[memory\] stored in ([\d.]+)s", line)
        if stored:
            info["memory_s"] = float(stored.group(1))
            break
        if "[memory] error" in line:
            info["memory_error"] = True
            break

    return info
|
||||
|
||||
|
||||
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
    """
    Poll deepagents logs until the message identified by *msg_prefix* has been
    fully processed (reply logged, plus the memory write when *need_memory*).

    Shows a live single-line progress indicator. Returns the timing dict from
    parse_run_block(), or None on timeout.
    """
    started = time.monotonic()
    deadline = started + timeout_s
    polls = 0
    partial = None  # reply already seen, still waiting on the memory write

    while time.monotonic() < deadline:
        # widen the log window as time passes (+90s of slack for lag)
        window = int(time.monotonic() - started) + 90
        parsed = parse_run_block(fetch_logs(since_s=window), msg_prefix)

        if parsed:
            partial = parsed
            memory_done = parsed["memory_s"] is not None or parsed["memory_error"]
            if (not need_memory) or memory_done:
                took = time.monotonic() - started
                print(f"\r [{label}] done after {took:.0f}s{' ' * 30}")
                return parsed

        time.sleep(4)
        polls += 1
        remaining = int(deadline - time.monotonic())
        if partial:
            phase = "waiting for memory..." if need_memory else "done"
        else:
            phase = "waiting for LLM reply..."
        print(f"\r [{label}] {polls*4}s elapsed, {remaining}s left — {phase} ", end="", flush=True)

    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None
|
||||
Reference in New Issue
Block a user