Split monolithic test_pipeline.py into focused integration test scripts
Split the monolithic test_pipeline.py into focused integration test scripts:

- common.py: shared config, URL constants, benchmark questions, and all helpers (get, post_json, check_sse, qdrant_count, fetch_logs, parse_run_block, wait_for, etc.)
- test_health.py: service health checks (deepagents, bifrost, GPU/CPU Ollama, Qdrant, SearXNG)
- test_memory.py: name store/recall pipeline, memory benchmark (5 facts + 10 recalls), dedup test
- test_routing.py: easy/medium/hard tier routing benchmarks with --easy-only/--medium-only/--hard-only flags
- Removed test_pipeline.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
273
tests/integration/common.py
Normal file
273
tests/integration/common.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Shared config, helpers, and utilities for Adolf integration tests.
|
||||
"""
|
||||
|
||||
import http.client
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
# ── config ────────────────────────────────────────────────────────────────────
|
||||
DEEPAGENTS = "http://localhost:8000"
|
||||
BIFROST = "http://localhost:8080"
|
||||
OPENMEMORY = "http://localhost:8765"
|
||||
GRAMMY_HOST = "localhost"
|
||||
GRAMMY_PORT = 3001
|
||||
OLLAMA_GPU = "http://localhost:11436"
|
||||
OLLAMA_CPU = "http://localhost:11435"
|
||||
QDRANT = "http://localhost:6333"
|
||||
SEARXNG = "http://localhost:11437"
|
||||
COMPOSE_FILE = "/home/alvis/adolf/docker-compose.yml"
|
||||
DEFAULT_CHAT_ID = "346967270"
|
||||
|
||||
NAMES = [
|
||||
"Maximilian", "Cornelius", "Zephyr", "Archibald", "Balthazar",
|
||||
"Ignatius", "Lysander", "Octavian", "Reginald", "Sylvester",
|
||||
]
|
||||
|
||||
BENCHMARK = {
|
||||
"easy": [
|
||||
"hi",
|
||||
"what is 2+2?",
|
||||
"what is the capital of France?",
|
||||
"tell me a short joke",
|
||||
"how are you doing today?",
|
||||
"thanks!",
|
||||
"what day comes after Wednesday?",
|
||||
"name the three primary colors",
|
||||
"is the sky blue?",
|
||||
"what does CPU stand for?",
|
||||
],
|
||||
"medium": [
|
||||
"what is the current weather in Berlin?",
|
||||
"find the latest news about artificial intelligence",
|
||||
"what is the current price of Bitcoin?",
|
||||
"search for a good pasta carbonara recipe",
|
||||
"what movies are in theaters this week?",
|
||||
"find Python tutorials for beginners",
|
||||
"who won the last FIFA World Cup?",
|
||||
"do you remember what we talked about before?",
|
||||
"search for the best coffee shops in Tokyo",
|
||||
"what is happening in the tech industry this week?",
|
||||
"what's the weather like today?",
|
||||
],
|
||||
"hard": [
|
||||
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
|
||||
"/think research the history of artificial intelligence and create a timeline of key milestones",
|
||||
"/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
|
||||
"/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
|
||||
"/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
|
||||
"/think research quantum computing: explain the key concepts and how it differs from classical computing",
|
||||
"/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
|
||||
"/think create a comprehensive Docker deployment guide covering best practices for production",
|
||||
"/think research climate change: summarize the latest IPCC findings and key data points",
|
||||
"/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
|
||||
],
|
||||
}
|
||||
|
||||
# ── terminal colours ──────────────────────────────────────────────────────────
|
||||
PASS = "\033[32mPASS\033[0m"
|
||||
FAIL = "\033[31mFAIL\033[0m"
|
||||
INFO = "\033[36mINFO\033[0m"
|
||||
WARN = "\033[33mWARN\033[0m"
|
||||
|
||||
|
||||
# ── result helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def report(results: list, name: str, ok: bool, detail: str = ""):
    """Print a coloured PASS/FAIL line for *name* and record it in *results*."""
    if ok:
        tag = PASS
    else:
        tag = FAIL
    suffix = f" — {detail}" if detail else ""
    print(f" [{tag}] {name}" + suffix)
    results.append((name, ok))
|
||||
|
||||
|
||||
def print_summary(results: list):
    """Print a pass/fail tally for *results*, listing the names of failed checks."""
    print(f"\n{'─'*55}")
    failures = [name for name, ok in results if not ok]
    total = len(results)
    passed = total - len(failures)
    print(f"Results: {passed}/{total} passed", end="")
    if failures:
        print(f" ({len(failures)} failed)\n")
        print("Failed checks:")
        for name in failures:
            print(f" - {name}")
    else:
        print(" — all good")
    print()
|
||||
|
||||
|
||||
def tf(v):
    """Format a timing value in seconds; fixed-width, 'n/a' when v is None."""
    if v is None:
        return " n/a"
    return f"{v:6.2f}s"
|
||||
|
||||
|
||||
# ── HTTP helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def get(url, timeout=5):
    """HTTP GET *url*; return (status_code, decoded response body)."""
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = resp.read()
        return resp.status, body.decode()
|
||||
|
||||
|
||||
def post_json(url, payload, timeout=10):
    """POST *payload* as JSON to *url*; return (status_code, parsed JSON reply)."""
    body = json.dumps(payload).encode()
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read().decode()
        return resp.status, json.loads(raw)
|
||||
|
||||
|
||||
def check_sse(host, port, path):
    """Probe an SSE endpoint; return (ok, detail) without reading the stream."""
    try:
        conn = http.client.HTTPConnection(host, port, timeout=5)
        conn.request("GET", path, headers={"Accept": "text/event-stream"})
        status = conn.getresponse().status
        conn.close()
    except Exception as exc:
        return False, str(exc)
    return status == 200, f"HTTP {status}"
|
||||
|
||||
|
||||
def qdrant_count():
    """Return the point count of the adolf_memories collection (0 on any error)."""
    try:
        _, raw = get(f"{QDRANT}/collections/adolf_memories")
        result = json.loads(raw).get("result", {})
        return result.get("points_count", 0)
    except Exception:
        # best-effort: treat an unreachable/empty Qdrant as zero points
        return 0
|
||||
|
||||
|
||||
# ── log helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
def fetch_logs(since_s=600):
    """Return deepagents log lines from the last *since_s* seconds ([] on error)."""
    cmd = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
    except Exception:
        # docker missing / timeout — callers treat [] as "no logs yet"
        return []
    return proc.stdout.splitlines()
|
||||
|
||||
|
||||
def fetch_bifrost_logs(since_s=120):
    """Return bifrost container log lines from the last *since_s* seconds ([] on error)."""
    cmd = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    except Exception:
        # docker missing / timeout — callers treat [] as "no logs yet"
        return []
    return proc.stdout.splitlines()
|
||||
|
||||
|
||||
def parse_run_block(lines, msg_prefix):
    """
    Find the LAST '[agent] running: <msg_prefix>' block in *lines* and extract
    reply timing, routing tier, and memory timing for that run.

    Returns None while the reply has not yet appeared in the logs.
    On success returns a dict with keys:
        reply_total, llm, send, tier, reply_text — from "[agent] replied in ..."
        memory_s     — from "[memory] stored in ..."
        memory_error — True if "[memory] error" was seen first
    """
    # Match on a truncated prefix — log lines may themselves be truncated.
    needle = msg_prefix[:50]
    last_start = None
    for idx, line in enumerate(lines):
        # keep overwriting so we end up on the LAST matching run
        if "[agent] running:" in line and needle in line:
            last_start = idx
    if last_start is None:
        return None

    block = lines[last_start:]
    ai_text = None
    info = None

    for pos, line in enumerate(block):
        # remember the latest plain AIMessage (skip tool-call arrows)
        if "AIMessage:" in line and "→" not in line:
            candidate = line.split("AIMessage:", 1)[-1].strip()
            if candidate:
                ai_text = candidate

        m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
        if m:
            tier_match = re.search(r"\btier=(\w+)", line)
            info = {
                "reply_total": float(m.group(1)),
                "llm": float(m.group(2)),
                "send": float(m.group(3)),
                "tier": tier_match.group(1) if tier_match else "unknown",
                "reply_text": ai_text,
                "memory_s": None,
                "memory_error": False,
                "_j": pos,  # index of the "replied in" line within block
            }
            break

    if info is None:
        return None

    # An explicit "[agent] reply_text:" line within the next two lines
    # overrides whatever AIMessage text we collected above.
    for line in block[info["_j"] + 1: info["_j"] + 3]:
        if line.startswith("[agent] reply_text:"):
            info["reply_text"] = line[len("[agent] reply_text:"):].strip()
            break

    # Scan forward for the memory outcome; first hit wins.
    for line in block[info["_j"] + 1:]:
        stored = re.search(r"\[memory\] stored in ([\d.]+)s", line)
        if stored:
            info["memory_s"] = float(stored.group(1))
            break
        if "[memory] error" in line:
            info["memory_error"] = True
            break

    return info
|
||||
|
||||
|
||||
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
    """
    Poll deepagents logs until the message identified by *msg_prefix* has been
    fully processed (reply logged, plus the memory write when *need_memory*).

    Shows a live single-line progress indicator. Returns the timing dict from
    parse_run_block(), or None on timeout.
    """
    started = time.monotonic()
    deadline = started + timeout_s
    polls = 0
    partial = None  # reply already seen, still waiting on the memory write

    while time.monotonic() < deadline:
        # widen the log window as time passes (+90s of slack for lag)
        window = int(time.monotonic() - started) + 90
        parsed = parse_run_block(fetch_logs(since_s=window), msg_prefix)

        if parsed:
            partial = parsed
            memory_done = parsed["memory_s"] is not None or parsed["memory_error"]
            if (not need_memory) or memory_done:
                took = time.monotonic() - started
                print(f"\r [{label}] done after {took:.0f}s{' ' * 30}")
                return parsed

        time.sleep(4)
        polls += 1
        remaining = int(deadline - time.monotonic())
        if partial:
            phase = "waiting for memory..." if need_memory else "done"
        else:
            phase = "waiting for LLM reply..."
        print(f"\r [{label}] {polls*4}s elapsed, {remaining}s left — {phase} ", end="", flush=True)

    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None
|
||||
Reference in New Issue
Block a user