- Add tier_capture param to _run_agent_pipeline; append tier after determination - Capture actual_tier in run_agent_task from tier_capture list - Log tier in replied-in line: [agent] replied in Xs tier=Y - Remove reply_text[:200] truncation (was breaking benchmark keyword matching) - Update parse_run_block regex to match new log format; llm/send fields now None Fixes #1, #3, #4 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
273 lines
9.9 KiB
Python
273 lines
9.9 KiB
Python
"""
|
|
Shared config, helpers, and utilities for Adolf integration tests.
|
|
"""
|
|
|
|
import http.client
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import time
|
|
import urllib.request
|
|
|
|
# ── config ────────────────────────────────────────────────────────────────────
|
|
# HTTP base URLs of the locally running services.
DEEPAGENTS = "http://localhost:8000"
BIFROST = "http://localhost:8080"
OPENMEMORY = "http://localhost:8765"
OLLAMA_GPU = "http://localhost:11436"
OLLAMA_CPU = "http://localhost:11435"
QDRANT = "http://localhost:6333"
SEARXNG = "http://localhost:11437"

# Grammy is addressed by separate host/port rather than by a URL.
GRAMMY_HOST = "localhost"
GRAMMY_PORT = 3001

# Compose file handed to `docker compose -f ...` by the log helpers.
COMPOSE_FILE = "/home/alvis/adolf/docker-compose.yml"

# Chat identifier used when a test does not supply its own.
DEFAULT_CHAT_ID = "346967270"
|
|
|
|
# Fixed pool of ten distinctive first names shared by the tests.
NAMES = [
    "Maximilian",
    "Cornelius",
    "Zephyr",
    "Archibald",
    "Balthazar",
    "Ignatius",
    "Lysander",
    "Octavian",
    "Reginald",
    "Sylvester",
]
|
|
|
|
# Benchmark prompts grouped by difficulty bucket ("easy", "medium", "hard").
# Every prompt in the "hard" bucket carries the "/think" prefix.
BENCHMARK = {
    "easy": [
        "hi",
        "what is 2+2?",
        "what is the capital of France?",
        "tell me a short joke",
        "how are you doing today?",
        "thanks!",
        "what day comes after Wednesday?",
        "name the three primary colors",
        "is the sky blue?",
        "what does CPU stand for?",
    ],
    "medium": [
        "what is the current weather in Berlin?",
        "find the latest news about artificial intelligence",
        "what is the current price of Bitcoin?",
        "search for a good pasta carbonara recipe",
        "what movies are in theaters this week?",
        "find Python tutorials for beginners",
        "who won the last FIFA World Cup?",
        "do you remember what we talked about before?",
        "search for the best coffee shops in Tokyo",
        "what is happening in the tech industry this week?",
        "what's the weather like today?",
    ],
    "hard": [
        "/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
        "/think research the history of artificial intelligence and create a timeline of key milestones",
        "/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
        "/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
        "/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
        "/think research quantum computing: explain the key concepts and how it differs from classical computing",
        "/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
        "/think create a comprehensive Docker deployment guide covering best practices for production",
        "/think research climate change: summarize the latest IPCC findings and key data points",
        "/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
    ],
}
|
|
|
|
# ── terminal colours ──────────────────────────────────────────────────────────
|
|
# Status tags wrapped in ANSI colour escapes; "\033[0m" resets the colour.
PASS = "\033[32mPASS\033[0m"  # green
FAIL = "\033[31mFAIL\033[0m"  # red
INFO = "\033[36mINFO\033[0m"  # cyan
WARN = "\033[33mWARN\033[0m"  # yellow
|
|
|
|
|
|
# ── result helpers ────────────────────────────────────────────────────────────
|
|
|
|
def report(results: list, name: str, ok: bool, detail: str = ""):
    """Print one check outcome and record it in *results*.

    Args:
        results: List that receives a ``(name, ok)`` tuple for this check.
        name: Label of the check.
        ok: Whether the check passed; selects the PASS or FAIL tag.
        detail: Optional extra text shown after the name when non-empty.
    """
    status = PASS if ok else FAIL
    suffix = f" — {detail}" if detail else ""
    print(f" [{status}] {name}" + suffix)
    results.append((name, ok))
|
|
|
|
|
|
def print_summary(results: list):
    """Print a pass/fail summary for the collected check results.

    Args:
        results: Iterable-sized collection of ``(name, ok)`` tuples, as
            accumulated by ``report``.
    """
    # Names of every check whose outcome was falsy, in original order.
    failures = [label for label, outcome in results if not outcome]
    total = len(results)
    passed = total - len(failures)

    print(f"\n{'─'*55}")
    print(f"Results: {passed}/{total} passed", end="")

    if not failures:
        print(" — all good")
    else:
        print(f" ({len(failures)} failed)\n")
        print("Failed checks:")
        for label in failures:
            print(f" - {label}")
    print()
|
|
|
|
|
|
def tf(v):
    """Render a timing value in seconds, or an "n/a" marker when it is None."""
    if v is None:
        return " n/a"
    return f"{v:6.2f}s"
|
|
|
|
|
|
# ── HTTP helpers ──────────────────────────────────────────────────────────────
|
|
|
|
def get(url, timeout=5):
    """Issue a GET request and return ``(status, body_text)``.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.

    Errors raised by ``urllib`` are not caught here and reach the caller.
    """
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        status = response.status
        return status, response.read().decode()
|
|
|
|
|
|
def post_json(url, payload, timeout=10):
    """POST *payload* as JSON and return ``(status, decoded_json_body)``.

    Args:
        url: Address to post to.
        payload: JSON-serialisable object sent as the request body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Errors raised by ``urllib`` or by JSON decoding are not caught here.
    """
    body = json.dumps(payload).encode()
    request = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        status = response.status
        return status, json.loads(response.read().decode())
|
|
|
|
|
|
def check_sse(host, port, path):
    """Probe an event-stream endpoint and report whether it answers 200.

    Sends a GET with ``Accept: text/event-stream`` and inspects only the
    response status; the stream body is never read.

    Args:
        host: Host name or address to connect to.
        port: TCP port to connect to.
        path: Request path.

    Returns:
        ``(ok, detail)`` where ``ok`` is True only for HTTP 200 and
        ``detail`` is either ``"HTTP <status>"`` or the text of the
        exception that prevented the request from completing.
    """
    conn = None
    try:
        conn = http.client.HTTPConnection(host, port, timeout=5)
        conn.request("GET", path, headers={"Accept": "text/event-stream"})
        status = conn.getresponse().status
        return status == 200, f"HTTP {status}"
    except Exception as e:
        # Best-effort probe: any failure is reported, never raised.
        return False, str(e)
    finally:
        # Close on every path so a failed request does not leak the socket.
        if conn is not None:
            conn.close()
|
|
|
|
|
|
def qdrant_count():
    """Return ``points_count`` of the ``adolf_memories`` Qdrant collection.

    Any failure (request error, invalid JSON, unexpected shape) yields 0,
    as does a response that lacks the ``result`` or ``points_count`` field.
    """
    try:
        _status, raw = get(f"{QDRANT}/collections/adolf_memories")
        info = json.loads(raw)
        return info.get("result", {}).get("points_count", 0)
    except Exception:
        return 0
|
|
|
|
|
|
# ── log helpers ───────────────────────────────────────────────────────────────
|
|
|
|
def fetch_logs(since_s=600):
    """Return deepagents log lines from the last since_s seconds.

    Runs ``docker compose logs`` for the ``deepagents`` service with a
    15-second timeout. Any failure results in an empty list.
    """
    try:
        command = [
            "docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
            f"--since={int(since_s)}s", "--no-log-prefix",
        ]
        proc = subprocess.run(
            command, capture_output=True, text=True, timeout=15,
        )
        return proc.stdout.splitlines()
    except Exception:
        return []
|
|
|
|
|
|
def fetch_bifrost_logs(since_s=120):
    """Return bifrost container log lines from the last since_s seconds.

    Runs ``docker compose logs`` for the ``bifrost`` service with a
    10-second timeout. Any failure results in an empty list.
    """
    try:
        command = [
            "docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
            f"--since={int(since_s)}s", "--no-log-prefix",
        ]
        proc = subprocess.run(
            command, capture_output=True, text=True, timeout=10,
        )
        return proc.stdout.splitlines()
    except Exception:
        return []
|
|
|
|
|
|
def parse_run_block(lines, msg_prefix):
    """
    Extract reply and memory information for one message from log lines.

    The block starts at the LAST line that contains '[agent] running:'
    together with the first 50 characters of msg_prefix, and runs to the
    end of lines. Within it, the first "replied in <N>s [tier=<T>]" line
    marks the reply.

    Returns None when no such block exists or the reply line has not been
    logged yet. Otherwise returns a dict with keys:
        reply_total  — seconds parsed from the "replied in" line
        llm, send    — always None (kept so the dict shape stays stable)
        tier         — tier from the "replied in" line, else "unknown"
        reply_text   — text of an "[agent] reply_text:" line found within
                       the two lines after the reply; otherwise the last
                       non-empty "AIMessage:" text (lines containing "→"
                       are ignored) seen up to the reply, or None
        memory_s     — seconds from the first "[memory] stored in" line
                       after the reply, else None
        memory_error — True if "[memory] error" appears after the reply
                       before any "[memory] stored in" line
        _j           — index of the reply line relative to the block start
    """
    needle = msg_prefix[:50]
    starts = [
        idx for idx, entry in enumerate(lines)
        if "[agent] running:" in entry and needle in entry
    ]
    if not starts:
        return None

    # Only the most recent run of this message is of interest.
    tail = lines[starts[-1]:]

    reply_re = re.compile(r"replied in ([\d.]+)s(?:\s+tier=(\w+))?")
    ai_text = None
    info = None

    for pos, entry in enumerate(tail):
        if "AIMessage:" in entry and "→" not in entry:
            candidate = entry.split("AIMessage:", 1)[-1].strip()
            if candidate:
                ai_text = candidate

        hit = reply_re.search(entry)
        if hit is None:
            continue

        info = {
            "reply_total": float(hit.group(1)),
            "llm": None,
            "send": None,
            "tier": hit.group(2) or "unknown",
            "reply_text": ai_text,
            "memory_s": None,
            "memory_error": False,
            "_j": pos,
        }
        break

    if info is None:
        return None

    after_reply = info["_j"] + 1

    # An explicit reply_text line right after the reply takes precedence
    # over the AIMessage text collected above.
    marker = "[agent] reply_text:"
    for entry in tail[after_reply: after_reply + 2]:
        if entry.startswith(marker):
            info["reply_text"] = entry[len(marker):].strip()
            break

    # The first memory outcome (stored or error) after the reply wins.
    stored_re = re.compile(r"\[memory\] stored in ([\d.]+)s")
    for entry in tail[after_reply:]:
        stored = stored_re.search(entry)
        if stored:
            info["memory_s"] = float(stored.group(1))
            break
        if "[memory] error" in entry:
            info["memory_error"] = True
            break

    return info
|
|
|
|
|
|
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
    """
    Poll deepagents logs until the message is fully processed.

    Shows a live progress line. Returns timing dict or None on timeout.

    Args:
        label: Short tag shown in the progress line.
        msg_prefix: Message text used to locate the run block in the logs.
        timeout_s: Maximum number of seconds to keep polling.
        need_memory: When True, also wait until a memory outcome (stored
            or error) has been logged after the reply.

    Returns:
        The dict produced by ``parse_run_block`` once the reply (and, if
        requested, the memory outcome) is present; None on timeout.
    """
    poll_interval = 4
    t_start = time.monotonic()
    deadline = t_start + timeout_s
    last_result = None

    while time.monotonic() < deadline:
        # Request logs for the whole polling period plus a 90 s margin.
        since = int(time.monotonic() - t_start) + 90
        lines = fetch_logs(since_s=since)
        result = parse_run_block(lines, msg_prefix)

        if result:
            last_result = result
            has_mem = result["memory_s"] is not None or result["memory_error"]
            if (not need_memory) or has_mem:
                elapsed = time.monotonic() - t_start
                print(f"\r [{label}] done after {elapsed:.0f}s{' ' * 30}")
                return result

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))

        # Report real wall-clock progress: fetching logs can take far longer
        # than the sleep interval, so counting ticks would under-report.
        now = time.monotonic()
        elapsed_s = int(now - t_start)
        rem = max(0, int(deadline - now))
        if last_result:
            phase = "waiting for memory..." if need_memory else "done"
        else:
            phase = "waiting for LLM reply..."
        print(f"\r [{label}] {elapsed_s}s elapsed, {rem}s left — {phase} ", end="", flush=True)

    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None
|