- Three-tier routing: light (router answers directly ~3s), medium (qwen3:4b + tools ~60s), complex (/think prefix → qwen3:8b + subagents ~140s) - Router: qwen2.5:1.5b, temp=0, regex pre-classifier + raw-text LLM classify - VRAMManager: explicit flush/poll/prewarm to prevent Ollama CPU-spill bug - agent_factory: build_medium_agent and build_complex_agent using deepagents (TodoListMiddleware + SubAgentMiddleware with research/memory subagents) - Fix: split Telegram replies >4000 chars into multiple messages - Benchmark: 30 questions (easy/medium/hard) — 10/10/10 verified passing easy→light, medium→medium, hard→complex with VRAM flush confirmed Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
906 lines
38 KiB
Python
906 lines
38 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Adolf pipeline integration test with end-to-end timing profiling.
|
||
|
||
Tests:
|
||
1. Service health (deepagents, openmemory, grammy MCP SSE)
|
||
2. GPU Ollama models
|
||
3. CPU Ollama models
|
||
4. Qdrant collection + vector dims
|
||
5. SearXNG
|
||
6. Name store — "remember that your name is <RandomName>"
|
||
7. Qdrant point added after store
|
||
8. Name recall — "what is your name?" → reply contains <RandomName>
|
||
9. Timing profile + bottleneck report
|
||
10. Easy benchmark — 10 easy questions → all must route to light
|
||
11. Medium benchmark — 10 medium questions → must route to medium (or light, never complex)
|
||
12. Hard benchmark — 10 /think questions → all must route to complex; VRAM flush verified
|
||
|
||
Usage:
|
||
python3 test_pipeline.py [--chat-id CHAT_ID]
|
||
[--bench-only] skip sections 1-9, run 10+11+12
|
||
[--easy-only] skip 1-9 and 11+12, run only section 10
|
||
[--medium-only] skip 1-9 and 10+12, run only section 11
|
||
[--hard-only] skip 1-9 and 10+11, run only section 12
|
||
[--no-bench] skip sections 10-12
|
||
|
||
Timing is extracted from deepagents container logs, not estimated from sleeps.
|
||
"""
|
||
|
||
import argparse
|
||
import http.client
|
||
import json
|
||
import random
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
import urllib.request
|
||
|
||
# ── config ────────────────────────────────────────────────────────────────────
# Service endpoints, all on localhost (docker-compose port mappings).
DEEPAGENTS = "http://localhost:8000"     # agent HTTP API (POST /chat, GET /health)
OPENMEMORY = "http://localhost:8765"     # openmemory MCP SSE endpoint
GRAMMY_HOST = "localhost"                # grammy (Telegram bridge) MCP SSE host
GRAMMY_PORT = 3001
OLLAMA_GPU = "http://localhost:11436"    # GPU Ollama — chat models
OLLAMA_CPU = "http://localhost:11435"    # CPU Ollama — embedding model
QDRANT = "http://localhost:6333"         # vector store holding adolf_memories
SEARXNG = "http://localhost:11437"       # metasearch backend for the agent's tools
COMPOSE_FILE = "/home/alvis/agap_git/adolf/docker-compose.yml"  # used by fetch_logs()
DEFAULT_CHAT_ID = "346967270"            # Telegram chat used for test messages

# Random distinctive names for the store/recall memory test — unlikely to be
# produced by chance, so a recall reply containing one is strong evidence.
NAMES = [
    "Maximilian", "Cornelius", "Zephyr", "Archibald", "Balthazar",
    "Ignatius", "Lysander", "Octavian", "Reginald", "Sylvester",
]
|
||
|
||
# ── benchmark questions ───────────────────────────────────────────────────────
# Three question pools used by sections 10-12. Routing expectations:
#   easy   → light tier (router answers directly)
#   medium → medium tier (tools needed; light tolerated for some, never complex)
#   hard   → complex tier (each carries the explicit "/think " escalation prefix)
BENCHMARK = {
    "easy": [
        "hi",
        "what is 2+2?",
        "what is the capital of France?",
        "tell me a short joke",
        "how are you doing today?",
        "thanks!",
        "what day comes after Wednesday?",
        "name the three primary colors",
        "is the sky blue?",
        "what does CPU stand for?",
    ],
    "medium": [
        "what is the current weather in Berlin?",
        "find the latest news about artificial intelligence",
        "what is the current price of Bitcoin?",
        "search for a good pasta carbonara recipe",
        "what movies are in theaters this week?",
        "find Python tutorials for beginners",
        "who won the last FIFA World Cup?",
        "do you remember what we talked about before?",
        "search for the best coffee shops in Tokyo",
        "what is happening in the tech industry this week?",
    ],
    "hard": [
        "/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
        "/think research the history of artificial intelligence and create a timeline of key milestones",
        "/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
        "/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
        "/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
        "/think research quantum computing: explain the key concepts and how it differs from classical computing",
        "/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
        "/think create a comprehensive Docker deployment guide covering best practices for production",
        "/think research climate change: summarize the latest IPCC findings and key data points",
        "/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
    ],
}

# ANSI-colored status tags used throughout the console output.
PASS = "\033[32mPASS\033[0m"
FAIL = "\033[31mFAIL\033[0m"
INFO = "\033[36mINFO\033[0m"
WARN = "\033[33mWARN\033[0m"

# Global accumulators filled in as the script runs.
results = []   # list of (check_name, bool) — consumed by the final summary
timings = {}   # label → float seconds | None — consumed by section 9
|
||
|
||
|
||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
def report(name, ok, detail=""):
    """Print a colored PASS/FAIL line and record (name, ok) in the global results list."""
    tag = PASS if ok else FAIL
    # detail is optional extra context appended after an em-dash
    print(f" [{tag}] {name}" + (f" — {detail}" if detail else ""))
    results.append((name, ok))
|
||
|
||
|
||
def tf(v):
    """Render a timing value as a fixed-width seconds string; 'n/a' for None."""
    if v is None:
        return " n/a"
    return f"{v:6.2f}s"
|
||
|
||
|
||
def get(url, timeout=5):
    """HTTP GET *url*; return (status_code, decoded response body)."""
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read().decode()
        return response.status, body
|
||
|
||
|
||
def post_json(url, payload, timeout=10):
    """POST *payload* as a JSON body to *url*; return (status_code, parsed JSON reply)."""
    encoded = json.dumps(payload).encode()
    request = urllib.request.Request(
        url,
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        parsed = json.loads(response.read().decode())
        return response.status, parsed
|
||
|
||
|
||
def check_sse(host, port, path):
    """Probe an SSE endpoint; return (ok, detail) without consuming the stream."""
    try:
        connection = http.client.HTTPConnection(host, port, timeout=5)
        connection.request("GET", path, headers={"Accept": "text/event-stream"})
        response = connection.getresponse()
        connection.close()
    except Exception as exc:
        return False, str(exc)
    # Any non-200 status counts as unreachable; report the code either way.
    return response.status == 200, f"HTTP {response.status}"
|
||
|
||
|
||
def qdrant_count():
    """Return the points_count of the adolf_memories collection, or 0 on any error."""
    try:
        _, body = get(f"{QDRANT}/collections/adolf_memories")
        payload = json.loads(body)
        result = payload.get("result", {})
        return result.get("points_count", 0)
    except Exception:
        # Collection missing / Qdrant down / malformed JSON — treat all as empty.
        return 0
|
||
|
||
|
||
def fetch_logs(since_s=600):
    """Return deepagents log lines from the last since_s seconds."""
    command = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=15,
        )
    except Exception:
        # docker unavailable or the command hung — callers treat [] as "no logs yet"
        return []
    return completed.stdout.splitlines()
|
||
|
||
|
||
def parse_run_block(lines, msg_prefix):
    """
    Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
    Extracts reply timing, tier, and memory timing from that block.

    Returns dict or None if the reply has not appeared in logs yet.
    Dict keys:
        reply_total, llm, send, tier, reply_text — from "[agent] replied in ..."
        memory_s — from "[memory] stored in ..."
        memory_error — True if "[memory] error" found
    """
    # Only the first 50 chars are matched — log lines may truncate the message.
    search = msg_prefix[:50]

    # Locate the LAST run marker for this message (keep updating start_idx).
    start_idx = None
    for i, line in enumerate(lines):
        if "[agent] running:" in line and search in line:
            start_idx = i
    if start_idx is None:
        return None

    block = lines[start_idx:]
    last_ai_text = None
    reply_data = None
    reply_idx = None  # index (within block) of the "replied in" line

    for j, line in enumerate(block):
        # Track last non-tool AIMessage (the final reply); tool-call lines
        # contain "→" and are skipped.
        if "AIMessage:" in line and "→" not in line:
            txt = line.split("AIMessage:", 1)[-1].strip()
            if txt:
                last_ai_text = txt

        # NOTE: for the light tier the router reply is stored in
        # _conversation_buffers directly, so there may be no AIMessage log
        # line at all — reply_text can legitimately stay None.

        m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
        if m:
            # Extract optional tier tag from the same line
            tier_m = re.search(r"\btier=(\w+)", line)
            reply_data = {
                "reply_total": float(m.group(1)),
                "llm": float(m.group(2)),
                "send": float(m.group(3)),
                "tier": tier_m.group(1) if tier_m else "unknown",
                "reply_text": last_ai_text,
                "memory_s": None,
                "memory_error": False,
            }
            reply_idx = j
            break

    if reply_data is None:
        return None  # reply not in logs yet

    # Memory line can appear after the next "[agent] running:" — no stop condition
    for line in block[reply_idx + 1:]:
        mm = re.search(r"\[memory\] stored in ([\d.]+)s", line)
        if mm:
            reply_data["memory_s"] = float(mm.group(1))
            break
        if "[memory] error" in line:
            reply_data["memory_error"] = True
            break

    return reply_data
|
||
|
||
|
||
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
    """
    Poll deepagents logs until the message is fully processed.
    Shows a live progress line.
    Returns timing dict or None on timeout.

    When need_memory is True, "fully processed" means both the reply line and
    the async memory-store result ([memory] stored / [memory] error) appeared.
    """
    t_start = time.monotonic()
    deadline = t_start + timeout_s
    tick = 0
    last_result = None  # last partial parse (reply seen, memory still pending)

    while time.monotonic() < deadline:
        # Window grows with elapsed time — never miss a line that appeared late
        since = int(time.monotonic() - t_start) + 90
        lines = fetch_logs(since_s=since)
        result = parse_run_block(lines, msg_prefix)

        if result:
            last_result = result
            has_mem = result["memory_s"] is not None or result["memory_error"]
            if (not need_memory) or has_mem:
                elapsed = time.monotonic() - t_start
                # Trailing spaces overwrite the previous progress line
                print(f"\r [{label}] done after {elapsed:.0f}s{' ' * 30}")
                return result

        time.sleep(4)
        tick += 1
        rem = int(deadline - time.monotonic())
        # Progress phase reflects what we are still waiting on
        if last_result:
            phase = "waiting for memory..." if need_memory else "done"
        else:
            phase = "waiting for LLM reply..."
        print(f"\r [{label}] {tick*4}s elapsed, {rem}s left — {phase} ", end="", flush=True)

    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None
|
||
|
||
|
||
# ── args ──────────────────────────────────────────────────────────────────────
# Help strings below are kept in sync with the module docstring (sections 10-12).
parser = argparse.ArgumentParser(description="Adolf pipeline test")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
parser.add_argument("--bench-only", action="store_true",
                    help="Skip sections 1-9, run sections 10+11+12 (all benchmarks)")
parser.add_argument("--easy-only", action="store_true",
                    help="Skip sections 1-9 and 11+12, run only section 10 (easy benchmark)")
parser.add_argument("--medium-only", action="store_true",
                    help="Skip sections 1-9 and 10+12, run only section 11 (medium benchmark)")
parser.add_argument("--hard-only", action="store_true",
                    help="Skip sections 1-9 and 10+11, run only section 12 (hard benchmark)")
parser.add_argument("--no-bench", action="store_true",
                    help="Skip sections 10-12 (all benchmarks)")
args = parser.parse_args()
CHAT_ID = args.chat_id

# Derived flags for readability:
#   _skip_pipeline — any *-only flag skips the sections 1-9 pipeline checks
#   _run_easy/_run_medium/_run_hard — each benchmark runs unless --no-bench or
#   a different *-only flag excludes it (--bench-only runs all three)
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only

# Fresh random name per run so a stale memory from a previous run cannot pass 6-8
random_name = random.choice(NAMES)
|
||
|
||
# Run header (only when the full pipeline sections 1-9 are active)
if not _skip_pipeline:
    print(f"\n Test name : \033[1m{random_name}\033[0m")
    print(f" Chat ID : {CHAT_ID}")


# ── 1. service health ─────────────────────────────────────────────────────────
if not _skip_pipeline:
    print(f"\n[{INFO}] 1. Service health")
    t0 = time.monotonic()

    # deepagents must report agent_ready before anything else is worth testing
    try:
        status, body = get(f"{DEEPAGENTS}/health")
        data = json.loads(body)
        ok = status == 200 and data.get("agent_ready") is True
        report("deepagents /health — agent_ready", ok, f"agent_ready={data.get('agent_ready')}")
    except Exception as e:
        report("deepagents /health", False, str(e))

    # Both MCP bridges expose SSE endpoints; reachability only, stream not consumed
    ok, detail = check_sse("localhost", 8765, "/sse")
    report("openmemory /sse reachable", ok, detail)

    ok, detail = check_sse(GRAMMY_HOST, GRAMMY_PORT, "/sse")
    report("grammy /sse reachable", ok, detail)

    timings["health_check"] = time.monotonic() - t0
|
||
|
||
|
||
# ── 2. GPU Ollama ─────────────────────────────────────────────────────────────
|
||
if not _skip_pipeline:
|
||
print(f"\n[{INFO}] 2. GPU Ollama (port 11436)")
|
||
t0 = time.monotonic()
|
||
|
||
try:
|
||
status, body = get(f"{OLLAMA_GPU}/api/tags")
|
||
models = [m["name"] for m in json.loads(body).get("models", [])]
|
||
has_qwen = any("qwen3" in m for m in models)
|
||
report("GPU Ollama reachable", True, f"models: {models}")
|
||
report("qwen3:8b present", has_qwen)
|
||
except Exception as e:
|
||
report("GPU Ollama reachable", False, str(e))
|
||
report("qwen3:8b present", False, "skipped")
|
||
|
||
timings["gpu_ollama_ping"] = time.monotonic() - t0
|
||
|
||
|
||
# ── 3. CPU Ollama ─────────────────────────────────────────────────────────────
|
||
if not _skip_pipeline:
|
||
print(f"\n[{INFO}] 3. CPU Ollama (port 11435)")
|
||
t0 = time.monotonic()
|
||
|
||
try:
|
||
status, body = get(f"{OLLAMA_CPU}/api/tags")
|
||
models = [m["name"] for m in json.loads(body).get("models", [])]
|
||
has_embed = any("nomic-embed-text" in m for m in models)
|
||
report("CPU Ollama reachable", True, f"models: {models}")
|
||
report("nomic-embed-text present", has_embed)
|
||
except Exception as e:
|
||
report("CPU Ollama reachable", False, str(e))
|
||
report("nomic-embed-text present", False, "skipped")
|
||
|
||
timings["cpu_ollama_ping"] = time.monotonic() - t0
|
||
|
||
|
||
# ── 4. Qdrant ─────────────────────────────────────────────────────────────────
|
||
if not _skip_pipeline:
|
||
print(f"\n[{INFO}] 4. Qdrant (port 6333)")
|
||
t0 = time.monotonic()
|
||
|
||
try:
|
||
status, body = get(f"{QDRANT}/collections")
|
||
cols = [c["name"] for c in json.loads(body).get("result", {}).get("collections", [])]
|
||
report("Qdrant reachable", True, f"collections: {cols}")
|
||
report("adolf_memories collection exists", "adolf_memories" in cols)
|
||
except Exception as e:
|
||
report("Qdrant reachable", False, str(e))
|
||
report("adolf_memories collection exists", False, "skipped")
|
||
|
||
try:
|
||
status, body = get(f"{QDRANT}/collections/adolf_memories")
|
||
info = json.loads(body).get("result", {})
|
||
dims = info.get("config", {}).get("params", {}).get("vectors", {}).get("size")
|
||
report("vector dims = 768", dims == 768, f"got {dims}")
|
||
except Exception as e:
|
||
report("adolf_memories collection info", False, str(e))
|
||
|
||
timings["qdrant_ping"] = time.monotonic() - t0
|
||
|
||
|
||
# ── 5. SearXNG ────────────────────────────────────────────────────────────────
|
||
if not _skip_pipeline:
|
||
print(f"\n[{INFO}] 5. SearXNG (port 11437)")
|
||
t0 = time.monotonic()
|
||
|
||
try:
|
||
status, body = get(f"{SEARXNG}/search?q=test&format=json", timeout=15)
|
||
elapsed = time.monotonic() - t0
|
||
n = len(json.loads(body).get("results", []))
|
||
report("SearXNG reachable + JSON results", status == 200 and n > 0, f"{n} results in {elapsed:.1f}s")
|
||
report("SearXNG response < 5s", elapsed < 5, f"{elapsed:.2f}s")
|
||
timings["searxng_latency"] = elapsed
|
||
except Exception as e:
|
||
report("SearXNG reachable", False, str(e))
|
||
report("SearXNG response < 5s", False, "skipped")
|
||
timings["searxng_latency"] = None
|
||
|
||
timings["searxng_check"] = time.monotonic() - t0
|
||
|
||
|
||
# ── 6–8. Name memory pipeline ─────────────────────────────────────────────────
# End-to-end memory round trip: store a random name, verify a Qdrant point was
# written, then ask the agent to recall the name in a fresh turn.
if not _skip_pipeline:
    print(f"\n[{INFO}] 6–8. Name memory pipeline")
    print(f" chat_id={CHAT_ID} name={random_name}")

    store_msg = f"remember that your name is {random_name}"
    recall_msg = "what is your name?"

    pts_before = qdrant_count()
    print(f" Qdrant points before: {pts_before}")

    # ── 6. Send store message ─────────────────────────────────────────────────
    print(f"\n [store] '{store_msg}'")
    t_store = time.monotonic()

    # /chat is async: it must ACK with 202 well under a second
    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": store_msg, "chat_id": CHAT_ID}, timeout=5)
        t_accept = time.monotonic() - t_store
        report("POST /chat (store) returns 202 immediately",
               status == 202 and t_accept < 1, f"status={status}, t={t_accept:.3f}s")
        timings["store_http_accept"] = t_accept
    except Exception as e:
        report("POST /chat (store)", False, str(e))
        sys.exit(1)  # nothing downstream can pass without a stored name

    # Wait for reply AND the async memory write before checking Qdrant
    store = wait_for("store", store_msg, timeout_s=220, need_memory=True)

    if store:
        timings["store_llm"] = store["llm"]
        timings["store_send"] = store["send"]
        timings["store_reply"] = store["reply_total"]
        timings["store_memory"] = store["memory_s"]
        report("Agent replied to store message", True,
               f"{store['reply_total']:.1f}s total llm={store['llm']:.1f}s send={store['send']:.1f}s tier={store['tier']}")
        if store["memory_s"] is not None:
            report("Memory stored without error", True, f"{store['memory_s']:.1f}s")
        elif store["memory_error"]:
            report("Memory stored without error", False, "error in [memory] log")
        else:
            report("Memory stored without error", False, "not found in logs (still running?)")
        print(f" Store reply: {store['reply_text']!r}")
    else:
        report("Agent replied to store message", False, "timeout")
        report("Memory stored without error", False, "timeout")
        sys.exit(1)

    # ── 7. Verify Qdrant ──────────────────────────────────────────────────────
    # The memory write must have materialized as at least one new vector point
    pts_after = qdrant_count()
    new_pts = pts_after - pts_before
    report("New memory point(s) added to Qdrant", new_pts > 0,
           f"{pts_before} → {pts_after} (+{new_pts})")
    timings["qdrant_new_points"] = new_pts

    # ── 8. Send recall message ────────────────────────────────────────────────
    print(f"\n [recall] '{recall_msg}'")
    t_recall = time.monotonic()

    try:
        status, _ = post_json(f"{DEEPAGENTS}/chat",
                              {"message": recall_msg, "chat_id": CHAT_ID}, timeout=5)
        t_accept2 = time.monotonic() - t_recall
        report("POST /chat (recall) returns 202 immediately",
               status == 202 and t_accept2 < 1, f"status={status}, t={t_accept2:.3f}s")
        timings["recall_http_accept"] = t_accept2
    except Exception as e:
        report("POST /chat (recall)", False, str(e))

    # Recall only needs the reply — no memory write is expected for a question
    recall = wait_for("recall", recall_msg, timeout_s=160, need_memory=False)

    if recall:
        timings["recall_llm"] = recall["llm"]
        timings["recall_send"] = recall["send"]
        timings["recall_reply"] = recall["reply_total"]
        report("Agent replied to recall message", True,
               f"{recall['reply_total']:.1f}s total llm={recall['llm']:.1f}s send={recall['send']:.1f}s tier={recall['tier']}")
        # Case-insensitive containment — the reply may embed the name in a sentence
        reply_text = recall["reply_text"] or ""
        name_in_reply = random_name.lower() in reply_text.lower()
        report(f"Reply contains '{random_name}'", name_in_reply,
               f"reply: {reply_text[:120]!r}")
    else:
        report("Agent replied to recall message", False, "timeout")
        report(f"Reply contains '{random_name}'", False, "no reply")
|
||
|
||
|
||
# ── 9. Timing profile ─────────────────────────────────────────────────────────
# Pretty-print the timings collected in sections 1-8, then a bar chart ranking
# the slowest stages so the bottleneck is obvious at a glance.
if not _skip_pipeline:
    print(f"\n[{INFO}] 9. Timing profile")

    W = 36  # label column width

    print(f"\n {'Stage':<{W}} {'Time':>8}")
    print(f" {'─'*W} {'─'*8}")

    # (display label, timings key) pairs for the store and recall turns
    rows_store = [
        ("[GPU] HTTP accept — store turn", "store_http_accept"),
        ("[GPU] qwen3:Xb inference — store turn","store_llm"),
        ("[GPU] Telegram send — store turn", "store_send"),
        ("[GPU] Total reply latency — store", "store_reply"),
        ("[GPU] qwen2.5:1.5b+embed — async mem", "store_memory"),
    ]
    rows_recall = [
        ("[GPU] HTTP accept — recall turn", "recall_http_accept"),
        ("[GPU] qwen3:Xb inference — recall", "recall_llm"),
        ("[GPU] Telegram send — recall turn", "recall_send"),
        ("[GPU] Total reply latency — recall", "recall_reply"),
    ]

    for label, key in rows_store:
        v = timings.get(key)
        print(f" {label:<{W}} {tf(v):>8}")

    print(f" {'─'*W} {'─'*8}")

    for label, key in rows_recall:
        v = timings.get(key)
        print(f" {label:<{W}} {tf(v):>8}")

    # Bottleneck bar chart
    print(f"\n Bottleneck analysis (each █ ≈ 5s):")
    print(f" {'─'*(W+12)}")

    candidates = [
        ("[GPU] qwen3:Xb — store reply ", timings.get("store_llm") or 0),
        ("[GPU] qwen3:Xb — recall reply", timings.get("recall_llm") or 0),
        ("[GPU] qwen2.5:1.5b+embed (async)", timings.get("store_memory") or 0),
        ("[net] SearXNG ", timings.get("searxng_latency") or 0),
    ]
    candidates.sort(key=lambda x: x[1], reverse=True)

    # Loop-invariant: the percentage denominator is the same for every row,
    # so compute it once instead of on each iteration.
    total_pipeline = (timings.get("store_reply") or 0) + (timings.get("store_memory") or 0)

    for label, t in candidates:
        bar = "█" * min(int(t / 5), 24)  # capped at 24 cells ≈ 2 minutes
        pct = ""
        if total_pipeline > 0:
            pct = f" {t/total_pipeline*100:4.0f}%"
        print(f" {label} {t:6.1f}s {bar}{pct}")

    print()
|
||
|
||
|
||
# ── 10. Tier routing benchmark — easy questions → light path ──────────────────
# Every easy question must be answered by the light tier (router answers
# directly); anything else counts as a misroute.
if _run_easy:
    print(f"\n[{INFO}] 10. Tier routing benchmark")
    print(f" Sending {len(BENCHMARK['easy'])} easy questions — all must route to 'light'")
    print(f" Chat ID: {CHAT_ID}")
    print()

    bench_results = []  # list of (question, tier, latency_s, ok)
    LIGHT_TIMEOUT = 60  # seconds — light is fast but may queue behind prior messages

    for i, question in enumerate(BENCHMARK["easy"], 1):
        tag = f"easy-{i:02d}"
        short_q = question[:55]
        print(f" [{tag}] {short_q!r}")

        # Send
        t_send = time.monotonic()
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                bench_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            bench_results.append((question, "?", None, False))
            continue

        # Poll for reply
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < LIGHT_TIMEOUT:
            # Log window grows with elapsed time plus a safety margin
            since = int(time.monotonic() - t_start) + 30
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(1)

        # NOTE(review): 'elapsed' is currently unused — the reported latency
        # comes from the log-parsed reply_total instead.
        elapsed = time.monotonic() - t_send

        if not found:
            print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
            bench_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        is_light = (tier == "light")
        tag_str = PASS if is_light else FAIL
        print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        bench_results.append((question, tier, found["reply_total"], is_light))

        # Brief pause between questions to keep logs clean
        time.sleep(1)

    # Summary table
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*50}")
    for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else "✗"
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")

    light_count = sum(1 for _, _, _, ok in bench_results if ok)
    total_bench = len(bench_results)
    lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
    avg_lat = sum(lats) / len(lats) if lats else 0

    print(f"\n Light-path score: {light_count}/{total_bench}")
    if lats:
        print(f" Avg latency (light): {avg_lat:.1f}s "
              f"min={min(lats):.1f}s max={max(lats):.1f}s")

    report(f"All easy questions routed to light ({light_count}/{total_bench})",
           light_count == total_bench,
           f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
|
||
|
||
|
||
# ── 11. Medium benchmark — medium questions → medium or light, never complex ──
# Medium questions should use the tool-capable medium tier; light is tolerated
# for the whitelisted factual questions; complex escalation or timeout fails.
if _run_medium:
    print(f"\n[{INFO}] 11. Medium routing benchmark")
    print(f" Sending {len(BENCHMARK['medium'])} medium questions")
    print(f" Expected: tier=medium (needs tools). Light is acceptable for factual questions.")
    print(f" Fail condition: tier=complex or timeout.")
    print(f" Chat ID: {CHAT_ID}")
    print()

    # Questions where light is a valid alternative (model may know from training data)
    LIGHT_ACCEPTABLE = {
        "who won the last FIFA World Cup?",
        "search for a good pasta carbonara recipe",
        "find Python tutorials for beginners",
        "search for the best coffee shops in Tokyo",
    }

    med_results = []  # list of (question, tier, latency_s, correct)
    MEDIUM_TIMEOUT = 120  # seconds — medium takes 20-100s, allow for queue buildup

    for i, question in enumerate(BENCHMARK["medium"], 1):
        tag = f"med-{i:02d}"
        short_q = question[:60]
        print(f" [{tag}] {short_q!r}")

        # Send
        t_send = time.monotonic()
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                med_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            med_results.append((question, "?", None, False))
            continue

        # Poll for reply
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < MEDIUM_TIMEOUT:
            since = int(time.monotonic() - t_start) + 60
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question)
            if found:
                break
            time.sleep(3)

        # NOTE(review): 'elapsed' is currently unused — latency is reported
        # from the log-parsed reply_total instead.
        elapsed = time.monotonic() - t_send

        if not found:
            print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
            med_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")
        light_ok = question in LIGHT_ACCEPTABLE

        # Classify the routing outcome for this question
        if tier == "medium":
            correct = True
            label = PASS
            note = "medium ✓"
        elif tier == "light":
            correct = light_ok  # light is only acceptable for certain questions
            label = WARN if not light_ok else PASS
            note = "light (acceptable)" if light_ok else "light (should be medium)"
        elif tier == "complex":
            correct = False
            label = FAIL
            note = "complex — wrong escalation"
        else:
            correct = False
            label = FAIL
            note = f"unknown tier {tier!r}"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
        med_results.append((question, tier, found["reply_total"], correct))

        # Brief pause between questions
        time.sleep(1)

    # Summary table
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if ok else ("~" if tier == "light" else "✗")
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")

    total_med = len(med_results)
    medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
    light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
    complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
    timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
    light_misroute = sum(
        1 for q, tier, _, _ in med_results
        if tier == "light" and q not in LIGHT_ACCEPTABLE
    )
    lats = [lat for _, _, lat, _ in med_results if lat is not None]
    # NOTE(review): 'correct_count' and 'all_ok' below are computed but unused.
    correct_count = medium_count + (light_count - light_misroute)

    print(f"\n Breakdown: medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
    if light_misroute:
        print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected (check reply quality)")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    no_complex = complex_count == 0
    no_timeout = timeout_count == 0
    all_ok = no_complex and no_timeout

    report(
        f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
        no_complex,
        f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}",
    )
    # Timeouts get a dedicated failing check so they surface in the summary
    if not no_timeout:
        report(
            f"Medium questions: all completed within {MEDIUM_TIMEOUT}s",
            False,
            f"{timeout_count} question(s) timed out (increase MEDIUM_TIMEOUT or check agent logs)",
        )
|
||
|
||
|
||
# ── 12. Hard benchmark — /think questions → complex tier + VRAM flush verified ─
# Every /think question must escalate to complex (medium is tolerated as the
# documented VRAM-eviction fallback); light or timeout fails.
if _run_hard:
    print(f"\n[{INFO}] 12. Hard routing benchmark")
    print(f" Sending {len(BENCHMARK['hard'])} /think questions — all must route to 'complex'")
    print(f" Verifies: /think prefix → force_complex=True → VRAM flush → qwen3:8b inference")
    print(f" Acceptable fallback: 'medium' if VRAM eviction timed out (logged warning)")
    print(f" Fail condition: tier=light or timeout")
    print(f" Chat ID: {CHAT_ID}")
    print()

    hard_results = []  # list of (question, tier, latency_s, ok)
    COMPLEX_TIMEOUT = 300  # seconds — complex takes 60-180s + VRAM flush overhead

    # Log markers we expect to see for complex path
    _VRAM_ENTER = "[vram] enter_complex_mode"
    _VRAM_EXIT = "[vram] exit_complex_mode"

    for i, question in enumerate(BENCHMARK["hard"], 1):
        tag = f"hard-{i:02d}"
        # Strip /think prefix for display
        short_q = question[len("/think "):].strip()[:60]
        print(f" [{tag}] /think {short_q!r}")

        # Snapshot log window start time
        t_send = time.monotonic()
        try:
            status, _ = post_json(f"{DEEPAGENTS}/chat",
                                  {"message": question, "chat_id": CHAT_ID}, timeout=5)
            if status != 202:
                print(f" → [{FAIL}] POST returned {status}")
                hard_results.append((question, "?", None, False))
                continue
        except Exception as e:
            print(f" → [{FAIL}] POST error: {e}")
            hard_results.append((question, "?", None, False))
            continue

        # Poll for reply — match on the message WITHOUT the /think prefix,
        # since the agent logs the stripped message text.
        t_start = time.monotonic()
        found = None
        while time.monotonic() - t_start < COMPLEX_TIMEOUT:
            since = int(time.monotonic() - t_start) + 90
            lines = fetch_logs(since_s=since)
            found = parse_run_block(lines, question[len("/think "):].strip())
            if found:
                break
            time.sleep(5)

        elapsed = time.monotonic() - t_send

        if not found:
            print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
            hard_results.append((question, "timeout", None, False))
            continue

        tier = found.get("tier", "unknown")

        if tier == "complex":
            ok = True
            label = PASS
            note = "complex ✓"
        elif tier == "medium":
            # Acceptable fallback if VRAM eviction timed out
            ok = True
            label = WARN
            note = "medium (VRAM fallback — check [vram] logs)"
        else:
            ok = False
            label = FAIL
            note = f"tier={tier} — unexpected"

        # Check the recent log window for the VRAM enter marker. This is a
        # coarse check (last 200 lines of the window), not per-question
        # attribution — good enough to confirm the flush path ran.
        lines_block = fetch_logs(since_s=int(elapsed) + 120)
        recent = "\n".join(lines_block[-200:])
        vram_enter_seen = _VRAM_ENTER in recent

        vram_note = ""
        if tier == "complex":
            if vram_enter_seen:
                vram_note = " [vram:flush✓]"
            else:
                vram_note = f" [{WARN}:no vram flush log]"

        print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
        hard_results.append((question, tier, found["reply_total"], ok))

        # Pause to let exit_complex_mode (see _VRAM_EXIT marker) background task
        # complete before next question (flushes qwen3:8b and pre-warms
        # 4b+router — avoids VRAM conflict on next enter)
        time.sleep(5)

    # Summary table
    print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
    print(f" {'─'*4} {'─'*8} {'─'*8} {'─'*55}")
    for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
        lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
        ok_str = "✓" if tier == "complex" else ("~" if tier == "medium" else "✗")
        short = q[len("/think "):].strip()[:55]
        print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")

    total_hard = len(hard_results)
    complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
    medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
    light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
    timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
    lats = [lat for _, _, lat, _ in hard_results if lat is not None]

    print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} light={light_count} timeout={timeout_count}")
    if medium_fb:
        print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
    if light_count:
        print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
    if lats:
        print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")

    no_light = light_count == 0
    no_timeout = timeout_count == 0

    report(
        f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
        no_light and no_timeout,
        f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}",
    )
|
||
|
||
|
||
# ── summary ───────────────────────────────────────────────────────────────────
# Final tally of every report() call made above, then the benchmark question
# pools for reference.
print(f"\n{'─'*55}")
total = len(results)
passed = sum(1 for _, ok in results if ok)
failed = total - passed
print(f"Results: {passed}/{total} passed", end="")
if failed:
    print(f" ({failed} failed)\n")
    print("Failed checks:")
    for name, ok in results:
        if not ok:
            print(f" - {name}")
else:
    print(" — all good")
print()

# Print benchmark reference
print(f"[{INFO}] Benchmark questions reference:")
for tier_name, questions in BENCHMARK.items():
    print(f"\n {tier_name.upper()} ({len(questions)} questions):")
    for j, q in enumerate(questions, 1):
        print(f" {j:2d}. {q}")
print()
|