Files
AgapHost/adolf/test_pipeline.py
Alvis 09a93c661e Add three-tier model routing with VRAM management and benchmark suite
- Three-tier routing: light (router answers directly ~3s), medium (qwen3:4b
  + tools ~60s), complex (/think prefix → qwen3:8b + subagents ~140s)
- Router: qwen2.5:1.5b, temp=0, regex pre-classifier + raw-text LLM classify
- VRAMManager: explicit flush/poll/prewarm to prevent Ollama CPU-spill bug
- agent_factory: build_medium_agent and build_complex_agent using deepagents
  (TodoListMiddleware + SubAgentMiddleware with research/memory subagents)
- Fix: split Telegram replies >4000 chars into multiple messages
- Benchmark: 30 questions (easy/medium/hard) — 10/10/10 verified passing
  easy→light, medium→medium, hard→complex with VRAM flush confirmed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-28 17:54:51 +00:00

906 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Adolf pipeline integration test with end-to-end timing profiling.
Tests:
1. Service health (deepagents, openmemory, grammy MCP SSE)
2. GPU Ollama models
3. CPU Ollama models
4. Qdrant collection + vector dims
5. SearXNG
6. Name store — "remember that your name is <RandomName>"
7. Qdrant point added after store
8. Name recall — "what is your name?" → reply contains <RandomName>
9. Timing profile + bottleneck report
10. Easy benchmark — 10 easy questions → all must route to light
11. Medium benchmark — 10 medium questions → must route to medium (or light, never complex)
12. Hard benchmark — 10 /think questions → all must route to complex; VRAM flush verified
Usage:
python3 test_pipeline.py [--chat-id CHAT_ID]
[--bench-only] skip sections 1-9, run 10+11+12
[--easy-only] skip 1-9 and 11+12, run only section 10
[--medium-only] skip 1-9 and 10+12, run only section 11
[--hard-only] skip 1-9 and 10+11, run only section 12
[--no-bench] skip sections 10-12
Timing is extracted from deepagents container logs, not estimated from sleeps.
"""
import argparse
import http.client
import json
import random
import re
import subprocess
import sys
import time
import urllib.request
# ── config ────────────────────────────────────────────────────────────────────
DEEPAGENTS = "http://localhost:8000"
OPENMEMORY = "http://localhost:8765"
GRAMMY_HOST = "localhost"
GRAMMY_PORT = 3001
OLLAMA_GPU = "http://localhost:11436"
OLLAMA_CPU = "http://localhost:11435"
QDRANT = "http://localhost:6333"
SEARXNG = "http://localhost:11437"
COMPOSE_FILE = "/home/alvis/agap_git/adolf/docker-compose.yml"
DEFAULT_CHAT_ID = "346967270"
NAMES = [
"Maximilian", "Cornelius", "Zephyr", "Archibald", "Balthazar",
"Ignatius", "Lysander", "Octavian", "Reginald", "Sylvester",
]
# ── benchmark questions ───────────────────────────────────────────────────────
BENCHMARK = {
"easy": [
"hi",
"what is 2+2?",
"what is the capital of France?",
"tell me a short joke",
"how are you doing today?",
"thanks!",
"what day comes after Wednesday?",
"name the three primary colors",
"is the sky blue?",
"what does CPU stand for?",
],
"medium": [
"what is the current weather in Berlin?",
"find the latest news about artificial intelligence",
"what is the current price of Bitcoin?",
"search for a good pasta carbonara recipe",
"what movies are in theaters this week?",
"find Python tutorials for beginners",
"who won the last FIFA World Cup?",
"do you remember what we talked about before?",
"search for the best coffee shops in Tokyo",
"what is happening in the tech industry this week?",
],
"hard": [
"/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
"/think research the history of artificial intelligence and create a timeline of key milestones",
"/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
"/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
"/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
"/think research quantum computing: explain the key concepts and how it differs from classical computing",
"/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
"/think create a comprehensive Docker deployment guide covering best practices for production",
"/think research climate change: summarize the latest IPCC findings and key data points",
"/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
],
}
PASS = "\033[32mPASS\033[0m"
FAIL = "\033[31mFAIL\033[0m"
INFO = "\033[36mINFO\033[0m"
WARN = "\033[33mWARN\033[0m"
results = []
timings = {} # label → float seconds | None
# ── helpers ───────────────────────────────────────────────────────────────────
def report(name, ok, detail=""):
tag = PASS if ok else FAIL
print(f" [{tag}] {name}" + (f"{detail}" if detail else ""))
results.append((name, ok))
def tf(v):
"""Format timing value."""
return f"{v:6.2f}s" if v is not None else " n/a"
def get(url, timeout=5):
with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout) as r:
return r.status, r.read().decode()
def post_json(url, payload, timeout=10):
data = json.dumps(payload).encode()
req = urllib.request.Request(url, data=data,
headers={"Content-Type": "application/json"},
method="POST")
with urllib.request.urlopen(req, timeout=timeout) as r:
return r.status, json.loads(r.read().decode())
def check_sse(host, port, path):
try:
conn = http.client.HTTPConnection(host, port, timeout=5)
conn.request("GET", path, headers={"Accept": "text/event-stream"})
r = conn.getresponse()
conn.close()
return r.status == 200, f"HTTP {r.status}"
except Exception as e:
return False, str(e)
def qdrant_count():
try:
_, body = get(f"{QDRANT}/collections/adolf_memories")
return json.loads(body).get("result", {}).get("points_count", 0)
except Exception:
return 0
def fetch_logs(since_s=600):
"""Return deepagents log lines from the last since_s seconds."""
try:
r = subprocess.run(
["docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
f"--since={int(since_s)}s", "--no-log-prefix"],
capture_output=True, text=True, timeout=15,
)
return r.stdout.splitlines()
except Exception:
return []
def parse_run_block(lines, msg_prefix):
"""
Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
Extracts reply timing, tier, and memory timing from that block.
Returns dict or None if the reply has not appeared in logs yet.
Dict keys:
reply_total, llm, send, tier, reply_text — from "[agent] replied in ..."
memory_s — from "[memory] stored in ..."
memory_error — True if "[memory] error" found
"""
search = msg_prefix[:50]
start_idx = None
for i, line in enumerate(lines):
if "[agent] running:" in line and search in line:
start_idx = i # keep updating — we want the LAST occurrence
if start_idx is None:
return None
block = lines[start_idx:]
last_ai_text = None
reply_data = None
for j, line in enumerate(block):
# Track last non-tool AIMessage (the final reply)
if "AIMessage:" in line and "" not in line:
txt = line.split("AIMessage:", 1)[-1].strip()
if txt:
last_ai_text = txt
# For light tier: router reply is stored in _conversation_buffers directly
# so there may be no AIMessage log — grab from tier=light line
if "[agent] tier=light" in line and "message=" in line:
# Extract preview text logged elsewhere if available
pass
m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
if m:
# Extract optional tier tag at end of line
tier_m = re.search(r"\btier=(\w+)", line)
tier = tier_m.group(1) if tier_m else "unknown"
reply_data = {
"reply_total": float(m.group(1)),
"llm": float(m.group(2)),
"send": float(m.group(3)),
"tier": tier,
"reply_text": last_ai_text,
"memory_s": None,
"memory_error": False,
"_j": j,
}
break
if reply_data is None:
return None # reply not in logs yet
# Memory line can appear after the next "[agent] running:" — no stop condition
for line in block[reply_data["_j"] + 1:]:
mm = re.search(r"\[memory\] stored in ([\d.]+)s", line)
if mm:
reply_data["memory_s"] = float(mm.group(1))
break
if "[memory] error" in line:
reply_data["memory_error"] = True
break
return reply_data
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
"""
Poll deepagents logs until the message is fully processed.
Shows a live progress line.
Returns timing dict or None on timeout.
"""
t_start = time.monotonic()
deadline = t_start + timeout_s
tick = 0
last_result = None
while time.monotonic() < deadline:
# Window grows with elapsed time — never miss a line that appeared late
since = int(time.monotonic() - t_start) + 90
lines = fetch_logs(since_s=since)
result = parse_run_block(lines, msg_prefix)
if result:
last_result = result
has_mem = result["memory_s"] is not None or result["memory_error"]
if (not need_memory) or has_mem:
elapsed = time.monotonic() - t_start
print(f"\r [{label}] done after {elapsed:.0f}s{' ' * 30}")
return result
time.sleep(4)
tick += 1
rem = int(deadline - time.monotonic())
if last_result:
phase = "waiting for memory..." if need_memory else "done"
else:
phase = "waiting for LLM reply..."
print(f"\r [{label}] {tick*4}s elapsed, {rem}s left — {phase} ", end="", flush=True)
print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
return None
# ── args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="Adolf pipeline test")
parser.add_argument("--chat-id", default=DEFAULT_CHAT_ID)
parser.add_argument("--bench-only", action="store_true",
help="Skip sections 1-9, run sections 10+11 (both benchmarks)")
parser.add_argument("--easy-only", action="store_true",
help="Skip sections 1-9 and 11, run only section 10 (easy benchmark)")
parser.add_argument("--medium-only", action="store_true",
help="Skip sections 1-9 and 10, run only section 11 (medium benchmark)")
parser.add_argument("--hard-only", action="store_true",
help="Skip sections 1-9 and 10+11, run only section 12 (hard benchmark)")
parser.add_argument("--no-bench", action="store_true",
help="Skip sections 10-12 (all benchmarks)")
args = parser.parse_args()
CHAT_ID = args.chat_id
# Derived flags for readability
_skip_pipeline = args.bench_only or args.easy_only or args.medium_only or args.hard_only
_run_easy = not args.no_bench and not args.medium_only and not args.hard_only
_run_medium = not args.no_bench and not args.easy_only and not args.hard_only
_run_hard = not args.no_bench and not args.easy_only and not args.medium_only
random_name = random.choice(NAMES)
if not _skip_pipeline:
print(f"\n Test name : \033[1m{random_name}\033[0m")
print(f" Chat ID : {CHAT_ID}")
# ── 1. service health ─────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 1. Service health")
t0 = time.monotonic()
try:
status, body = get(f"{DEEPAGENTS}/health")
data = json.loads(body)
ok = status == 200 and data.get("agent_ready") is True
report("deepagents /health — agent_ready", ok, f"agent_ready={data.get('agent_ready')}")
except Exception as e:
report("deepagents /health", False, str(e))
ok, detail = check_sse("localhost", 8765, "/sse")
report("openmemory /sse reachable", ok, detail)
ok, detail = check_sse(GRAMMY_HOST, GRAMMY_PORT, "/sse")
report("grammy /sse reachable", ok, detail)
timings["health_check"] = time.monotonic() - t0
# ── 2. GPU Ollama ─────────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 2. GPU Ollama (port 11436)")
t0 = time.monotonic()
try:
status, body = get(f"{OLLAMA_GPU}/api/tags")
models = [m["name"] for m in json.loads(body).get("models", [])]
has_qwen = any("qwen3" in m for m in models)
report("GPU Ollama reachable", True, f"models: {models}")
report("qwen3:8b present", has_qwen)
except Exception as e:
report("GPU Ollama reachable", False, str(e))
report("qwen3:8b present", False, "skipped")
timings["gpu_ollama_ping"] = time.monotonic() - t0
# ── 3. CPU Ollama ─────────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 3. CPU Ollama (port 11435)")
t0 = time.monotonic()
try:
status, body = get(f"{OLLAMA_CPU}/api/tags")
models = [m["name"] for m in json.loads(body).get("models", [])]
has_embed = any("nomic-embed-text" in m for m in models)
report("CPU Ollama reachable", True, f"models: {models}")
report("nomic-embed-text present", has_embed)
except Exception as e:
report("CPU Ollama reachable", False, str(e))
report("nomic-embed-text present", False, "skipped")
timings["cpu_ollama_ping"] = time.monotonic() - t0
# ── 4. Qdrant ─────────────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 4. Qdrant (port 6333)")
t0 = time.monotonic()
try:
status, body = get(f"{QDRANT}/collections")
cols = [c["name"] for c in json.loads(body).get("result", {}).get("collections", [])]
report("Qdrant reachable", True, f"collections: {cols}")
report("adolf_memories collection exists", "adolf_memories" in cols)
except Exception as e:
report("Qdrant reachable", False, str(e))
report("adolf_memories collection exists", False, "skipped")
try:
status, body = get(f"{QDRANT}/collections/adolf_memories")
info = json.loads(body).get("result", {})
dims = info.get("config", {}).get("params", {}).get("vectors", {}).get("size")
report("vector dims = 768", dims == 768, f"got {dims}")
except Exception as e:
report("adolf_memories collection info", False, str(e))
timings["qdrant_ping"] = time.monotonic() - t0
# ── 5. SearXNG ────────────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 5. SearXNG (port 11437)")
t0 = time.monotonic()
try:
status, body = get(f"{SEARXNG}/search?q=test&format=json", timeout=15)
elapsed = time.monotonic() - t0
n = len(json.loads(body).get("results", []))
report("SearXNG reachable + JSON results", status == 200 and n > 0, f"{n} results in {elapsed:.1f}s")
report("SearXNG response < 5s", elapsed < 5, f"{elapsed:.2f}s")
timings["searxng_latency"] = elapsed
except Exception as e:
report("SearXNG reachable", False, str(e))
report("SearXNG response < 5s", False, "skipped")
timings["searxng_latency"] = None
timings["searxng_check"] = time.monotonic() - t0
# ── 68. Name memory pipeline ─────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 68. Name memory pipeline")
print(f" chat_id={CHAT_ID} name={random_name}")
store_msg = f"remember that your name is {random_name}"
recall_msg = "what is your name?"
pts_before = qdrant_count()
print(f" Qdrant points before: {pts_before}")
# ── 6. Send store message ─────────────────────────────────────────────────────
print(f"\n [store] '{store_msg}'")
t_store = time.monotonic()
try:
status, _ = post_json(f"{DEEPAGENTS}/chat",
{"message": store_msg, "chat_id": CHAT_ID}, timeout=5)
t_accept = time.monotonic() - t_store
report("POST /chat (store) returns 202 immediately",
status == 202 and t_accept < 1, f"status={status}, t={t_accept:.3f}s")
timings["store_http_accept"] = t_accept
except Exception as e:
report("POST /chat (store)", False, str(e))
sys.exit(1)
store = wait_for("store", store_msg, timeout_s=220, need_memory=True)
if store:
timings["store_llm"] = store["llm"]
timings["store_send"] = store["send"]
timings["store_reply"] = store["reply_total"]
timings["store_memory"] = store["memory_s"]
report("Agent replied to store message", True,
f"{store['reply_total']:.1f}s total llm={store['llm']:.1f}s send={store['send']:.1f}s tier={store['tier']}")
if store["memory_s"] is not None:
report("Memory stored without error", True, f"{store['memory_s']:.1f}s")
elif store["memory_error"]:
report("Memory stored without error", False, "error in [memory] log")
else:
report("Memory stored without error", False, "not found in logs (still running?)")
print(f" Store reply: {store['reply_text']!r}")
else:
report("Agent replied to store message", False, "timeout")
report("Memory stored without error", False, "timeout")
sys.exit(1)
# ── 7. Verify Qdrant ──────────────────────────────────────────────────────────
pts_after = qdrant_count()
new_pts = pts_after - pts_before
report("New memory point(s) added to Qdrant", new_pts > 0,
f"{pts_before}{pts_after} (+{new_pts})")
timings["qdrant_new_points"] = new_pts
# ── 8. Send recall message ────────────────────────────────────────────────────
print(f"\n [recall] '{recall_msg}'")
t_recall = time.monotonic()
try:
status, _ = post_json(f"{DEEPAGENTS}/chat",
{"message": recall_msg, "chat_id": CHAT_ID}, timeout=5)
t_accept2 = time.monotonic() - t_recall
report("POST /chat (recall) returns 202 immediately",
status == 202 and t_accept2 < 1, f"status={status}, t={t_accept2:.3f}s")
timings["recall_http_accept"] = t_accept2
except Exception as e:
report("POST /chat (recall)", False, str(e))
recall = wait_for("recall", recall_msg, timeout_s=160, need_memory=False)
if recall:
timings["recall_llm"] = recall["llm"]
timings["recall_send"] = recall["send"]
timings["recall_reply"] = recall["reply_total"]
report("Agent replied to recall message", True,
f"{recall['reply_total']:.1f}s total llm={recall['llm']:.1f}s send={recall['send']:.1f}s tier={recall['tier']}")
reply_text = recall["reply_text"] or ""
name_in_reply = random_name.lower() in reply_text.lower()
report(f"Reply contains '{random_name}'", name_in_reply,
f"reply: {reply_text[:120]!r}")
else:
report("Agent replied to recall message", False, "timeout")
report(f"Reply contains '{random_name}'", False, "no reply")
# ── 9. Timing profile ─────────────────────────────────────────────────────────
if not _skip_pipeline:
print(f"\n[{INFO}] 9. Timing profile")
W = 36
print(f"\n {'Stage':<{W}} {'Time':>8}")
print(f" {''*W} {''*8}")
rows_store = [
("[GPU] HTTP accept — store turn", "store_http_accept"),
("[GPU] qwen3:Xb inference — store turn","store_llm"),
("[GPU] Telegram send — store turn", "store_send"),
("[GPU] Total reply latency — store", "store_reply"),
("[GPU] qwen2.5:1.5b+embed — async mem", "store_memory"),
]
rows_recall = [
("[GPU] HTTP accept — recall turn", "recall_http_accept"),
("[GPU] qwen3:Xb inference — recall", "recall_llm"),
("[GPU] Telegram send — recall turn", "recall_send"),
("[GPU] Total reply latency — recall", "recall_reply"),
]
for label, key in rows_store:
v = timings.get(key)
print(f" {label:<{W}} {tf(v):>8}")
print(f" {''*W} {''*8}")
for label, key in rows_recall:
v = timings.get(key)
print(f" {label:<{W}} {tf(v):>8}")
# Bottleneck bar chart
print(f"\n Bottleneck analysis (each █ ≈ 5s):")
print(f" {''*(W+12)}")
candidates = [
("[GPU] qwen3:Xb — store reply ", timings.get("store_llm") or 0),
("[GPU] qwen3:Xb — recall reply", timings.get("recall_llm") or 0),
("[GPU] qwen2.5:1.5b+embed (async)", timings.get("store_memory") or 0),
("[net] SearXNG ", timings.get("searxng_latency") or 0),
]
candidates.sort(key=lambda x: x[1], reverse=True)
for label, t in candidates:
bar = "" * min(int(t / 5), 24)
pct = ""
total_pipeline = (timings.get("store_reply") or 0) + (timings.get("store_memory") or 0)
if total_pipeline > 0:
pct = f" {t/total_pipeline*100:4.0f}%"
print(f" {label} {t:6.1f}s {bar}{pct}")
print()
# ── 10. Tier routing benchmark — easy questions → light path ──────────────────
if _run_easy:
print(f"\n[{INFO}] 10. Tier routing benchmark")
print(f" Sending {len(BENCHMARK['easy'])} easy questions — all must route to 'light'")
print(f" Chat ID: {CHAT_ID}")
print()
bench_results = [] # list of (question, tier, latency_s, ok)
LIGHT_TIMEOUT = 60 # seconds — light is fast but may queue behind prior messages
for i, question in enumerate(BENCHMARK["easy"], 1):
tag = f"easy-{i:02d}"
short_q = question[:55]
print(f" [{tag}] {short_q!r}")
# Send
t_send = time.monotonic()
try:
status, _ = post_json(f"{DEEPAGENTS}/chat",
{"message": question, "chat_id": CHAT_ID}, timeout=5)
if status != 202:
print(f" → [{FAIL}] POST returned {status}")
bench_results.append((question, "?", None, False))
continue
except Exception as e:
print(f" → [{FAIL}] POST error: {e}")
bench_results.append((question, "?", None, False))
continue
# Poll for reply
t_start = time.monotonic()
found = None
while time.monotonic() - t_start < LIGHT_TIMEOUT:
since = int(time.monotonic() - t_start) + 30
lines = fetch_logs(since_s=since)
found = parse_run_block(lines, question)
if found:
break
time.sleep(1)
elapsed = time.monotonic() - t_send
if not found:
print(f" → [{FAIL}] no reply within {LIGHT_TIMEOUT}s")
bench_results.append((question, "timeout", None, False))
continue
tier = found.get("tier", "unknown")
is_light = (tier == "light")
tag_str = PASS if is_light else FAIL
print(f" → [{tag_str}] tier={tier} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
bench_results.append((question, tier, found["reply_total"], is_light))
# Brief pause between questions to keep logs clean
time.sleep(1)
# Summary table
print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
print(f" {''*4} {''*8} {''*8} {''*50}")
for idx, (q, tier, lat, ok) in enumerate(bench_results, 1):
lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
ok_str = "" if ok else ""
print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:50]!r}")
light_count = sum(1 for _, _, _, ok in bench_results if ok)
total_bench = len(bench_results)
lats = [lat for _, _, lat, ok in bench_results if ok and lat is not None]
avg_lat = sum(lats) / len(lats) if lats else 0
print(f"\n Light-path score: {light_count}/{total_bench}")
if lats:
print(f" Avg latency (light): {avg_lat:.1f}s "
f"min={min(lats):.1f}s max={max(lats):.1f}s")
report(f"All easy questions routed to light ({light_count}/{total_bench})",
light_count == total_bench,
f"{light_count}/{total_bench} via light path, avg {avg_lat:.1f}s")
# ── 11. Medium benchmark — medium questions → medium or light, never complex ──
if _run_medium:
print(f"\n[{INFO}] 11. Medium routing benchmark")
print(f" Sending {len(BENCHMARK['medium'])} medium questions")
print(f" Expected: tier=medium (needs tools). Light is acceptable for factual questions.")
print(f" Fail condition: tier=complex or timeout.")
print(f" Chat ID: {CHAT_ID}")
print()
# Questions where light is a valid alternative (model may know from training data)
LIGHT_ACCEPTABLE = {
"who won the last FIFA World Cup?",
"search for a good pasta carbonara recipe",
"find Python tutorials for beginners",
"search for the best coffee shops in Tokyo",
}
med_results = [] # list of (question, tier, latency_s, correct)
MEDIUM_TIMEOUT = 120 # seconds — medium takes 20-100s, allow for queue buildup
for i, question in enumerate(BENCHMARK["medium"], 1):
tag = f"med-{i:02d}"
short_q = question[:60]
print(f" [{tag}] {short_q!r}")
# Send
t_send = time.monotonic()
try:
status, _ = post_json(f"{DEEPAGENTS}/chat",
{"message": question, "chat_id": CHAT_ID}, timeout=5)
if status != 202:
print(f" → [{FAIL}] POST returned {status}")
med_results.append((question, "?", None, False))
continue
except Exception as e:
print(f" → [{FAIL}] POST error: {e}")
med_results.append((question, "?", None, False))
continue
# Poll for reply
t_start = time.monotonic()
found = None
while time.monotonic() - t_start < MEDIUM_TIMEOUT:
since = int(time.monotonic() - t_start) + 60
lines = fetch_logs(since_s=since)
found = parse_run_block(lines, question)
if found:
break
time.sleep(3)
elapsed = time.monotonic() - t_send
if not found:
print(f" → [{FAIL}] no reply within {MEDIUM_TIMEOUT}s")
med_results.append((question, "timeout", None, False))
continue
tier = found.get("tier", "unknown")
light_ok = question in LIGHT_ACCEPTABLE
if tier == "medium":
correct = True
label = PASS
note = "medium ✓"
elif tier == "light":
correct = light_ok # light is only acceptable for certain questions
label = WARN if not light_ok else PASS
note = "light (acceptable)" if light_ok else "light (should be medium)"
elif tier == "complex":
correct = False
label = FAIL
note = "complex — wrong escalation"
else:
correct = False
label = FAIL
note = f"unknown tier {tier!r}"
print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s")
med_results.append((question, tier, found["reply_total"], correct))
# Brief pause between questions
time.sleep(1)
# Summary table
print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question'}")
print(f" {''*4} {''*8} {''*8} {''*55}")
for idx, (q, tier, lat, ok) in enumerate(med_results, 1):
lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
ok_str = "" if ok else ("~" if tier == "light" else "")
print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {q[:55]!r}")
total_med = len(med_results)
medium_count = sum(1 for _, tier, _, _ in med_results if tier == "medium")
light_count = sum(1 for _, tier, _, _ in med_results if tier == "light")
complex_count = sum(1 for _, tier, _, _ in med_results if tier == "complex")
timeout_count = sum(1 for _, tier, _, _ in med_results if tier == "timeout")
light_misroute = sum(
1 for q, tier, _, _ in med_results
if tier == "light" and q not in LIGHT_ACCEPTABLE
)
lats = [lat for _, _, lat, _ in med_results if lat is not None]
correct_count = medium_count + (light_count - light_misroute)
print(f"\n Breakdown: medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}")
if light_misroute:
print(f" [{WARN}] {light_misroute} question(s) answered via light when medium expected (check reply quality)")
if lats:
print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
no_complex = complex_count == 0
no_timeout = timeout_count == 0
all_ok = no_complex and no_timeout
report(
f"Medium questions: no complex escalation ({medium_count + light_count}/{total_med} routed)",
no_complex,
f"medium={medium_count} light={light_count} complex={complex_count} timeout={timeout_count}",
)
if not no_timeout:
report(
f"Medium questions: all completed within {MEDIUM_TIMEOUT}s",
False,
f"{timeout_count} question(s) timed out (increase MEDIUM_TIMEOUT or check agent logs)",
)
# ── 12. Hard benchmark — /think questions → complex tier + VRAM flush verified ─
if _run_hard:
print(f"\n[{INFO}] 12. Hard routing benchmark")
print(f" Sending {len(BENCHMARK['hard'])} /think questions — all must route to 'complex'")
print(f" Verifies: /think prefix → force_complex=True → VRAM flush → qwen3:8b inference")
print(f" Acceptable fallback: 'medium' if VRAM eviction timed out (logged warning)")
print(f" Fail condition: tier=light or timeout")
print(f" Chat ID: {CHAT_ID}")
print()
hard_results = [] # list of (question, tier, latency_s, ok)
COMPLEX_TIMEOUT = 300 # seconds — complex takes 60-180s + VRAM flush overhead
# Log markers we expect to see for complex path
_VRAM_ENTER = "[vram] enter_complex_mode"
_VRAM_EXIT = "[vram] exit_complex_mode"
for i, question in enumerate(BENCHMARK["hard"], 1):
tag = f"hard-{i:02d}"
# Strip /think prefix for display
short_q = question[len("/think "):].strip()[:60]
print(f" [{tag}] /think {short_q!r}")
# Snapshot log window start time
t_send = time.monotonic()
try:
status, _ = post_json(f"{DEEPAGENTS}/chat",
{"message": question, "chat_id": CHAT_ID}, timeout=5)
if status != 202:
print(f" → [{FAIL}] POST returned {status}")
hard_results.append((question, "?", None, False))
continue
except Exception as e:
print(f" → [{FAIL}] POST error: {e}")
hard_results.append((question, "?", None, False))
continue
# Poll for reply
t_start = time.monotonic()
found = None
while time.monotonic() - t_start < COMPLEX_TIMEOUT:
since = int(time.monotonic() - t_start) + 90
lines = fetch_logs(since_s=since)
found = parse_run_block(lines, question[len("/think "):].strip())
if found:
break
time.sleep(5)
elapsed = time.monotonic() - t_send
if not found:
print(f" → [{FAIL}] no reply within {COMPLEX_TIMEOUT}s")
hard_results.append((question, "timeout", None, False))
continue
tier = found.get("tier", "unknown")
if tier == "complex":
ok = True
label = PASS
note = "complex ✓"
elif tier == "medium":
# Acceptable fallback if VRAM eviction timed out
ok = True
label = WARN
note = "medium (VRAM fallback — check [vram] logs)"
else:
ok = False
label = FAIL
note = f"tier={tier} — unexpected"
# Check if VRAM enter/exit were logged for this block
lines_block = fetch_logs(since_s=int(elapsed) + 120)
msg_key = question[len("/think "):].strip()[:40]
vram_enter_seen = any(_VRAM_ENTER in ln for ln in lines_block
if msg_key in ln or
any(msg_key[:15] in prev_ln
for prev_ln in lines_block[max(0, lines_block.index(ln)-10):lines_block.index(ln)]))
# Simpler: just check the recent log window for enter/exit markers
recent = "\n".join(lines_block[-200:])
vram_enter_seen = _VRAM_ENTER in recent
vram_exit_seen = _VRAM_EXIT in recent
vram_note = ""
if tier == "complex":
if vram_enter_seen:
vram_note = " [vram:flush✓]"
else:
vram_note = f" [{WARN}:no vram flush log]"
print(f" → [{label}] {note} latency={found['reply_total']:.1f}s llm={found['llm']:.1f}s{vram_note}")
hard_results.append((question, tier, found["reply_total"], ok))
# Pause to let exit_complex_mode background task complete before next question
# (flushes qwen3:8b and pre-warms 4b+router — avoids VRAM conflict on next enter)
time.sleep(5)
# Summary table
print(f"\n {'#':<4} {'Tier':<8} {'Latency':>8} {'Question (/think ...)'}")
print(f" {''*4} {''*8} {''*8} {''*55}")
for idx, (q, tier, lat, ok) in enumerate(hard_results, 1):
lat_str = f"{lat:.1f}s" if lat is not None else "timeout"
ok_str = "" if tier == "complex" else ("~" if tier == "medium" else "")
short = q[len("/think "):].strip()[:55]
print(f" {ok_str} {idx:<3} {tier:<8} {lat_str:>8} {short!r}")
total_hard = len(hard_results)
complex_count = sum(1 for _, t, _, _ in hard_results if t == "complex")
medium_fb = sum(1 for _, t, _, _ in hard_results if t == "medium")
light_count = sum(1 for _, t, _, _ in hard_results if t == "light")
timeout_count = sum(1 for _, t, _, _ in hard_results if t == "timeout")
lats = [lat for _, _, lat, _ in hard_results if lat is not None]
print(f"\n Breakdown: complex={complex_count} medium(fallback)={medium_fb} light={light_count} timeout={timeout_count}")
if medium_fb:
print(f" [{WARN}] {medium_fb} question(s) fell back to medium (VRAM eviction timeout)")
if light_count:
print(f" [{FAIL}] {light_count} question(s) routed to light — /think prefix not detected")
if lats:
print(f" Avg latency: {sum(lats)/len(lats):.1f}s min={min(lats):.1f}s max={max(lats):.1f}s")
no_light = light_count == 0
no_timeout = timeout_count == 0
report(
f"Hard questions routed to complex (not light) ({complex_count + medium_fb}/{total_hard})",
no_light and no_timeout,
f"complex={complex_count} medium_fallback={medium_fb} light={light_count} timeout={timeout_count}",
)
# ── summary ───────────────────────────────────────────────────────────────────
print(f"\n{''*55}")
total = len(results)
passed = sum(1 for _, ok in results if ok)
failed = total - passed
print(f"Results: {passed}/{total} passed", end="")
if failed:
print(f" ({failed} failed)\n")
print("Failed checks:")
for name, ok in results:
if not ok:
print(f" - {name}")
else:
print(" — all good")
print()
# Print benchmark reference
print(f"[{INFO}] Benchmark questions reference:")
for tier_name, questions in BENCHMARK.items():
print(f"\n {tier_name.upper()} ({len(questions)} questions):")
for j, q in enumerate(questions, 1):
print(f" {j:2d}. {q}")
print()