Compare commits

13 Commits

Author SHA1 Message Date
fc53632c7b Merge pull request 'feat: rename dry_run to no_inference for all tiers' (#17) from worktree-agent-afc013ce into main
Reviewed-on: #17
2026-03-24 07:27:04 +00:00
47a1166be6 Merge pull request 'feat: rename --dry-run to --no-inference in run_benchmark.py' (#18) from feat/no-inference-benchmark into main
Reviewed-on: #18
2026-03-24 07:26:44 +00:00
74e5b1758d Merge pull request 'feat: add run_routing_benchmark.py — routing-only benchmark' (#19) from feat/routing-benchmark into main
Reviewed-on: #19
2026-03-24 07:26:31 +00:00
0fbdbf3a5e Add run_routing_benchmark.py — dedicated routing-only benchmark
Tests routing accuracy for all tiers with no_inference=True hardcoded.
Fast (QUERY_TIMEOUT=30s), no GPU check, shares benchmark.json dataset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:25:16 +00:00
77db739819 Rename --dry-run to --no-inference, apply to all tiers in run_benchmark.py
No-inference mode now skips LLM for all tiers (not just complex),
GPU check is auto-skipped, and the metadata key matches agent.py.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 03:49:09 +00:00
9c2f27eed4 Rename dry_run → no_inference, extend to all tiers in agent.py
When no_inference=True, routing decision is captured but all LLM
inference is skipped — yields constant "I don't know" immediately.
Also disables fast-tool short-circuit so routing path always runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 03:43:42 +00:00
a363347ae5 Merge pull request 'Fix routing: add Russian tech def patterns to light, strengthen medium smart home' (#13) from fix/routing-accuracy into main
Reviewed-on: #13
2026-03-24 02:51:17 +00:00
1d2787766e Merge pull request 'Remove Bifrost: replace test 4 with LiteLLM health check' (#14) from fix/remove-bifrost into main
Reviewed-on: #14
2026-03-24 02:48:40 +00:00
abf792a2ec Remove Bifrost: replace test 4 with LiteLLM health check
- Remove BIFROST constant and fetch_bifrost_logs() from common.py
- Add LITELLM constant (localhost:4000)
- Replace test_memory.py test 4 (Bifrost pass-through) with LiteLLM health check

Fixes #5

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 02:46:01 +00:00
537e927146 Fix routing: add Russian tech def patterns to light, strengthen medium smart home
- _LIGHT_PATTERNS: add что\s+такое, что\s+означает, сколько бит/байт,
  compound greetings (привет, как дела) — these fell through to embedding
  which sometimes misclassified short Russian phrases as medium
- _MEDIUM_PATTERNS: add non-verb-first smart home patterns (свет/лампочка
  as subject, режим/сцена commands) for benchmark queries with different phrasing

Fixes #8, #9

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 02:45:42 +00:00
186e16284b Merge pull request 'Fix tier logging: capture actual_tier, fix parse_run_block regex, remove reply_text truncation' (#11) from fix/tier-logging into main
Reviewed-on: #11
2026-03-24 02:44:35 +00:00
0b428e4ada Merge pull request 'Fix benchmark log extraction: first tier match, increase log tail to 300' (#12) from fix/benchmark-log-extraction into main
Reviewed-on: #12
2026-03-24 02:43:26 +00:00
8ef4897869 Fix tier logging: capture actual_tier, fix parse_run_block regex, remove reply_text truncation
- Add tier_capture param to _run_agent_pipeline; append tier after determination
- Capture actual_tier in run_agent_task from tier_capture list
- Log tier in replied-in line: [agent] replied in Xs tier=Y
- Remove reply_text[:200] truncation (was breaking benchmark keyword matching)
- Update parse_run_block regex to match new log format; llm/send fields now None

Fixes #1, #3, #4

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 02:41:59 +00:00
6 changed files with 273 additions and 62 deletions

View File

@@ -431,12 +431,13 @@ async def _run_agent_pipeline(
history: list[dict], history: list[dict],
session_id: str, session_id: str,
tier_override: str | None = None, tier_override: str | None = None,
dry_run: bool = False, no_inference: bool = False,
tier_capture: list | None = None,
) -> AsyncGenerator[str, None]: ) -> AsyncGenerator[str, None]:
"""Core pipeline: pre-flight → routing → inference. Yields text chunks. """Core pipeline: pre-flight → routing → inference. Yields text chunks.
tier_override: "light" | "medium" | "complex" | None (auto-route) tier_override: "light" | "medium" | "complex" | None (auto-route)
dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost) no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
Caller is responsible for scheduling _store_memory after consuming all chunks. Caller is responsible for scheduling _store_memory after consuming all chunks.
""" """
async with _reply_semaphore: async with _reply_semaphore:
@@ -470,7 +471,7 @@ async def _run_agent_pipeline(
try: try:
# Short-circuit: fast tool already has the answer # Short-circuit: fast tool already has the answer
if fast_context and tier_override is None and not url_context: if fast_context and tier_override is None and not url_context and not no_inference:
tier = "fast" tier = "fast"
final_text = fast_context final_text = fast_context
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
@@ -493,14 +494,13 @@ async def _run_agent_pipeline(
light_reply = None light_reply = None
print("[agent] URL in message → upgraded light→medium", flush=True) print("[agent] URL in message → upgraded light→medium", flush=True)
# Dry-run: log as complex but infer with medium (no remote API call)
effective_tier = tier
if dry_run and tier == "complex":
effective_tier = "medium"
print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True)
else:
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True) print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
tier = effective_tier if tier_capture is not None:
tier_capture.append(tier)
if no_inference:
yield "I don't know"
return
if tier == "light": if tier == "light":
final_text = light_reply final_text = light_reply
@@ -591,16 +591,15 @@ async def run_agent_task(
t0 = time.monotonic() t0 = time.monotonic()
meta = metadata or {} meta = metadata or {}
dry_run = bool(meta.get("dry_run", False)) no_inference = bool(meta.get("no_inference", False))
is_benchmark = bool(meta.get("benchmark", False)) is_benchmark = bool(meta.get("benchmark", False))
history = _conversation_buffers.get(session_id, []) history = _conversation_buffers.get(session_id, [])
final_text = None final_text = None
actual_tier = "unknown" actual_tier = "unknown"
tier_capture: list = []
# Patch pipeline to capture tier for logging async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture):
# We read it from logs post-hoc; capture via a wrapper
async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run):
await _push_stream_chunk(session_id, chunk) await _push_stream_chunk(session_id, chunk)
if final_text is None: if final_text is None:
final_text = chunk final_text = chunk
@@ -608,6 +607,7 @@ async def run_agent_task(
final_text += chunk final_text += chunk
await _end_stream(session_id) await _end_stream(session_id)
actual_tier = tier_capture[0] if tier_capture else "unknown"
elapsed_ms = int((time.monotonic() - t0) * 1000) elapsed_ms = int((time.monotonic() - t0) * 1000)
@@ -621,8 +621,8 @@ async def run_agent_task(
except Exception as e: except Exception as e:
print(f"[agent] delivery error (non-fatal): {e}", flush=True) print(f"[agent] delivery error (non-fatal): {e}", flush=True)
print(f"[agent] replied in {elapsed_ms / 1000:.1f}s", flush=True) print(f"[agent] replied in {elapsed_ms / 1000:.1f}s tier={actual_tier}", flush=True)
print(f"[agent] reply_text: {final_text[:200]}", flush=True) print(f"[agent] reply_text: {final_text}", flush=True)
# Update conversation buffer # Update conversation buffer
buf = _conversation_buffers.get(session_id, []) buf = _conversation_buffers.get(session_id, [])

View File

@@ -11,7 +11,7 @@ Usage:
python3 run_benchmark.py --category <name> python3 run_benchmark.py --category <name>
python3 run_benchmark.py --ids 1,2,3 python3 run_benchmark.py --ids 1,2,3
python3 run_benchmark.py --list-categories python3 run_benchmark.py --list-categories
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost) python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers
IMPORTANT: Always check GPU is free before running. This script does it automatically. IMPORTANT: Always check GPU is free before running. This script does it automatically.
@@ -121,10 +121,10 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
before_lines = set(logs_before.splitlines()) before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines] new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
for line in new_lines: for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m: if m:
tier_raw = m.group(1) tier_raw = m.group(1)
# Normalise: "complex (dry-run)" → "complex" # Normalise: "complex (no-inference)" → "complex"
return tier_raw.split()[0] return tier_raw.split()[0]
return None return None
@@ -135,14 +135,14 @@ async def post_message(
client: httpx.AsyncClient, client: httpx.AsyncClient,
query_id: int, query_id: int,
query: str, query: str,
dry_run: bool = False, no_inference: bool = False,
) -> bool: ) -> bool:
payload = { payload = {
"text": query, "text": query,
"session_id": f"benchmark-{query_id}", "session_id": f"benchmark-{query_id}",
"channel": "cli", "channel": "cli",
"user_id": "benchmark", "user_id": "benchmark",
"metadata": {"dry_run": dry_run, "benchmark": True}, "metadata": {"no_inference": no_inference, "benchmark": True},
} }
try: try:
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
# ── Main run ─────────────────────────────────────────────────────────────────── # ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict], dry_run: bool = False) -> list[dict]: async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
results = [] results = []
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
total = len(queries) total = len(queries)
correct = 0 correct = 0
dry_label = " [DRY-RUN: complex→medium]" if dry_run else "" dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
print(f"\nRunning {total} queries{dry_label}\n") print(f"\nRunning {total} queries{dry_label}\n")
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY") print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
print("" * 110) print("" * 110)
@@ -197,8 +197,6 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
category = q["category"] category = q["category"]
query_text = q["query"] query_text = q["query"]
# In dry-run, complex queries still use complex classification (logged), but medium infers
send_dry = dry_run and expected == "complex"
session_id = f"benchmark-{qid}" session_id = f"benchmark-{qid}"
print(f"{qid:>3} {expected:8} ", end="", flush=True) print(f"{qid:>3} {expected:8} ", end="", flush=True)
@@ -206,7 +204,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
logs_before = get_log_tail(300) logs_before = get_log_tail(300)
t0 = time.monotonic() t0 = time.monotonic()
ok_post = await post_message(client, qid, query_text, dry_run=send_dry) ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
if not ok_post: if not ok_post:
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}") print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
results.append({"id": qid, "expected": expected, "actual": None, "ok": False}) results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
"elapsed": round(elapsed, 1), "elapsed": round(elapsed, 1),
"category": category, "category": category,
"query": query_text, "query": query_text,
"dry_run": send_dry, "no_inference": no_inference,
}) })
print("" * 110) print("" * 110)
@@ -281,9 +279,9 @@ def main():
parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--ids", help="Comma-separated IDs")
parser.add_argument("--list-categories", action="store_true") parser.add_argument("--list-categories", action="store_true")
parser.add_argument( parser.add_argument(
"--dry-run", "--no-inference",
action="store_true", action="store_true",
help="For complex queries: route classification is tested but medium model is used for inference (no API cost)", help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
) )
parser.add_argument( parser.add_argument(
"--skip-gpu-check", "--skip-gpu-check",
@@ -302,7 +300,7 @@ def main():
return return
# ALWAYS check GPU and RAM before running # ALWAYS check GPU and RAM before running
if not preflight_checks(skip_gpu_check=args.skip_gpu_check): if not preflight_checks(skip_gpu_check=args.no_inference):
sys.exit(1) sys.exit(1)
ids = [int(i) for i in args.ids.split(",")] if args.ids else None ids = [int(i) for i in args.ids.split(",")] if args.ids else None
@@ -311,7 +309,7 @@ def main():
print("No queries match filters.") print("No queries match filters.")
sys.exit(1) sys.exit(1)
asyncio.run(run(queries, dry_run=args.dry_run)) asyncio.run(run(queries, no_inference=args.no_inference))
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
Adolf routing benchmark — tests routing decisions only, no LLM inference.
Sends each query with no_inference=True, waits for the routing decision to
appear in docker logs, and records whether the correct tier was selected.
Usage:
python3 run_routing_benchmark.py [options]
python3 run_routing_benchmark.py --tier light|medium|complex
python3 run_routing_benchmark.py --category <name>
python3 run_routing_benchmark.py --ids 1,2,3
python3 run_routing_benchmark.py --list-categories
No GPU check needed — inference is disabled for all queries.
Adolf must be running at http://localhost:8000.
"""
import argparse
import asyncio
import json
import re
import subprocess
import sys
import time
from pathlib import Path
import httpx
ADOLF_URL = "http://localhost:8000"  # Adolf HTTP API — must already be running (see module docstring)
DATASET = Path(__file__).parent / "benchmark.json"  # same dataset file run_benchmark.py uses
RESULTS = Path(__file__).parent / "routing_results_latest.json"  # overwritten on every run
QUERY_TIMEOUT = 30  # seconds — routing is fast, no LLM wait
# ── Log helpers ────────────────────────────────────────────────────────────────
def get_log_tail(n: int = 50) -> str:
    """Return the last *n* lines of the ``deepagents`` container logs.

    Both stdout and stderr are concatenated, since docker may emit the
    agent's log lines on either stream.
    """
    proc = subprocess.run(
        ["docker", "logs", "deepagents", "--tail", str(n)],
        capture_output=True,
        text=True,
    )
    return "".join((proc.stdout, proc.stderr))
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
"""Find new tier= lines that appeared after we sent the query."""
before_lines = set(logs_before.splitlines())
new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m:
tier_raw = m.group(1)
return tier_raw.split()[0]
return None
# ── Request helpers ────────────────────────────────────────────────────────────
async def post_message(client: httpx.AsyncClient, query_id: int, query: str) -> bool:
    """POST one benchmark query to Adolf; return True on HTTP success.

    Inference is always disabled (``no_inference=True``) — this benchmark
    only exercises the routing decision.
    """
    body = {
        "text": query,
        "session_id": f"routing-bench-{query_id}",
        "channel": "cli",
        "user_id": "benchmark",
        "metadata": {"no_inference": True, "benchmark": True},
    }
    try:
        resp = await client.post(f"{ADOLF_URL}/message", json=body, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f" POST_ERROR: {e}", end="")
        return False
    return True
# ── Dataset ────────────────────────────────────────────────────────────────────
def load_dataset() -> list[dict]:
    """Read benchmark.json and return its list of query dicts."""
    return json.loads(DATASET.read_text())["queries"]
def filter_queries(queries, tier, category, ids):
    """Apply optional tier / category / id filters; a falsy filter is skipped."""
    if tier:
        queries = [entry for entry in queries if entry["tier"] == tier]
    if category:
        queries = [entry for entry in queries if entry["category"] == category]
    if ids:
        wanted = set(ids)
        queries = [entry for entry in queries if entry["id"] in wanted]
    return queries
# ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict]) -> list[dict]:
    """Run the routing benchmark over *queries* and return per-query results.

    For each query: snapshot the docker log tail, POST the query with
    no_inference=True, drain the SSE stream until [DONE] (or timeout), then
    diff the log tail to find which tier the router chose.  Prints a result
    table and per-tier summary, and saves results to RESULTS.

    Fixes: the OK mark was ``"" if match else ""`` (both branches empty —
    dead conditional) and the table separator was ``"" * 110`` (blank line);
    the ✓/✗/─ glyphs had been stripped by a bad encoding pass and are restored.
    """
    results = []
    async with httpx.AsyncClient() as client:
        # Fail fast if Adolf isn't up — nothing below can succeed without it.
        try:
            r = await client.get(f"{ADOLF_URL}/health", timeout=5)
            r.raise_for_status()
        except Exception as e:
            print(f"ERROR: Adolf not reachable: {e}", file=sys.stderr)
            sys.exit(1)

        total = len(queries)
        correct = 0
        print(f"\nRunning {total} queries [NO-INFERENCE: routing only]\n")
        print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
        print("─" * 110)  # restored separator glyph (was an empty string)

        for q in queries:
            qid = q["id"]
            expected = q["tier"]
            category = q["category"]
            query_text = q["query"]
            session_id = f"routing-bench-{qid}"

            print(f"{qid:>3} {expected:8} ", end="", flush=True)
            logs_before = get_log_tail(300)
            t0 = time.monotonic()

            ok_post = await post_message(client, qid, query_text)
            if not ok_post:
                print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
                results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
                continue

            # Drain the SSE stream so the agent finishes handling the
            # request; a timeout is acceptable — the tier line may already
            # be in the logs by then.
            try:
                async with client.stream(
                    "GET", f"{ADOLF_URL}/stream/{session_id}", timeout=QUERY_TIMEOUT
                ) as sse:
                    async for line in sse.aiter_lines():
                        if "data: [DONE]" in line:
                            break
            except Exception:
                pass  # timeout or connection issue — check logs anyway

            await asyncio.sleep(0.3)  # give docker a moment to flush log output
            logs_after = get_log_tail(300)
            actual = extract_tier_from_logs(logs_before, logs_after)
            elapsed = time.monotonic() - t0

            # "fast" counts as correct for medium queries: the fast-tool
            # short-circuit can answer before routing picks medium.
            match = actual == expected or (actual == "fast" and expected == "medium")
            if match:
                correct += 1
            mark = "✓" if match else "✗"  # restored glyphs (both branches were "")
            actual_str = actual or "?"
            print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}")

            results.append({
                "id": qid,
                "expected": expected,
                "actual": actual_str,
                "ok": match,
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
            })

    print("─" * 110)  # restored separator glyph (was an empty string)
    accuracy = correct / total * 100 if total else 0
    print(f"\nAccuracy: {correct}/{total} ({accuracy:.0f}%)")
    for tier_name in ["light", "medium", "complex"]:
        tier_qs = [r for r in results if r["expected"] == tier_name]
        if tier_qs:
            tier_ok = sum(1 for r in tier_qs if r["ok"])
            print(f"  {tier_name:8}: {tier_ok}/{len(tier_qs)}")

    wrong = [r for r in results if not r["ok"]]
    if wrong:
        print(f"\nMisclassified ({len(wrong)}):")
        for r in wrong:
            print(f"  id={r['id']:3} expected={r['expected']:8} actual={r['actual']:8} {r['query'][:60]}")

    # ensure_ascii=False keeps Russian query text readable in the JSON file.
    with open(RESULTS, "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {RESULTS}")
    return results
def main():
    """CLI entry point: parse filters, load the dataset, run the benchmark."""
    parser = argparse.ArgumentParser(
        description="Adolf routing benchmark — routing decisions only, no LLM inference",
    )
    parser.add_argument("--tier", choices=["light", "medium", "complex"])
    parser.add_argument("--category")
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    args = parser.parse_args()

    queries = load_dataset()

    if args.list_categories:
        # Summary mode: show tier counts and the category list, then stop.
        cats = sorted({q["category"] for q in queries})
        tiers = {t: sum(1 for q in queries if q["tier"] == t) for t in ["light", "medium", "complex"]}
        print(f"Total: {len(queries)} | Tiers: {tiers}")
        print(f"Categories: {cats}")
        return

    ids = [int(i) for i in args.ids.split(",")] if args.ids else None
    queries = filter_queries(queries, args.tier, args.category, ids)
    if not queries:
        print("No queries match filters.")
        sys.exit(1)

    asyncio.run(run(queries))


if __name__ == "__main__":
    main()

View File

@@ -52,6 +52,13 @@ _LIGHT_PATTERNS = re.compile(
r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю" r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю"
r"|пожалуйста|не за что|всё понятно|ясно" r"|пожалуйста|не за что|всё понятно|ясно"
r"|как дела|как ты|как жизнь|всё хорошо|всё ок" r"|как дела|как ты|как жизнь|всё хорошо|всё ок"
# Russian tech definitions — static knowledge (no tools needed)
r"|что\s+такое\s+\S+"
r"|что\s+означает\s+\S+"
r"|сколько\s+(?:бит|байт|байтов|мегабайт|мегабайтов|гигабайт|гигабайтов)(?:\s+\w+)*"
# Compound Russian greetings
r"|привет[,!]?\s+как\s+дела"
r"|добрый\s+(?:день|вечер|утро)[,!]?\s+как\s+дела"
r")[\s!.?]*$", r")[\s!.?]*$",
re.IGNORECASE, re.IGNORECASE,
) )
@@ -314,6 +321,10 @@ _MEDIUM_PATTERNS = re.compile(
r"|курс (?:доллара|биткоина|евро|рубл)" r"|курс (?:доллара|биткоина|евро|рубл)"
r"|(?:последние |свежие )?новости\b" r"|(?:последние |свежие )?новости\b"
r"|(?:погода|температура)\s+(?:на завтра|на неделю)" r"|(?:погода|температура)\s+(?:на завтра|на неделю)"
# Smart home commands that don't use verb-first pattern
r"|(?:свет|лампочк|освещени)\w*\s+(?:включ|выключ|убавь|прибавь)"
r"|(?:дома|в доме|по всему дому)\s+(?:свет|лампочк)"
r"|(?:режим|сцена)\s+(?:ночной|утренний|вечерний|кинотеатр)"
r")", r")",
re.IGNORECASE, re.IGNORECASE,
) )

View File

@@ -11,7 +11,7 @@ import urllib.request
# ── config ──────────────────────────────────────────────────────────────────── # ── config ────────────────────────────────────────────────────────────────────
DEEPAGENTS = "http://localhost:8000" DEEPAGENTS = "http://localhost:8000"
BIFROST = "http://localhost:8080" LITELLM = "http://localhost:4000"
OPENMEMORY = "http://localhost:8765" OPENMEMORY = "http://localhost:8765"
GRAMMY_HOST = "localhost" GRAMMY_HOST = "localhost"
GRAMMY_PORT = 3001 GRAMMY_PORT = 3001
@@ -156,19 +156,6 @@ def fetch_logs(since_s=600):
return [] return []
def fetch_bifrost_logs(since_s=120):
"""Return bifrost container log lines from the last since_s seconds."""
try:
r = subprocess.run(
["docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
f"--since={int(since_s)}s", "--no-log-prefix"],
capture_output=True, text=True, timeout=10,
)
return r.stdout.splitlines()
except Exception:
return []
def parse_run_block(lines, msg_prefix): def parse_run_block(lines, msg_prefix):
""" """
Scan log lines for the LAST '[agent] running: <msg_prefix>' block. Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
@@ -199,14 +186,13 @@ def parse_run_block(lines, msg_prefix):
if txt: if txt:
last_ai_text = txt last_ai_text = txt
m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line) m = re.search(r"replied in ([\d.]+)s(?:\s+tier=(\w+))?", line)
if m: if m:
tier_m = re.search(r"\btier=(\w+)", line) tier = m.group(2) if m.group(2) else "unknown"
tier = tier_m.group(1) if tier_m else "unknown"
reply_data = { reply_data = {
"reply_total": float(m.group(1)), "reply_total": float(m.group(1)),
"llm": float(m.group(2)), "llm": None,
"send": float(m.group(3)), "send": None,
"tier": tier, "tier": tier,
"reply_text": last_ai_text, "reply_text": last_ai_text,
"memory_s": None, "memory_s": None,

View File

@@ -6,7 +6,7 @@ Tests:
1. Name store — POST "remember that your name is <RandomName>" 1. Name store — POST "remember that your name is <RandomName>"
2. Qdrant point — verifies a new vector was written after store 2. Qdrant point — verifies a new vector was written after store
3. Name recall — POST "what is your name?" → reply must contain <RandomName> 3. Name recall — POST "what is your name?" → reply must contain <RandomName>
4. Bifrost — verifies store/recall requests passed through Bifrost 4. LiteLLM — verifies LiteLLM proxy is reachable (replaced Bifrost)
5. Timing profile — breakdown of store and recall latencies 5. Timing profile — breakdown of store and recall latencies
6. Memory benchmark — store 5 personal facts, recall with 10 questions 6. Memory benchmark — store 5 personal facts, recall with 10 questions
7. Dedup test — same fact stored twice must not grow Qdrant by 2 points 7. Dedup test — same fact stored twice must not grow Qdrant by 2 points
@@ -24,11 +24,11 @@ import time
import urllib.request import urllib.request
from common import ( from common import (
DEEPAGENTS, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID, DEEPAGENTS, LITELLM, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID,
NAMES, NAMES,
INFO, PASS, FAIL, WARN, INFO, PASS, FAIL, WARN,
report, print_summary, tf, report, print_summary, tf,
get, post_json, qdrant_count, fetch_logs, fetch_bifrost_logs, get, post_json, qdrant_count, fetch_logs,
parse_run_block, wait_for, parse_run_block, wait_for,
) )
@@ -155,14 +155,13 @@ if _run_name:
report(results, "Agent replied to recall message", False, "timeout") report(results, "Agent replied to recall message", False, "timeout")
report(results, f"Reply contains '{random_name}'", False, "no reply") report(results, f"Reply contains '{random_name}'", False, "no reply")
# ── 4. Bifrost pass-through check ───────────────────────────────────────── # ── 4. LiteLLM proxy reachable (replaced Bifrost) ─────────────────────────
bifrost_lines = fetch_bifrost_logs(since_s=300) try:
report(results, "Bifrost container has log output (requests forwarded)", status, _ = get(f"{LITELLM}/health", timeout=5)
len(bifrost_lines) > 0, f"{len(bifrost_lines)} lines in bifrost logs") litellm_ok = status == 200
bifrost_raw = "\n".join(bifrost_lines) except Exception:
report(results, " Bifrost log shows AsyncOpenAI agent requests", litellm_ok = False
"AsyncOpenAI" in bifrost_raw, report(results, "LiteLLM proxy reachable", litellm_ok)
f"{'found' if 'AsyncOpenAI' in bifrost_raw else 'NOT found'} in bifrost logs")
# ── 5. Timing profile ───────────────────────────────────────────────────── # ── 5. Timing profile ─────────────────────────────────────────────────────
print(f"\n[{INFO}] 5. Timing profile") print(f"\n[{INFO}] 5. Timing profile")