Compare commits
13 Commits
fix/tier-l
...
fc53632c7b
| Author | SHA1 | Date | |
|---|---|---|---|
| fc53632c7b | |||
| 47a1166be6 | |||
| 74e5b1758d | |||
| 0fbdbf3a5e | |||
| 77db739819 | |||
| 9c2f27eed4 | |||
| a363347ae5 | |||
| 1d2787766e | |||
| abf792a2ec | |||
| 537e927146 | |||
| 186e16284b | |||
| 0b428e4ada | |||
| 98095679be |
23
agent.py
23
agent.py
@@ -431,13 +431,13 @@ async def _run_agent_pipeline(
|
|||||||
history: list[dict],
|
history: list[dict],
|
||||||
session_id: str,
|
session_id: str,
|
||||||
tier_override: str | None = None,
|
tier_override: str | None = None,
|
||||||
dry_run: bool = False,
|
no_inference: bool = False,
|
||||||
tier_capture: list | None = None,
|
tier_capture: list | None = None,
|
||||||
) -> AsyncGenerator[str, None]:
|
) -> AsyncGenerator[str, None]:
|
||||||
"""Core pipeline: pre-flight → routing → inference. Yields text chunks.
|
"""Core pipeline: pre-flight → routing → inference. Yields text chunks.
|
||||||
|
|
||||||
tier_override: "light" | "medium" | "complex" | None (auto-route)
|
tier_override: "light" | "medium" | "complex" | None (auto-route)
|
||||||
dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost)
|
no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
|
||||||
Caller is responsible for scheduling _store_memory after consuming all chunks.
|
Caller is responsible for scheduling _store_memory after consuming all chunks.
|
||||||
"""
|
"""
|
||||||
async with _reply_semaphore:
|
async with _reply_semaphore:
|
||||||
@@ -471,7 +471,7 @@ async def _run_agent_pipeline(
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Short-circuit: fast tool already has the answer
|
# Short-circuit: fast tool already has the answer
|
||||||
if fast_context and tier_override is None and not url_context:
|
if fast_context and tier_override is None and not url_context and not no_inference:
|
||||||
tier = "fast"
|
tier = "fast"
|
||||||
final_text = fast_context
|
final_text = fast_context
|
||||||
llm_elapsed = time.monotonic() - t0
|
llm_elapsed = time.monotonic() - t0
|
||||||
@@ -494,17 +494,14 @@ async def _run_agent_pipeline(
|
|||||||
light_reply = None
|
light_reply = None
|
||||||
print("[agent] URL in message → upgraded light→medium", flush=True)
|
print("[agent] URL in message → upgraded light→medium", flush=True)
|
||||||
|
|
||||||
# Dry-run: log as complex but infer with medium (no remote API call)
|
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
|
||||||
effective_tier = tier
|
|
||||||
if dry_run and tier == "complex":
|
|
||||||
effective_tier = "medium"
|
|
||||||
print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True)
|
|
||||||
else:
|
|
||||||
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
|
|
||||||
tier = effective_tier
|
|
||||||
if tier_capture is not None:
|
if tier_capture is not None:
|
||||||
tier_capture.append(tier)
|
tier_capture.append(tier)
|
||||||
|
|
||||||
|
if no_inference:
|
||||||
|
yield "I don't know"
|
||||||
|
return
|
||||||
|
|
||||||
if tier == "light":
|
if tier == "light":
|
||||||
final_text = light_reply
|
final_text = light_reply
|
||||||
llm_elapsed = time.monotonic() - t0
|
llm_elapsed = time.monotonic() - t0
|
||||||
@@ -594,7 +591,7 @@ async def run_agent_task(
|
|||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
meta = metadata or {}
|
meta = metadata or {}
|
||||||
dry_run = bool(meta.get("dry_run", False))
|
no_inference = bool(meta.get("no_inference", False))
|
||||||
is_benchmark = bool(meta.get("benchmark", False))
|
is_benchmark = bool(meta.get("benchmark", False))
|
||||||
|
|
||||||
history = _conversation_buffers.get(session_id, [])
|
history = _conversation_buffers.get(session_id, [])
|
||||||
@@ -602,7 +599,7 @@ async def run_agent_task(
|
|||||||
actual_tier = "unknown"
|
actual_tier = "unknown"
|
||||||
tier_capture: list = []
|
tier_capture: list = []
|
||||||
|
|
||||||
async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run, tier_capture=tier_capture):
|
async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture):
|
||||||
await _push_stream_chunk(session_id, chunk)
|
await _push_stream_chunk(session_id, chunk)
|
||||||
if final_text is None:
|
if final_text is None:
|
||||||
final_text = chunk
|
final_text = chunk
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ Usage:
|
|||||||
python3 run_benchmark.py --category <name>
|
python3 run_benchmark.py --category <name>
|
||||||
python3 run_benchmark.py --ids 1,2,3
|
python3 run_benchmark.py --ids 1,2,3
|
||||||
python3 run_benchmark.py --list-categories
|
python3 run_benchmark.py --list-categories
|
||||||
python3 run_benchmark.py --dry-run # complex queries use medium model (no API cost)
|
python3 run_benchmark.py --no-inference # skip all LLM inference — routing decisions only, all tiers
|
||||||
|
|
||||||
IMPORTANT: Always check GPU is free before running. This script does it automatically.
|
IMPORTANT: Always check GPU is free before running. This script does it automatically.
|
||||||
|
|
||||||
@@ -120,11 +120,11 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
|||||||
"""Find new tier= lines that appeared after we sent the query."""
|
"""Find new tier= lines that appeared after we sent the query."""
|
||||||
before_lines = set(logs_before.splitlines())
|
before_lines = set(logs_before.splitlines())
|
||||||
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
|
||||||
for line in reversed(new_lines):
|
for line in new_lines:
|
||||||
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
|
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
|
||||||
if m:
|
if m:
|
||||||
tier_raw = m.group(1)
|
tier_raw = m.group(1)
|
||||||
# Normalise: "complex (dry-run)" → "complex"
|
# Normalise: "complex (no-inference)" → "complex"
|
||||||
return tier_raw.split()[0]
|
return tier_raw.split()[0]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -135,14 +135,14 @@ async def post_message(
|
|||||||
client: httpx.AsyncClient,
|
client: httpx.AsyncClient,
|
||||||
query_id: int,
|
query_id: int,
|
||||||
query: str,
|
query: str,
|
||||||
dry_run: bool = False,
|
no_inference: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
payload = {
|
payload = {
|
||||||
"text": query,
|
"text": query,
|
||||||
"session_id": f"benchmark-{query_id}",
|
"session_id": f"benchmark-{query_id}",
|
||||||
"channel": "cli",
|
"channel": "cli",
|
||||||
"user_id": "benchmark",
|
"user_id": "benchmark",
|
||||||
"metadata": {"dry_run": dry_run, "benchmark": True},
|
"metadata": {"no_inference": no_inference, "benchmark": True},
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
|
||||||
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
|
|||||||
|
|
||||||
# ── Main run ───────────────────────────────────────────────────────────────────
|
# ── Main run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
total = len(queries)
|
total = len(queries)
|
||||||
correct = 0
|
correct = 0
|
||||||
|
|
||||||
dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
|
dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
|
||||||
print(f"\nRunning {total} queries{dry_label}\n")
|
print(f"\nRunning {total} queries{dry_label}\n")
|
||||||
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
|
print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
|
||||||
print("─" * 110)
|
print("─" * 110)
|
||||||
@@ -197,16 +197,14 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
category = q["category"]
|
category = q["category"]
|
||||||
query_text = q["query"]
|
query_text = q["query"]
|
||||||
|
|
||||||
# In dry-run, complex queries still use complex classification (logged), but medium infers
|
|
||||||
send_dry = dry_run and expected == "complex"
|
|
||||||
session_id = f"benchmark-{qid}"
|
session_id = f"benchmark-{qid}"
|
||||||
|
|
||||||
print(f"{qid:>3} {expected:8} ", end="", flush=True)
|
print(f"{qid:>3} {expected:8} ", end="", flush=True)
|
||||||
|
|
||||||
logs_before = get_log_tail(80)
|
logs_before = get_log_tail(300)
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
|
ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
|
||||||
if not ok_post:
|
if not ok_post:
|
||||||
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
|
print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
|
||||||
results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
|
results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
|
||||||
@@ -225,7 +223,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
|
|
||||||
# Now the query is done — check logs for tier
|
# Now the query is done — check logs for tier
|
||||||
await asyncio.sleep(0.3)
|
await asyncio.sleep(0.3)
|
||||||
logs_after = get_log_tail(80)
|
logs_after = get_log_tail(300)
|
||||||
actual = extract_tier_from_logs(logs_before, logs_after)
|
actual = extract_tier_from_logs(logs_before, logs_after)
|
||||||
|
|
||||||
elapsed = time.monotonic() - t0
|
elapsed = time.monotonic() - t0
|
||||||
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
|
|||||||
"elapsed": round(elapsed, 1),
|
"elapsed": round(elapsed, 1),
|
||||||
"category": category,
|
"category": category,
|
||||||
"query": query_text,
|
"query": query_text,
|
||||||
"dry_run": send_dry,
|
"no_inference": no_inference,
|
||||||
})
|
})
|
||||||
|
|
||||||
print("─" * 110)
|
print("─" * 110)
|
||||||
@@ -281,9 +279,9 @@ def main():
|
|||||||
parser.add_argument("--ids", help="Comma-separated IDs")
|
parser.add_argument("--ids", help="Comma-separated IDs")
|
||||||
parser.add_argument("--list-categories", action="store_true")
|
parser.add_argument("--list-categories", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dry-run",
|
"--no-inference",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
|
help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-gpu-check",
|
"--skip-gpu-check",
|
||||||
@@ -302,7 +300,7 @@ def main():
|
|||||||
return
|
return
|
||||||
|
|
||||||
# ALWAYS check GPU and RAM before running
|
# ALWAYS check GPU and RAM before running
|
||||||
if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
|
if not preflight_checks(skip_gpu_check=args.no_inference):
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
ids = [int(i) for i in args.ids.split(",")] if args.ids else None
|
ids = [int(i) for i in args.ids.split(",")] if args.ids else None
|
||||||
@@ -311,7 +309,7 @@ def main():
|
|||||||
print("No queries match filters.")
|
print("No queries match filters.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
asyncio.run(run(queries, dry_run=args.dry_run))
|
asyncio.run(run(queries, no_inference=args.no_inference))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
217
benchmarks/run_routing_benchmark.py
Normal file
217
benchmarks/run_routing_benchmark.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Adolf routing benchmark — tests routing decisions only, no LLM inference.
|
||||||
|
|
||||||
|
Sends each query with no_inference=True, waits for the routing decision to
|
||||||
|
appear in docker logs, and records whether the correct tier was selected.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 run_routing_benchmark.py [options]
|
||||||
|
python3 run_routing_benchmark.py --tier light|medium|complex
|
||||||
|
python3 run_routing_benchmark.py --category <name>
|
||||||
|
python3 run_routing_benchmark.py --ids 1,2,3
|
||||||
|
python3 run_routing_benchmark.py --list-categories
|
||||||
|
|
||||||
|
No GPU check needed — inference is disabled for all queries.
|
||||||
|
Adolf must be running at http://localhost:8000.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
ADOLF_URL = "http://localhost:8000"
|
||||||
|
DATASET = Path(__file__).parent / "benchmark.json"
|
||||||
|
RESULTS = Path(__file__).parent / "routing_results_latest.json"
|
||||||
|
QUERY_TIMEOUT = 30 # seconds — routing is fast, no LLM wait
|
||||||
|
|
||||||
|
|
||||||
|
# ── Log helpers ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_log_tail(n: int = 50) -> str:
    """Return the last *n* lines of the ``deepagents`` container log.

    Runs ``docker logs deepagents --tail <n>`` and returns its stdout followed
    by its stderr as one string.

    The output is decoded as UTF-8 with undecodable bytes replaced, so log
    lines containing non-ASCII text cannot raise a decode error.  If the
    docker CLI cannot be started or does not answer within the timeout, a
    warning is printed to stderr and an empty string is returned.
    """
    try:
        result = subprocess.run(
            ["docker", "logs", "deepagents", "--tail", str(n)],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=15,  # never let a stuck docker daemon hang the benchmark
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        print(f"WARNING: could not read docker logs: {exc}", file=sys.stderr)
        return ""
    return result.stdout + result.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
|
||||||
|
"""Find new tier= lines that appeared after we sent the query."""
|
||||||
|
before_lines = set(logs_before.splitlines())
|
||||||
|
new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
|
||||||
|
for line in new_lines:
|
||||||
|
m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
|
||||||
|
if m:
|
||||||
|
tier_raw = m.group(1)
|
||||||
|
return tier_raw.split()[0]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Request helpers ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def post_message(client: httpx.AsyncClient, query_id: int, query: str) -> bool:
    """Submit one benchmark query to the ``/message`` endpoint.

    The request carries ``no_inference`` and ``benchmark`` metadata and a
    per-query session id of the form ``routing-bench-<query_id>``.

    Returns True when the POST completes without an error status.  On any
    exception a ``POST_ERROR`` fragment is printed (without a trailing
    newline, so it stays on the caller's current output row) and False is
    returned.
    """
    body = dict(
        text=query,
        session_id=f"routing-bench-{query_id}",
        channel="cli",
        user_id="benchmark",
        metadata=dict(no_inference=True, benchmark=True),
    )
    try:
        response = await client.post(f"{ADOLF_URL}/message", json=body, timeout=10)
        response.raise_for_status()
    except Exception as exc:
        print(f" POST_ERROR: {exc}", end="")
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# ── Dataset ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_dataset() -> list[dict]:
    """Load the benchmark queries from the ``DATASET`` JSON file.

    The file is opened as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII query text is read the same
    way on every machine.

    Returns the list stored under the top-level ``"queries"`` key.
    """
    with open(DATASET, encoding="utf-8") as f:
        return json.load(f)["queries"]
|
||||||
|
|
||||||
|
|
||||||
|
def filter_queries(queries, tier, category, ids):
    """Return the queries that satisfy every filter that is set.

    Args:
        queries: iterable of query dicts with ``"tier"``, ``"category"`` and
            ``"id"`` keys.
        tier: keep only queries whose ``"tier"`` equals this value.
        category: keep only queries whose ``"category"`` equals this value.
        ids: keep only queries whose ``"id"`` is in this collection.

    A falsy filter (``None``, ``""``, empty collection) is ignored.  The
    result is always a new list in the original order.
    """
    # Set membership keeps the id check O(1) per query.
    wanted_ids = set(ids) if ids else None
    return [
        q
        for q in queries
        if (not tier or q["tier"] == tier)
        and (not category or q["category"] == category)
        and (wanted_ids is None or q["id"] in wanted_ids)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _wait_for_stream_done(client, session_id: str) -> None:
    """Read the session's SSE stream until a ``data: [DONE]`` line arrives."""
    try:
        async with client.stream(
            "GET", f"{ADOLF_URL}/stream/{session_id}", timeout=QUERY_TIMEOUT
        ) as sse:
            async for line in sse.aiter_lines():
                if "data: [DONE]" in line:
                    break
    except Exception:
        # Deliberate best-effort: on a timeout or connection problem the
        # caller still inspects the container logs for the routing decision.
        pass


def _print_summary(results: list[dict]) -> None:
    """Print overall accuracy, per-tier counts and the misclassified queries."""
    total = len(results)
    correct = sum(1 for r in results if r["ok"])

    print("─" * 110)
    accuracy = correct / total * 100 if total else 0
    print(f"\nAccuracy: {correct}/{total} ({accuracy:.0f}%)")

    for tier_name in ("light", "medium", "complex"):
        tier_qs = [r for r in results if r["expected"] == tier_name]
        if tier_qs:
            tier_ok = sum(1 for r in tier_qs if r["ok"])
            print(f" {tier_name:8}: {tier_ok}/{len(tier_qs)}")

    wrong = [r for r in results if not r["ok"]]
    if wrong:
        print(f"\nMisclassified ({len(wrong)}):")
        for r in wrong:
            # Records of failed POSTs carry actual=None; None cannot be
            # formatted with a width spec, so show "?" instead.
            actual = r["actual"] or "?"
            print(f" id={r['id']:3} expected={r['expected']:8} actual={actual:8} {r['query'][:60]}")


async def run(queries: list[dict]) -> list[dict]:
    """Send every query with inference disabled and score the routing tier.

    For each query: snapshot the container log, POST the query, wait for the
    session's stream to finish, snapshot the log again and extract the tier
    from the lines that appeared in between.  A result counts as correct
    when the tier equals the expected one, or when ``fast`` was chosen for a
    query expected to be ``medium``.

    Exits the process with status 1 if the ``/health`` endpoint cannot be
    reached.  Prints a table row per query plus a summary, writes all
    records to ``RESULTS`` as UTF-8 JSON, and returns them.
    """
    results = []

    async with httpx.AsyncClient() as client:
        try:
            r = await client.get(f"{ADOLF_URL}/health", timeout=5)
            r.raise_for_status()
        except Exception as e:
            print(f"ERROR: Adolf not reachable: {e}", file=sys.stderr)
            sys.exit(1)

        total = len(queries)

        print(f"\nRunning {total} queries [NO-INFERENCE: routing only]\n")
        print(f"{'ID':>3} {'EXPECTED':8} {'ACTUAL':8} {'OK':3} {'TIME':6} {'CATEGORY':22} QUERY")
        print("─" * 110)

        for q in queries:
            qid = q["id"]
            expected = q["tier"]
            category = q["category"]
            query_text = q["query"]
            session_id = f"routing-bench-{qid}"

            print(f"{qid:>3} {expected:8} ", end="", flush=True)

            logs_before = get_log_tail(300)
            t0 = time.monotonic()

            ok_post = await post_message(client, qid, query_text)
            if not ok_post:
                print(f"{'?':8} {'ERR':3} {'?':6} {category:22} {query_text[:40]}")
                # Keep the keys the summary reads ("query") so a failed POST
                # cannot crash the report before the results are saved.
                results.append({
                    "id": qid,
                    "expected": expected,
                    "actual": None,
                    "ok": False,
                    "category": category,
                    "query": query_text,
                })
                continue

            await _wait_for_stream_done(client, session_id)

            # Short pause before the second log snapshot.
            await asyncio.sleep(0.3)
            logs_after = get_log_tail(300)
            actual = extract_tier_from_logs(logs_before, logs_after)

            elapsed = time.monotonic() - t0
            match = actual == expected or (actual == "fast" and expected == "medium")

            mark = "✓" if match else "✗"
            actual_str = actual or "?"
            print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}")

            results.append({
                "id": qid,
                "expected": expected,
                "actual": actual_str,
                "ok": match,
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
            })

    _print_summary(results)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be fixed rather than left to the platform default.
    with open(RESULTS, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {RESULTS}")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line, select the queries and run the benchmark.

    ``--list-categories`` prints dataset statistics and returns without
    running anything.  A malformed ``--ids`` value is reported through the
    argument parser (usage message, exit status 2) instead of an uncaught
    ``ValueError``.  Exits with status 1 when no query matches the filters.
    """
    parser = argparse.ArgumentParser(
        description="Adolf routing benchmark — routing decisions only, no LLM inference",
    )
    parser.add_argument("--tier", choices=["light", "medium", "complex"])
    parser.add_argument("--category")
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    args = parser.parse_args()

    queries = load_dataset()

    if args.list_categories:
        cats = sorted(set(q["category"] for q in queries))
        tiers = {t: sum(1 for q in queries if q["tier"] == t) for t in ["light", "medium", "complex"]}
        print(f"Total: {len(queries)} | Tiers: {tiers}")
        print(f"Categories: {cats}")
        return

    ids = None
    if args.ids:
        try:
            # Empty tokens (e.g. a trailing comma) are ignored.
            ids = [int(tok) for tok in args.ids.split(",") if tok.strip()]
        except ValueError:
            parser.error(f"--ids must be a comma-separated list of integers, got {args.ids!r}")
        if not ids:
            # An empty list would disable the filter and run every query.
            parser.error("--ids did not contain any IDs")

    queries = filter_queries(queries, args.tier, args.category, ids)
    if not queries:
        print("No queries match filters.")
        sys.exit(1)

    asyncio.run(run(queries))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
11
router.py
11
router.py
@@ -52,6 +52,13 @@ _LIGHT_PATTERNS = re.compile(
|
|||||||
r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю"
|
r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю"
|
||||||
r"|пожалуйста|не за что|всё понятно|ясно"
|
r"|пожалуйста|не за что|всё понятно|ясно"
|
||||||
r"|как дела|как ты|как жизнь|всё хорошо|всё ок"
|
r"|как дела|как ты|как жизнь|всё хорошо|всё ок"
|
||||||
|
# Russian tech definitions — static knowledge (no tools needed)
|
||||||
|
r"|что\s+такое\s+\S+"
|
||||||
|
r"|что\s+означает\s+\S+"
|
||||||
|
r"|сколько\s+(?:бит|байт|байтов|мегабайт|мегабайтов|гигабайт|гигабайтов)(?:\s+\w+)*"
|
||||||
|
# Compound Russian greetings
|
||||||
|
r"|привет[,!]?\s+как\s+дела"
|
||||||
|
r"|добрый\s+(?:день|вечер|утро)[,!]?\s+как\s+дела"
|
||||||
r")[\s!.?]*$",
|
r")[\s!.?]*$",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
@@ -314,6 +321,10 @@ _MEDIUM_PATTERNS = re.compile(
|
|||||||
r"|курс (?:доллара|биткоина|евро|рубл)"
|
r"|курс (?:доллара|биткоина|евро|рубл)"
|
||||||
r"|(?:последние |свежие )?новости\b"
|
r"|(?:последние |свежие )?новости\b"
|
||||||
r"|(?:погода|температура)\s+(?:на завтра|на неделю)"
|
r"|(?:погода|температура)\s+(?:на завтра|на неделю)"
|
||||||
|
# Smart home commands that don't use verb-first pattern
|
||||||
|
r"|(?:свет|лампочк|освещени)\w*\s+(?:включ|выключ|убавь|прибавь)"
|
||||||
|
r"|(?:дома|в доме|по всему дому)\s+(?:свет|лампочк)"
|
||||||
|
r"|(?:режим|сцена)\s+(?:ночной|утренний|вечерний|кинотеатр)"
|
||||||
r")",
|
r")",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ import urllib.request
|
|||||||
|
|
||||||
# ── config ────────────────────────────────────────────────────────────────────
|
# ── config ────────────────────────────────────────────────────────────────────
|
||||||
DEEPAGENTS = "http://localhost:8000"
|
DEEPAGENTS = "http://localhost:8000"
|
||||||
BIFROST = "http://localhost:8080"
|
LITELLM = "http://localhost:4000"
|
||||||
OPENMEMORY = "http://localhost:8765"
|
OPENMEMORY = "http://localhost:8765"
|
||||||
GRAMMY_HOST = "localhost"
|
GRAMMY_HOST = "localhost"
|
||||||
GRAMMY_PORT = 3001
|
GRAMMY_PORT = 3001
|
||||||
@@ -156,19 +156,6 @@ def fetch_logs(since_s=600):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def fetch_bifrost_logs(since_s=120):
|
|
||||||
"""Return bifrost container log lines from the last since_s seconds."""
|
|
||||||
try:
|
|
||||||
r = subprocess.run(
|
|
||||||
["docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
|
|
||||||
f"--since={int(since_s)}s", "--no-log-prefix"],
|
|
||||||
capture_output=True, text=True, timeout=10,
|
|
||||||
)
|
|
||||||
return r.stdout.splitlines()
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def parse_run_block(lines, msg_prefix):
|
def parse_run_block(lines, msg_prefix):
|
||||||
"""
|
"""
|
||||||
Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
|
Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ Tests:
|
|||||||
1. Name store — POST "remember that your name is <RandomName>"
|
1. Name store — POST "remember that your name is <RandomName>"
|
||||||
2. Qdrant point — verifies a new vector was written after store
|
2. Qdrant point — verifies a new vector was written after store
|
||||||
3. Name recall — POST "what is your name?" → reply must contain <RandomName>
|
3. Name recall — POST "what is your name?" → reply must contain <RandomName>
|
||||||
4. Bifrost — verifies store/recall requests passed through Bifrost
|
4. LiteLLM — verifies LiteLLM proxy is reachable (replaced Bifrost)
|
||||||
5. Timing profile — breakdown of store and recall latencies
|
5. Timing profile — breakdown of store and recall latencies
|
||||||
6. Memory benchmark — store 5 personal facts, recall with 10 questions
|
6. Memory benchmark — store 5 personal facts, recall with 10 questions
|
||||||
7. Dedup test — same fact stored twice must not grow Qdrant by 2 points
|
7. Dedup test — same fact stored twice must not grow Qdrant by 2 points
|
||||||
@@ -24,11 +24,11 @@ import time
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
from common import (
|
from common import (
|
||||||
DEEPAGENTS, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID,
|
DEEPAGENTS, LITELLM, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID,
|
||||||
NAMES,
|
NAMES,
|
||||||
INFO, PASS, FAIL, WARN,
|
INFO, PASS, FAIL, WARN,
|
||||||
report, print_summary, tf,
|
report, print_summary, tf,
|
||||||
get, post_json, qdrant_count, fetch_logs, fetch_bifrost_logs,
|
get, post_json, qdrant_count, fetch_logs,
|
||||||
parse_run_block, wait_for,
|
parse_run_block, wait_for,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -155,14 +155,13 @@ if _run_name:
|
|||||||
report(results, "Agent replied to recall message", False, "timeout")
|
report(results, "Agent replied to recall message", False, "timeout")
|
||||||
report(results, f"Reply contains '{random_name}'", False, "no reply")
|
report(results, f"Reply contains '{random_name}'", False, "no reply")
|
||||||
|
|
||||||
# ── 4. Bifrost pass-through check ─────────────────────────────────────────
|
# ── 4. LiteLLM proxy reachable (replaced Bifrost) ─────────────────────────
|
||||||
bifrost_lines = fetch_bifrost_logs(since_s=300)
|
try:
|
||||||
report(results, "Bifrost container has log output (requests forwarded)",
|
status, _ = get(f"{LITELLM}/health", timeout=5)
|
||||||
len(bifrost_lines) > 0, f"{len(bifrost_lines)} lines in bifrost logs")
|
litellm_ok = status == 200
|
||||||
bifrost_raw = "\n".join(bifrost_lines)
|
except Exception:
|
||||||
report(results, " Bifrost log shows AsyncOpenAI agent requests",
|
litellm_ok = False
|
||||||
"AsyncOpenAI" in bifrost_raw,
|
report(results, "LiteLLM proxy reachable", litellm_ok)
|
||||||
f"{'found' if 'AsyncOpenAI' in bifrost_raw else 'NOT found'} in bifrost logs")
|
|
||||||
|
|
||||||
# ── 5. Timing profile ─────────────────────────────────────────────────────
|
# ── 5. Timing profile ─────────────────────────────────────────────────────
|
||||||
print(f"\n[{INFO}] 5. Timing profile")
|
print(f"\n[{INFO}] 5. Timing profile")
|
||||||
|
|||||||
Reference in New Issue
Block a user