Merge pull request 'feat: rename dry_run to no_inference for all tiers' (#17 ) from worktree-agent-afc013ce into main

Reviewed-on: #17
Merge pull request 'feat: rename --dry-run to --no-inference in run_benchmark.py' (#18 ) from feat/no-inference-benchmark into main
2026-03-24 07:27:04 +00:00 · 2026-03-24 07:26:44 +00:00 · 2026-03-24 07:26:31 +00:00 · 2026-03-24 07:25:16 +00:00 · 2026-03-24 03:49:09 +00:00 · 2026-03-24 03:43:42 +00:00
6 changed files with 265 additions and 56 deletions
--- a/agent.py
+++ b/agent.py
@@ -431,13 +431,13 @@ async def _run_agent_pipeline(
    history: list[dict],
    session_id: str,
    tier_override: str | None = None,
-    dry_run: bool = False,
+    no_inference: bool = False,
    tier_capture: list | None = None,
 ) -> AsyncGenerator[str, None]:
    """Core pipeline: pre-flight → routing → inference. Yields text chunks.
    tier_override: "light" | "medium" | "complex" | None (auto-route)
-    dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost)
+    no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
    Caller is responsible for scheduling _store_memory after consuming all chunks.
    """
    async with _reply_semaphore:
@@ -471,7 +471,7 @@ async def _run_agent_pipeline(
        try:
            # Short-circuit: fast tool already has the answer
-            if fast_context and tier_override is None and not url_context:
+            if fast_context and tier_override is None and not url_context and not no_inference:
                tier = "fast"
                final_text = fast_context
                llm_elapsed = time.monotonic() - t0
@@ -494,17 +494,14 @@ async def _run_agent_pipeline(
                        light_reply = None
                        print("[agent] URL in message → upgraded light→medium", flush=True)
-                # Dry-run: log as complex but infer with medium (no remote API call)
+                print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
                effective_tier = tier
                if dry_run and tier == "complex":
                    effective_tier = "medium"
                    print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True)
                else:
                    print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
                tier = effective_tier
                if tier_capture is not None:
                    tier_capture.append(tier)
                if no_inference:
                    yield "I don't know"
                    return
                if tier == "light":
                    final_text = light_reply
                    llm_elapsed = time.monotonic() - t0
@@ -594,7 +591,7 @@ async def run_agent_task(
    t0 = time.monotonic()
    meta = metadata or {}
-    dry_run = bool(meta.get("dry_run", False))
+    no_inference = bool(meta.get("no_inference", False))
    is_benchmark = bool(meta.get("benchmark", False))
    history = _conversation_buffers.get(session_id, [])
@@ -602,7 +599,7 @@ async def run_agent_task(
    actual_tier = "unknown"
    tier_capture: list = []
-    async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run, tier_capture=tier_capture):
+    async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture):
        await _push_stream_chunk(session_id, chunk)
        if final_text is None:
            final_text = chunk
--- a/benchmarks/run_benchmark.py
+++ b/benchmarks/run_benchmark.py
@@ -11,7 +11,7 @@ Usage:
    python3 run_benchmark.py --category <name>
    python3 run_benchmark.py --ids 1,2,3
    python3 run_benchmark.py --list-categories
-    python3 run_benchmark.py --dry-run         # complex queries use medium model (no API cost)
+    python3 run_benchmark.py --no-inference    # skip all LLM inference — routing decisions only, all tiers
 IMPORTANT: Always check GPU is free before running. This script does it automatically.
@@ -120,11 +120,11 @@ def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
    """Find new tier= lines that appeared after we sent the query."""
    before_lines = set(logs_before.splitlines())
    new_lines = [l for l in logs_after.splitlines() if l not in before_lines]
-    for line in reversed(new_lines):
+    for line in new_lines:
-        m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line)
+        m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
        if m:
            tier_raw = m.group(1)
-            # Normalise: "complex (dry-run)" → "complex"
+            # Normalise: "complex (no-inference)" → "complex"
            return tier_raw.split()[0]
    return None
@@ -135,14 +135,14 @@ async def post_message(
    client: httpx.AsyncClient,
    query_id: int,
    query: str,
-    dry_run: bool = False,
+    no_inference: bool = False,
 ) -> bool:
    payload = {
        "text": query,
        "session_id": f"benchmark-{query_id}",
        "channel": "cli",
        "user_id": "benchmark",
-        "metadata": {"dry_run": dry_run, "benchmark": True},
+        "metadata": {"no_inference": no_inference, "benchmark": True},
    }
    try:
        r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -172,7 +172,7 @@ def filter_queries(queries, tier, category, ids):
 # ── Main run ───────────────────────────────────────────────────────────────────
-async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
+async def run(queries: list[dict], no_inference: bool = False) -> list[dict]:
    results = []
    async with httpx.AsyncClient() as client:
@@ -186,7 +186,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
        total = len(queries)
        correct = 0
-        dry_label = " [DRY-RUN: complex→medium]" if dry_run else ""
+        dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
        print(f"\nRunning {total} queries{dry_label}\n")
        print(f"{'ID':>3}  {'EXPECTED':8}  {'ACTUAL':8}  {'OK':3}  {'TIME':6}  {'CATEGORY':22}  QUERY")
        print("─" * 110)
@@ -197,16 +197,14 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
            category = q["category"]
            query_text = q["query"]
            # In dry-run, complex queries still use complex classification (logged), but medium infers
            send_dry = dry_run and expected == "complex"
            session_id = f"benchmark-{qid}"
            print(f"{qid:>3}  {expected:8}  ", end="", flush=True)
-            logs_before = get_log_tail(80)
+            logs_before = get_log_tail(300)
            t0 = time.monotonic()
-            ok_post = await post_message(client, qid, query_text, dry_run=send_dry)
+            ok_post = await post_message(client, qid, query_text, no_inference=no_inference)
            if not ok_post:
                print(f"{'?':8}  {'ERR':3}  {'?':6}  {category:22}  {query_text[:40]}")
                results.append({"id": qid, "expected": expected, "actual": None, "ok": False})
@@ -225,7 +223,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
            # Now the query is done — check logs for tier
            await asyncio.sleep(0.3)
-            logs_after = get_log_tail(80)
+            logs_after = get_log_tail(300)
            actual = extract_tier_from_logs(logs_before, logs_after)
            elapsed = time.monotonic() - t0
@@ -245,7 +243,7 @@ async def run(queries: list[dict], dry_run: bool = False) -> list[dict]:
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
-                "dry_run": send_dry,
+                "no_inference": no_inference,
            })
        print("─" * 110)
@@ -281,9 +279,9 @@ def main():
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    parser.add_argument(
-        "--dry-run",
+        "--no-inference",
        action="store_true",
-        help="For complex queries: route classification is tested but medium model is used for inference (no API cost)",
+        help="Skip LLM inference for all tiers — only routing decisions are tested (no GPU/API cost)",
    )
    parser.add_argument(
        "--skip-gpu-check",
@@ -302,7 +300,7 @@ def main():
        return
    # ALWAYS check GPU and RAM before running
-    if not preflight_checks(skip_gpu_check=args.skip_gpu_check):
+    if not preflight_checks(skip_gpu_check=args.no_inference):
        sys.exit(1)
    ids = [int(i) for i in args.ids.split(",")] if args.ids else None
@@ -311,7 +309,7 @@ def main():
        print("No queries match filters.")
        sys.exit(1)
-    asyncio.run(run(queries, dry_run=args.dry_run))
+    asyncio.run(run(queries, no_inference=args.no_inference))
 if __name__ == "__main__":
--- a/benchmarks/run_routing_benchmark.py
+++ b/benchmarks/run_routing_benchmark.py
@@ -0,0 +1,217 @@
 #!/usr/bin/env python3
 """
 Adolf routing benchmark — tests routing decisions only, no LLM inference.
 Sends each query with no_inference=True, waits for the routing decision to
 appear in docker logs, and records whether the correct tier was selected.
 Usage:
    python3 run_routing_benchmark.py [options]
    python3 run_routing_benchmark.py --tier light|medium|complex
    python3 run_routing_benchmark.py --category <name>
    python3 run_routing_benchmark.py --ids 1,2,3
    python3 run_routing_benchmark.py --list-categories
 No GPU check needed — inference is disabled for all queries.
 Adolf must be running at http://localhost:8000.
 """
 import argparse
 import asyncio
 import json
 import re
 import subprocess
 import sys
 import time
 from pathlib import Path
 import httpx
 # Base URL of the Adolf service; /health, /message and /stream/<session_id> are requested under it.
 ADOLF_URL = "http://localhost:8000"
 # Benchmark dataset, located next to this script; load_dataset() reads its "queries" list.
 DATASET = Path(__file__).parent / "benchmark.json"
 # Output file, located next to this script; run() overwrites it with the per-query results.
 RESULTS = Path(__file__).parent / "routing_results_latest.json"
 # Timeout passed to the SSE stream request in run().
 QUERY_TIMEOUT = 30  # seconds — routing is fast, no LLM wait
 # ── Log helpers ────────────────────────────────────────────────────────────────
 def get_log_tail(n: int = 50) -> str:
    """Return the last *n* lines of the ``deepagents`` container log.

    Runs ``docker logs deepagents --tail <n>`` and returns the captured
    stdout followed by the captured stderr as a single string.
    """
    command = ["docker", "logs", "deepagents", "--tail", str(n)]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout + completed.stderr
 def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
    """Find new tier= lines that appeared after we sent the query."""
    before_lines = set(logs_before.splitlines())
    new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
    for line in new_lines:
        m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
        if m:
            tier_raw = m.group(1)
            return tier_raw.split()[0]
    return None
 # ── Request helpers ────────────────────────────────────────────────────────────
 async def post_message(client: httpx.AsyncClient, query_id: int, query: str) -> bool:
    """POST one benchmark query to Adolf with ``no_inference`` enabled.

    Returns True when the request completes and ``raise_for_status()`` does
    not raise. On any exception a ``POST_ERROR`` note is printed (without a
    trailing newline) and False is returned.
    """
    body = {
        "text": query,
        "session_id": f"routing-bench-{query_id}",
        "channel": "cli",
        "user_id": "benchmark",
        "metadata": {"no_inference": True, "benchmark": True},
    }
    try:
        response = await client.post(f"{ADOLF_URL}/message", json=body, timeout=10)
        response.raise_for_status()
    except Exception as exc:
        print(f" POST_ERROR: {exc}", end="")
        return False
    return True
 # ── Dataset ────────────────────────────────────────────────────────────────────
 def load_dataset() -> list[dict]:
    """Load the benchmark queries from the DATASET JSON file.

    The file is decoded explicitly as UTF-8 so that non-ASCII query text
    loads the same way regardless of the platform's locale encoding.

    Returns:
        The value stored under the top-level ``"queries"`` key.

    Raises:
        OSError: If DATASET cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``"queries"`` key.
    """
    with open(DATASET, encoding="utf-8") as f:
        return json.load(f)["queries"]
 def filter_queries(queries, tier, category, ids):
    """Narrow *queries* by tier, category and id; falsy filters are ignored.

    Filters are applied in the order tier, category, ids. When no filter is
    active the input list itself is returned.
    """
    selected = queries
    # Exact-match filters on a single field each.
    for field, wanted in (("tier", tier), ("category", category)):
        if wanted:
            selected = [entry for entry in selected if entry[field] == wanted]
    # Membership filter on the query id.
    if ids:
        selected = [entry for entry in selected if entry["id"] in ids]
    return selected
 # ── Main run ───────────────────────────────────────────────────────────────────
 async def run(queries: list[dict]) -> list[dict]:
    """Run the routing benchmark over *queries* and return per-query results.

    For each query: snapshot the container log, POST the query with
    inference disabled, drain the session's SSE stream until ``[DONE]``
    (or a timeout/connection error), snapshot the log again and read the
    routed tier from the new log lines. Prints a results table, overall and
    per-tier accuracy and the misclassified queries, then writes all results
    to RESULTS as UTF-8 JSON.

    Exits the process with status 1 if the ``/health`` check fails.

    Args:
        queries: Dataset entries with ``id``, ``tier``, ``category`` and
            ``query`` keys.

    Returns:
        One dict per query with ``id``, ``expected``, ``actual``, ``ok``,
        ``category`` and ``query``; queries whose POST succeeded also carry
        ``elapsed``. ``actual`` is None when the POST itself failed.
    """
    results = []
    async with httpx.AsyncClient() as client:
        try:
            health = await client.get(f"{ADOLF_URL}/health", timeout=5)
            health.raise_for_status()
        except Exception as e:
            print(f"ERROR: Adolf not reachable: {e}", file=sys.stderr)
            sys.exit(1)
        total = len(queries)
        correct = 0
        print(f"\nRunning {total} queries [NO-INFERENCE: routing only]\n")
        print(f"{'ID':>3}  {'EXPECTED':8}  {'ACTUAL':8}  {'OK':3}  {'TIME':6}  {'CATEGORY':22}  QUERY")
        print("─" * 110)
        for q in queries:
            qid = q["id"]
            expected = q["tier"]
            category = q["category"]
            query_text = q["query"]
            session_id = f"routing-bench-{qid}"
            print(f"{qid:>3}  {expected:8}  ", end="", flush=True)
            logs_before = get_log_tail(300)
            t0 = time.monotonic()
            ok_post = await post_message(client, qid, query_text)
            if not ok_post:
                print(f"{'?':8}  {'ERR':3}  {'?':6}  {category:22}  {query_text[:40]}")
                # Failed posts keep category/query too: the summary below reads them.
                results.append({
                    "id": qid,
                    "expected": expected,
                    "actual": None,
                    "ok": False,
                    "category": category,
                    "query": query_text,
                })
                continue
            try:
                async with client.stream(
                    "GET", f"{ADOLF_URL}/stream/{session_id}", timeout=QUERY_TIMEOUT
                ) as sse:
                    async for line in sse.aiter_lines():
                        if "data: [DONE]" in line:
                            break
            except Exception:
                pass  # timeout or connection issue — check logs anyway
            # Short pause before the second log snapshot is taken.
            await asyncio.sleep(0.3)
            logs_after = get_log_tail(300)
            actual = extract_tier_from_logs(logs_before, logs_after)
            elapsed = time.monotonic() - t0
            # A "fast" result is accepted where "medium" was expected.
            match = actual == expected or (actual == "fast" and expected == "medium")
            if match:
                correct += 1
            mark = "✓" if match else "✗"
            actual_str = actual or "?"
            print(f"{actual_str:8}  {mark:3}  {elapsed:5.1f}s  {category:22}  {query_text[:40]}")
            results.append({
                "id": qid,
                "expected": expected,
                "actual": actual_str,
                "ok": match,
                "elapsed": round(elapsed, 1),
                "category": category,
                "query": query_text,
            })
        print("─" * 110)
        accuracy = correct / total * 100 if total else 0
        print(f"\nAccuracy: {correct}/{total} ({accuracy:.0f}%)")
        for tier_name in ["light", "medium", "complex"]:
            tier_qs = [res for res in results if res["expected"] == tier_name]
            if tier_qs:
                tier_ok = sum(1 for res in tier_qs if res["ok"])
                print(f"  {tier_name:8}: {tier_ok}/{len(tier_qs)}")
        wrong = [res for res in results if not res["ok"]]
        if wrong:
            print(f"\nMisclassified ({len(wrong)}):")
            for res in wrong:
                # actual is None for failed posts, and None rejects a width format spec.
                shown_actual = res["actual"] or "?"
                print(f"  id={res['id']:3}  expected={res['expected']:8}  actual={shown_actual:8}  {res['query'][:60]}")
    # ensure_ascii=False emits raw non-ASCII text, so the encoding is pinned to UTF-8.
    with open(RESULTS, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nResults saved to {RESULTS}")
    return results
 def main():
    """CLI entry point: parse filter options, select queries and run the benchmark.

    With ``--list-categories`` only the dataset totals and category names
    are printed. Exits with status 1 when the filters match no query.
    """
    parser = argparse.ArgumentParser(
        description="Adolf routing benchmark — routing decisions only, no LLM inference",
    )
    parser.add_argument("--tier", choices=["light", "medium", "complex"])
    parser.add_argument("--category")
    parser.add_argument("--ids", help="Comma-separated IDs")
    parser.add_argument("--list-categories", action="store_true")
    args = parser.parse_args()

    dataset = load_dataset()
    if args.list_categories:
        cats = sorted({entry["category"] for entry in dataset})
        tiers = {}
        for tier_name in ["light", "medium", "complex"]:
            tiers[tier_name] = len([entry for entry in dataset if entry["tier"] == tier_name])
        print(f"Total: {len(dataset)} | Tiers: {tiers}")
        print(f"Categories: {cats}")
        return

    wanted_ids = None
    if args.ids:
        wanted_ids = list(map(int, args.ids.split(",")))
    selected = filter_queries(dataset, args.tier, args.category, wanted_ids)
    if not selected:
        print("No queries match filters.")
        sys.exit(1)
    asyncio.run(run(selected))
 if __name__ == "__main__":
    main()
--- a/router.py
+++ b/router.py
@@ -52,6 +52,13 @@ _LIGHT_PATTERNS = re.compile(
    r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю"
    r"|пожалуйста|не за что|всё понятно|ясно"
    r"|как дела|как ты|как жизнь|всё хорошо|всё ок"
    # Russian tech definitions — static knowledge (no tools needed)
    r"|что\s+такое\s+\S+"
    r"|что\s+означает\s+\S+"
    r"|сколько\s+(?:бит|байт|байтов|мегабайт|мегабайтов|гигабайт|гигабайтов)(?:\s+\w+)*"
    # Compound Russian greetings
    r"|привет[,!]?\s+как\s+дела"
    r"|добрый\s+(?:день|вечер|утро)[,!]?\s+как\s+дела"
    r")[\s!.?]*$",
    re.IGNORECASE,
 )
@@ -314,6 +321,10 @@ _MEDIUM_PATTERNS = re.compile(
    r"|курс (?:доллара|биткоина|евро|рубл)"
    r"|(?:последние |свежие )?новости\b"
    r"|(?:погода|температура)\s+(?:на завтра|на неделю)"
    # Smart home commands that don't use verb-first pattern
    r"|(?:свет|лампочк|освещени)\w*\s+(?:включ|выключ|убавь|прибавь)"
    r"|(?:дома|в доме|по всему дому)\s+(?:свет|лампочк)"
    r"|(?:режим|сцена)\s+(?:ночной|утренний|вечерний|кинотеатр)"
    r")",
    re.IGNORECASE,
 )
--- a/tests/integration/common.py
+++ b/tests/integration/common.py
@@ -11,7 +11,7 @@ import urllib.request
 # ── config ────────────────────────────────────────────────────────────────────
 DEEPAGENTS   = "http://localhost:8000"
-BIFROST      = "http://localhost:8080"
+LITELLM      = "http://localhost:4000"
 OPENMEMORY   = "http://localhost:8765"
 GRAMMY_HOST  = "localhost"
 GRAMMY_PORT  = 3001
@@ -156,19 +156,6 @@ def fetch_logs(since_s=600):
        return []
 def fetch_bifrost_logs(since_s=120):
    """Return bifrost container log lines from the last since_s seconds.

    Runs ``docker compose -f COMPOSE_FILE logs bifrost`` with ``--since`` and
    ``--no-log-prefix`` and returns the captured stdout split into lines.
    Any exception, including the 10-second subprocess timeout, yields an
    empty list instead of propagating.
    """
    try:
        r = subprocess.run(
            ["docker", "compose", "-f", COMPOSE_FILE, "logs", "bifrost",
             f"--since={int(since_s)}s", "--no-log-prefix"],
            capture_output=True, text=True, timeout=10,
        )
        return r.stdout.splitlines()
    except Exception:
        # Best-effort: the caller gets an empty list on any failure.
        return []
 def parse_run_block(lines, msg_prefix):
    """
    Scan log lines for the LAST '[agent] running: <msg_prefix>' block.
--- a/tests/integration/test_memory.py
+++ b/tests/integration/test_memory.py
@@ -6,7 +6,7 @@ Tests:
  1. Name store   — POST "remember that your name is <RandomName>"
  2. Qdrant point — verifies a new vector was written after store
  3. Name recall  — POST "what is your name?" → reply must contain <RandomName>
-  4. Bifrost      — verifies store/recall requests passed through Bifrost
+  4. LiteLLM      — verifies LiteLLM proxy is reachable (replaced Bifrost)
  5. Timing profile — breakdown of store and recall latencies
  6. Memory benchmark — store 5 personal facts, recall with 10 questions
  7. Dedup test   — same fact stored twice must not grow Qdrant by 2 points
@@ -24,11 +24,11 @@ import time
 import urllib.request
 from common import (
-    DEEPAGENTS, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID,
+    DEEPAGENTS, LITELLM, QDRANT, COMPOSE_FILE, DEFAULT_CHAT_ID,
    NAMES,
    INFO, PASS, FAIL, WARN,
    report, print_summary, tf,
-    get, post_json, qdrant_count, fetch_logs, fetch_bifrost_logs,
+    get, post_json, qdrant_count, fetch_logs,
    parse_run_block, wait_for,
 )
@@ -155,14 +155,13 @@ if _run_name:
        report(results, "Agent replied to recall message", False, "timeout")
        report(results, f"Reply contains '{random_name}'", False, "no reply")
-    # ── 4. Bifrost pass-through check ─────────────────────────────────────────
+    # ── 4. LiteLLM proxy reachable (replaced Bifrost) ─────────────────────────
-    bifrost_lines = fetch_bifrost_logs(since_s=300)
+    try:
-    report(results, "Bifrost container has log output (requests forwarded)",
+        status, _ = get(f"{LITELLM}/health", timeout=5)
-           len(bifrost_lines) > 0, f"{len(bifrost_lines)} lines in bifrost logs")
+        litellm_ok = status == 200
-    bifrost_raw = "\n".join(bifrost_lines)
+    except Exception:
-    report(results, "  Bifrost log shows AsyncOpenAI agent requests",
+        litellm_ok = False
-           "AsyncOpenAI" in bifrost_raw,
+    report(results, "LiteLLM proxy reachable", litellm_ok)
           f"{'found' if 'AsyncOpenAI' in bifrost_raw else 'NOT found'} in bifrost logs")
    # ── 5. Timing profile ─────────────────────────────────────────────────────
    print(f"\n[{INFO}] 5. Timing profile")
Author	SHA1	Message	Date
alvis	fc53632c7b	Merge pull request 'feat: rename dry_run to no_inference for all tiers' (#17 ) from worktree-agent-afc013ce into main Reviewed-on: #17	2026-03-24 07:27:04 +00:00
alvis	47a1166be6	Merge pull request 'feat: rename --dry-run to --no-inference in run_benchmark.py' (#18 ) from feat/no-inference-benchmark into main Reviewed-on: #18	2026-03-24 07:26:44 +00:00
alvis	74e5b1758d	Merge pull request 'feat: add run_routing_benchmark.py — routing-only benchmark' (#19 ) from feat/routing-benchmark into main Reviewed-on: #19	2026-03-24 07:26:31 +00:00
alvis	0fbdbf3a5e	Add run_routing_benchmark.py — dedicated routing-only benchmark Tests routing accuracy for all tiers with no_inference=True hardcoded. Fast (QUERY_TIMEOUT=30s), no GPU check, shares benchmark.json dataset. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 07:25:16 +00:00
alvis	77db739819	Rename --dry-run to --no-inference, apply to all tiers in run_benchmark.py No-inference mode now skips LLM for all tiers (not just complex), GPU check is auto-skipped, and the metadata key matches agent.py. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 03:49:09 +00:00
alvis	9c2f27eed4	Rename dry_run → no_inference, extend to all tiers in agent.py When no_inference=True, routing decision is captured but all LLM inference is skipped — yields constant "I don't know" immediately. Also disables fast-tool short-circuit so routing path always runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 03:43:42 +00:00
alvis	a363347ae5	Merge pull request 'Fix routing: add Russian tech def patterns to light, strengthen medium smart home' (#13 ) from fix/routing-accuracy into main Reviewed-on: #13	2026-03-24 02:51:17 +00:00
alvis	1d2787766e	Merge pull request 'Remove Bifrost: replace test 4 with LiteLLM health check' (#14 ) from fix/remove-bifrost into main Reviewed-on: #14	2026-03-24 02:48:40 +00:00
alvis	abf792a2ec	Remove Bifrost: replace test 4 with LiteLLM health check - Remove BIFROST constant and fetch_bifrost_logs() from common.py - Add LITELLM constant (localhost:4000) - Replace test_memory.py test 4 (Bifrost pass-through) with LiteLLM health check Fixes #5 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 02:46:01 +00:00
alvis	537e927146	Fix routing: add Russian tech def patterns to light, strengthen medium smart home - _LIGHT_PATTERNS: add что\s+такое, что\s+означает, сколько бит/байт, compound greetings (привет, как дела) — these fell through to embedding which sometimes misclassified short Russian phrases as medium - _MEDIUM_PATTERNS: add non-verb-first smart home patterns (свет/лампочка as subject, режим/сцена commands) for benchmark queries with different phrasing Fixes #8, #9 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 02:45:42 +00:00
alvis	186e16284b	Merge pull request 'Fix tier logging: capture actual_tier, fix parse_run_block regex, remove reply_text truncation' (#11 ) from fix/tier-logging into main Reviewed-on: #11	2026-03-24 02:44:35 +00:00
alvis	0b428e4ada	Merge pull request 'Fix benchmark log extraction: first tier match, increase log tail to 300' (#12 ) from fix/benchmark-log-extraction into main Reviewed-on: #12	2026-03-24 02:43:26 +00:00
alvis	98095679be	Fix benchmark log extraction: first tier match, increase log tail to 300 - Remove reversed() from extract_tier_from_logs: first match = routing decision (dry-run complex logs tier=complex early, then overwrites with tier=medium at done) - Increase log tail from 80→300 to handle concurrent log activity Fixes #7, #10 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-24 02:42:27 +00:00