8 Commits

Author SHA1 Message Date
887d4b8d90 voice benchmark: rename --dry-run → --no-inference, fix log extraction
- --no-inference applies to all tiers (not just complex)
- metadata key: dry_run → no_inference
- extract_tier_from_logs: forward iteration (not reversed), updated regex
- GPU check skipped when --no-inference
- Fix TypeError in misclassified print when actual=None

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:58:05 +00:00
4e6d3090c2 Remove benchmark.json from gitignore — dataset is now tracked
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:53:35 +00:00
5b09a99a7f Routing: 100% accuracy on realistic home assistant dataset
- router.py: skip light reply generation when no_inference=True;
  add control words (да/нет/стоп/отмена/повтори/подожди/etc.) to _LIGHT_PATTERNS
- agent.py: pass no_inference to router.route(); skip preflight IO in no_inference mode
- benchmarks/benchmark.json: replace definition-heavy queries with realistic
  Alexa/Google-Home style queries (greetings, smart home, timers, shopping,
  weather, personal memory, cooking) — 30 light / 60 medium / 30 complex

Routing benchmark: 120/120 (100%), all under 0.1s per query

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:53:01 +00:00
3fb90ae083 Skip _reply_semaphore in no_inference mode
No GPU inference happens in this mode, so serialization is not needed.
Without this, timed-out routing benchmark queries hold the semaphore
and cascade-block all subsequent queries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:40:07 +00:00
4d37ac65b2 Skip preflight IO (memory/URL/fast-tools) when no_inference=True
In no_inference mode only the routing decision matters — fetching
memories and URLs adds latency without affecting the classification.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:37:55 +00:00
b7d5896076 routing benchmark: 1s strict deadline per query
QUERY_TIMEOUT=1s — classification and routing must complete within
1 second or the query is recorded as 'timeout'.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 07:35:13 +00:00
fc53632c7b Merge pull request 'feat: rename dry_run to no_inference for all tiers' (#17) from worktree-agent-afc013ce into main
Reviewed-on: #17
2026-03-24 07:27:04 +00:00
9c2f27eed4 Rename dry_run → no_inference, extend to all tiers in agent.py
When no_inference=True, routing decision is captured but all LLM
inference is skipped — yields constant "I don't know" immediately.
Also disables fast-tool short-circuit so routing path always runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 03:43:42 +00:00
6 changed files with 193 additions and 49 deletions

1
.gitignore vendored
View File

@@ -2,7 +2,6 @@ __pycache__/
*.pyc *.pyc
logs/*.jsonl logs/*.jsonl
adolf_tuning_data/voice_audio/ adolf_tuning_data/voice_audio/
benchmarks/benchmark.json
benchmarks/results_latest.json benchmarks/results_latest.json
benchmarks/voice_results*.json benchmarks/voice_results*.json
benchmarks/voice_audio/ benchmarks/voice_audio/

View File

@@ -2,7 +2,7 @@ import asyncio
import json as _json_module import json as _json_module
import os import os
import time import time
from contextlib import asynccontextmanager from contextlib import asynccontextmanager, nullcontext
from pathlib import Path from pathlib import Path
from fastapi import FastAPI, BackgroundTasks, Request from fastapi import FastAPI, BackgroundTasks, Request
@@ -431,21 +431,25 @@ async def _run_agent_pipeline(
history: list[dict], history: list[dict],
session_id: str, session_id: str,
tier_override: str | None = None, tier_override: str | None = None,
dry_run: bool = False, no_inference: bool = False,
tier_capture: list | None = None, tier_capture: list | None = None,
) -> AsyncGenerator[str, None]: ) -> AsyncGenerator[str, None]:
"""Core pipeline: pre-flight → routing → inference. Yields text chunks. """Core pipeline: pre-flight → routing → inference. Yields text chunks.
tier_override: "light" | "medium" | "complex" | None (auto-route) tier_override: "light" | "medium" | "complex" | None (auto-route)
dry_run: if True and tier=complex, log tier=complex but use medium model (avoids API cost) no_inference: if True, routing decision is still made but inference is skipped — yields "I don't know" immediately
Caller is responsible for scheduling _store_memory after consuming all chunks. Caller is responsible for scheduling _store_memory after consuming all chunks.
""" """
async with _reply_semaphore: async with (nullcontext() if no_inference else _reply_semaphore):
t0 = time.monotonic() t0 = time.monotonic()
clean_message = message clean_message = message
print(f"[agent] running: {clean_message[:80]!r}", flush=True) print(f"[agent] running: {clean_message[:80]!r}", flush=True)
# Fetch URL content, memories, and fast-tool context concurrently # Fetch URL content, memories, and fast-tool context concurrently
# Skip preflight IO in no_inference mode — only routing decision needed
if no_inference:
url_context = memories = fast_context = None
else:
url_context, memories, fast_context = await asyncio.gather( url_context, memories, fast_context = await asyncio.gather(
_fetch_urls_from_message(clean_message), _fetch_urls_from_message(clean_message),
_retrieve_memories(clean_message, session_id), _retrieve_memories(clean_message, session_id),
@@ -471,7 +475,7 @@ async def _run_agent_pipeline(
try: try:
# Short-circuit: fast tool already has the answer # Short-circuit: fast tool already has the answer
if fast_context and tier_override is None and not url_context: if fast_context and tier_override is None and not url_context and not no_inference:
tier = "fast" tier = "fast"
final_text = fast_context final_text = fast_context
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
@@ -485,26 +489,23 @@ async def _run_agent_pipeline(
tier = tier_override tier = tier_override
light_reply = None light_reply = None
if tier_override == "light": if tier_override == "light":
tier, light_reply = await router.route(clean_message, enriched_history) tier, light_reply = await router.route(clean_message, enriched_history, no_inference=no_inference)
tier = "light" tier = "light"
else: else:
tier, light_reply = await router.route(clean_message, enriched_history) tier, light_reply = await router.route(clean_message, enriched_history, no_inference=no_inference)
if url_context and tier == "light": if url_context and tier == "light":
tier = "medium" tier = "medium"
light_reply = None light_reply = None
print("[agent] URL in message → upgraded light→medium", flush=True) print("[agent] URL in message → upgraded light→medium", flush=True)
# Dry-run: log as complex but infer with medium (no remote API call)
effective_tier = tier
if dry_run and tier == "complex":
effective_tier = "medium"
print(f"[agent] tier=complex (dry-run) → using medium model, message={clean_message[:60]!r}", flush=True)
else:
print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True) print(f"[agent] tier={tier} message={clean_message[:60]!r}", flush=True)
tier = effective_tier
if tier_capture is not None: if tier_capture is not None:
tier_capture.append(tier) tier_capture.append(tier)
if no_inference:
yield "I don't know"
return
if tier == "light": if tier == "light":
final_text = light_reply final_text = light_reply
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
@@ -594,7 +595,7 @@ async def run_agent_task(
t0 = time.monotonic() t0 = time.monotonic()
meta = metadata or {} meta = metadata or {}
dry_run = bool(meta.get("dry_run", False)) no_inference = bool(meta.get("no_inference", False))
is_benchmark = bool(meta.get("benchmark", False)) is_benchmark = bool(meta.get("benchmark", False))
history = _conversation_buffers.get(session_id, []) history = _conversation_buffers.get(session_id, [])
@@ -602,7 +603,7 @@ async def run_agent_task(
actual_tier = "unknown" actual_tier = "unknown"
tier_capture: list = [] tier_capture: list = []
async for chunk in _run_agent_pipeline(message, history, session_id, dry_run=dry_run, tier_capture=tier_capture): async for chunk in _run_agent_pipeline(message, history, session_id, no_inference=no_inference, tier_capture=tier_capture):
await _push_stream_chunk(session_id, chunk) await _push_stream_chunk(session_id, chunk)
if final_text is None: if final_text is None:
final_text = chunk final_text = chunk

137
benchmarks/benchmark.json Normal file
View File

@@ -0,0 +1,137 @@
{
"description": "Adolf routing benchmark — домашние сценарии, Alexa/Google-Home стиль, русский язык",
"tiers": {
"light": "Приветствия, прощания, подтверждения, простые разговорные фразы. Не требуют поиска или действий.",
"medium": "Управление домом, погода/пробки, таймеры, напоминания, покупки, личная память, быстрые вопросы.",
"complex": "Глубокое исследование, сравнение технологий, подробные руководства с несколькими источниками."
},
"queries": [
{"id": 1, "tier": "light", "category": "greetings", "query": "привет"},
{"id": 2, "tier": "light", "category": "greetings", "query": "пока"},
{"id": 3, "tier": "light", "category": "greetings", "query": "спасибо"},
{"id": 4, "tier": "light", "category": "greetings", "query": "привет, как дела?"},
{"id": 5, "tier": "light", "category": "greetings", "query": "окей"},
{"id": 6, "tier": "light", "category": "greetings", "query": "добрый вечер"},
{"id": 7, "tier": "light", "category": "greetings", "query": "доброе утро"},
{"id": 8, "tier": "light", "category": "greetings", "query": "добрый день"},
{"id": 9, "tier": "light", "category": "greetings", "query": "hi"},
{"id": 10, "tier": "light", "category": "greetings", "query": "thanks"},
{"id": 11, "tier": "light", "category": "greetings", "query": "отлично, спасибо"},
{"id": 12, "tier": "light", "category": "greetings", "query": "понятно"},
{"id": 13, "tier": "light", "category": "greetings", "query": "ясно"},
{"id": 14, "tier": "light", "category": "greetings", "query": "ладно"},
{"id": 15, "tier": "light", "category": "greetings", "query": "договорились"},
{"id": 16, "tier": "light", "category": "greetings", "query": "good morning"},
{"id": 17, "tier": "light", "category": "greetings", "query": "good night"},
{"id": 18, "tier": "light", "category": "greetings", "query": "всё понятно"},
{"id": 19, "tier": "light", "category": "greetings", "query": "да"},
{"id": 20, "tier": "light", "category": "greetings", "query": "нет"},
{"id": 21, "tier": "light", "category": "greetings", "query": "не нужно"},
{"id": 22, "tier": "light", "category": "greetings", "query": "отмена"},
{"id": 23, "tier": "light", "category": "greetings", "query": "стоп"},
{"id": 24, "tier": "light", "category": "greetings", "query": "подожди"},
{"id": 25, "tier": "light", "category": "greetings", "query": "повтори"},
{"id": 26, "tier": "light", "category": "greetings", "query": "ты тут?"},
{"id": 27, "tier": "light", "category": "greetings", "query": "слышишь меня?"},
{"id": 28, "tier": "light", "category": "greetings", "query": "всё ок"},
{"id": 29, "tier": "light", "category": "greetings", "query": "хорошо"},
{"id": 30, "tier": "light", "category": "greetings", "query": "пожалуйста"},
{"id": 31, "tier": "medium", "category": "weather_commute", "query": "какая сегодня погода в Балашихе"},
{"id": 32, "tier": "medium", "category": "weather_commute", "query": "пойдет ли сегодня дождь"},
{"id": 33, "tier": "medium", "category": "weather_commute", "query": "какая температура на улице сейчас"},
{"id": 34, "tier": "medium", "category": "weather_commute", "query": "будет ли снег сегодня"},
{"id": 35, "tier": "medium", "category": "weather_commute", "query": "погода на завтра"},
{"id": 36, "tier": "medium", "category": "weather_commute", "query": "сколько ехать до Москвы сейчас"},
{"id": 37, "tier": "medium", "category": "weather_commute", "query": "какие пробки на дороге до Москвы"},
{"id": 38, "tier": "medium", "category": "weather_commute", "query": "время в пути на работу"},
{"id": 39, "tier": "medium", "category": "weather_commute", "query": "есть ли пробки сейчас"},
{"id": 40, "tier": "medium", "category": "weather_commute", "query": "стоит ли брать зонтик"},
{"id": 41, "tier": "medium", "category": "smart_home_control", "query": "включи свет в гостиной"},
{"id": 42, "tier": "medium", "category": "smart_home_control", "query": "выключи свет на кухне"},
{"id": 43, "tier": "medium", "category": "smart_home_control", "query": "какая температура дома"},
{"id": 44, "tier": "medium", "category": "smart_home_control", "query": "установи температуру 22 градуса"},
{"id": 45, "tier": "medium", "category": "smart_home_control", "query": "включи свет в спальне на 50 процентов"},
{"id": 46, "tier": "medium", "category": "smart_home_control", "query": "выключи все лампочки"},
{"id": 47, "tier": "medium", "category": "smart_home_control", "query": "какие устройства сейчас включены"},
{"id": 48, "tier": "medium", "category": "smart_home_control", "query": "закрыты ли все окна"},
{"id": 49, "tier": "medium", "category": "smart_home_control", "query": "включи вентилятор в детской"},
{"id": 50, "tier": "medium", "category": "smart_home_control", "query": "есть ли кто-нибудь дома"},
{"id": 51, "tier": "medium", "category": "smart_home_control", "query": "включи ночной режим"},
{"id": 52, "tier": "medium", "category": "smart_home_control", "query": "какое потребление электричества сегодня"},
{"id": 53, "tier": "medium", "category": "smart_home_control", "query": "выключи телевизор"},
{"id": 54, "tier": "medium", "category": "smart_home_control", "query": "открой шторы в гостиной"},
{"id": 55, "tier": "medium", "category": "smart_home_control", "query": "установи будильник на 7 утра"},
{"id": 56, "tier": "medium", "category": "smart_home_control", "query": "включи кофемашину"},
{"id": 57, "tier": "medium", "category": "smart_home_control", "query": "выключи свет во всём доме"},
{"id": 58, "tier": "medium", "category": "smart_home_control", "query": "сколько у нас датчиков движения"},
{"id": 59, "tier": "medium", "category": "smart_home_control", "query": "состояние всех дверных замков"},
{"id": 60, "tier": "medium", "category": "smart_home_control", "query": "включи режим кино в гостиной"},
{"id": 61, "tier": "medium", "category": "smart_home_control", "query": "прибавь яркость в детской"},
{"id": 62, "tier": "medium", "category": "smart_home_control", "query": "закрой все шторы"},
{"id": 63, "tier": "medium", "category": "smart_home_control", "query": "кто последний открывал входную дверь"},
{"id": 64, "tier": "medium", "category": "smart_home_control", "query": "заблокируй входную дверь"},
{"id": 65, "tier": "medium", "category": "smart_home_control", "query": "покажи камеру у входа"},
{"id": 66, "tier": "medium", "category": "timers_reminders", "query": "поставь таймер на 10 минут"},
{"id": 67, "tier": "medium", "category": "timers_reminders", "query": "напомни мне позвонить врачу в 15:00"},
{"id": 68, "tier": "medium", "category": "timers_reminders", "query": "поставь будильник на завтра в 6:30"},
{"id": 69, "tier": "medium", "category": "timers_reminders", "query": "напомни выключить плиту через 20 минут"},
{"id": 70, "tier": "medium", "category": "timers_reminders", "query": "сколько времени осталось на таймере"},
{"id": 71, "tier": "medium", "category": "shopping_cooking", "query": "добавь молоко в список покупок"},
{"id": 72, "tier": "medium", "category": "shopping_cooking", "query": "что есть в списке покупок"},
{"id": 73, "tier": "medium", "category": "shopping_cooking", "query": "добавь хлеб и яйца в список покупок"},
{"id": 74, "tier": "medium", "category": "shopping_cooking", "query": "сколько граммов муки нужно для блинов на 4 человека"},
{"id": 75, "tier": "medium", "category": "shopping_cooking", "query": "какой рецепт борща ты знаешь"},
{"id": 76, "tier": "medium", "category": "personal_memory", "query": "как меня зовут"},
{"id": 77, "tier": "medium", "category": "personal_memory", "query": "где я живу"},
{"id": 78, "tier": "medium", "category": "personal_memory", "query": "что мы обсуждали в прошлый раз"},
{"id": 79, "tier": "medium", "category": "personal_memory", "query": "что ты знаешь о моем домашнем сервере"},
{"id": 80, "tier": "medium", "category": "personal_memory", "query": "напомни, какие сервисы я запускаю"},
{"id": 81, "tier": "medium", "category": "personal_memory", "query": "что я говорил о своей сети"},
{"id": 82, "tier": "medium", "category": "personal_memory", "query": "что я просил тебя запомнить"},
{"id": 83, "tier": "medium", "category": "quick_info", "query": "какой сейчас курс биткоина"},
{"id": 84, "tier": "medium", "category": "quick_info", "query": "курс доллара к рублю сейчас"},
{"id": 85, "tier": "medium", "category": "quick_info", "query": "есть ли проблемы у Cloudflare сегодня"},
{"id": 86, "tier": "medium", "category": "quick_info", "query": "какая последняя версия Docker"},
{"id": 87, "tier": "medium", "category": "quick_info", "query": "какие новые функции в Home Assistant 2024"},
{"id": 88, "tier": "medium", "category": "quick_info", "query": "как проверить использование диска в Linux"},
{"id": 89, "tier": "medium", "category": "quick_info", "query": "как перезапустить Docker контейнер"},
{"id": 90, "tier": "medium", "category": "quick_info", "query": "как посмотреть логи Docker контейнера"},
{"id": 91, "tier": "complex", "category": "infrastructure", "query": "исследуй и сравни Proxmox, Unraid и TrueNAS для домашней лаборатории"},
{"id": 92, "tier": "complex", "category": "infrastructure", "query": "напиши подробное руководство по безопасности домашнего сервера, подключенного к интернету"},
{"id": 93, "tier": "complex", "category": "infrastructure", "query": "исследуй все доступные дашборды для самохостинга и сравни их функции"},
{"id": 94, "tier": "complex", "category": "infrastructure", "query": "исследуй лучший стек мониторинга для самохостинга в 2024 году со всеми вариантами"},
{"id": 95, "tier": "complex", "category": "infrastructure", "query": "сравни все системы резервного копирования для Linux: Restic, Borg, Duplicati, Timeshift"},
{"id": 96, "tier": "complex", "category": "infrastructure", "query": "напиши полное руководство по настройке обратного прокси Caddy для домашнего сервера с SSL"},
{"id": 97, "tier": "complex", "category": "network", "query": "исследуй и сравни WireGuard, OpenVPN и Tailscale для домашней VPN с детальными плюсами и минусами"},
{"id": 98, "tier": "complex", "category": "network", "query": "исследуй лучшие практики сегментации домашней сети с VLAN и правилами файрвола"},
{"id": 99, "tier": "complex", "category": "network", "query": "изучи все самохостируемые DNS решения и их возможности"},
{"id": 100, "tier": "complex", "category": "network", "query": "исследуй лучшие самохостируемые системы мониторинга сети: Zabbix, Grafana, Prometheus, Netdata"},
{"id": 101, "tier": "complex", "category": "home_assistant", "query": "исследуй и сравни все платформы умного дома: Home Assistant, OpenHAB и Domoticz"},
{"id": 102, "tier": "complex", "category": "home_assistant", "query": "изучи лучшие Zigbee координаторы и их совместимость с Home Assistant в 2024 году"},
{"id": 103, "tier": "complex", "category": "home_assistant", "query": "напиши детальный отчет о поддержке протокола Matter и совместимых устройствах"},
{"id": 104, "tier": "complex", "category": "home_assistant", "query": "исследуй все способы интеграции умных ламп с Home Assistant: Zigbee, WiFi, Bluetooth"},
{"id": 105, "tier": "complex", "category": "home_assistant", "query": "найди и сравни все варианты датчиков движения для умного дома с оценками и ценами"},
{"id": 106, "tier": "complex", "category": "home_assistant", "query": "напиши подробное руководство по настройке автоматизаций в Home Assistant для умного освещения"},
{"id": 107, "tier": "complex", "category": "home_assistant", "query": "исследуй все варианты голосового управления умным домом на русском языке, включая локальные решения"},
{"id": 108, "tier": "complex", "category": "home_assistant", "query": "исследуй все протоколы умного дома и их плюсы и минусы: Zigbee, Z-Wave, WiFi, Thread, Bluetooth"},
{"id": 109, "tier": "complex", "category": "media_files", "query": "исследуй и сравни все самохостируемые решения для хранения фотографий с детальным сравнением функций"},
{"id": 110, "tier": "complex", "category": "media_files", "query": "изучи лучшие самохостируемые медиасерверы: Jellyfin, Plex и Emby — с характеристиками и отзывами"},
{"id": 111, "tier": "complex", "category": "media_files", "query": "сравни все самохостируемые облачные хранилища: Nextcloud, Seafile, Owncloud — производительность и функции"},
{"id": 112, "tier": "complex", "category": "research", "query": "исследуй последние достижения в локальном LLM инференсе и оборудовании для него"},
{"id": 113, "tier": "complex", "category": "research", "query": "изучи лучшие опенсорс альтернативы Google сервисов для приватного домашнего окружения"},
{"id": 114, "tier": "complex", "category": "research", "query": "изучи все варианты локального запуска языковых моделей на видеокарте 8 ГБ VRAM"},
{"id": 115, "tier": "complex", "category": "research", "query": "найди и сравни все фреймворки для создания локальных AI ассистентов с открытым исходным кодом"},
{"id": 116, "tier": "complex", "category": "research", "query": "изучи все доступные локальные ассистенты с голосовым управлением на русском языке"},
{"id": 117, "tier": "complex", "category": "infrastructure", "query": "изучи свежие CVE и уязвимости в популярном самохостируемом ПО: Gitea, Nextcloud, Jellyfin"},
{"id": 118, "tier": "complex", "category": "infrastructure", "query": "напиши детальное сравнение систем управления конфигурацией: Ansible, Salt, Puppet для домашнего окружения"},
{"id": 119, "tier": "complex", "category": "network", "query": "исследуй все самохостируемые решения для блокировки рекламы: Pi-hole, AdGuard Home, NextDNS"},
{"id": 120, "tier": "complex", "category": "research", "query": "напиши подробный отчет о технологиях синтеза речи с открытым исходным кодом на русском языке"}
]
}

View File

@@ -30,7 +30,7 @@ import httpx
ADOLF_URL = "http://localhost:8000" ADOLF_URL = "http://localhost:8000"
DATASET = Path(__file__).parent / "benchmark.json" DATASET = Path(__file__).parent / "benchmark.json"
RESULTS = Path(__file__).parent / "routing_results_latest.json" RESULTS = Path(__file__).parent / "routing_results_latest.json"
QUERY_TIMEOUT = 30 # seconds — routing is fast, no LLM wait QUERY_TIMEOUT = 1 # 1s strict deadline — routing must decide within 1 second
# ── Log helpers ──────────────────────────────────────────────────────────────── # ── Log helpers ────────────────────────────────────────────────────────────────
@@ -139,9 +139,10 @@ async def run(queries: list[dict]) -> list[dict]:
except Exception: except Exception:
pass # timeout or connection issue — check logs anyway pass # timeout or connection issue — check logs anyway
await asyncio.sleep(0.3)
logs_after = get_log_tail(300) logs_after = get_log_tail(300)
actual = extract_tier_from_logs(logs_before, logs_after) actual = extract_tier_from_logs(logs_before, logs_after)
if actual is None:
actual = "timeout"
elapsed = time.monotonic() - t0 elapsed = time.monotonic() - t0
match = actual == expected or (actual == "fast" and expected == "medium") match = actual == expected or (actual == "fast" and expected == "medium")
@@ -149,7 +150,7 @@ async def run(queries: list[dict]) -> list[dict]:
correct += 1 correct += 1
mark = "✓" if match else "✗" mark = "✓" if match else "✗"
actual_str = actual or "?" actual_str = actual
print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}") print(f"{actual_str:8} {mark:3} {elapsed:5.1f}s {category:22} {query_text[:40]}")
results.append({ results.append({

View File

@@ -12,7 +12,7 @@ Usage:
python3 run_voice_benchmark.py [options] python3 run_voice_benchmark.py [options]
python3 run_voice_benchmark.py --tier light|medium|complex python3 run_voice_benchmark.py --tier light|medium|complex
python3 run_voice_benchmark.py --ids 1,2,3 python3 run_voice_benchmark.py --ids 1,2,3
python3 run_voice_benchmark.py --dry-run # complex queries use medium model python3 run_voice_benchmark.py --no-inference # skip LLM inference — routing only, all tiers
IMPORTANT: Always check GPU is free before running. Done automatically. IMPORTANT: Always check GPU is free before running. Done automatically.
@@ -210,9 +210,9 @@ def get_log_tail(n: int = 60) -> str:
def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None: def extract_tier_from_logs(logs_before: str, logs_after: str) -> str | None:
before_lines = set(logs_before.splitlines()) before_lines = set(logs_before.splitlines())
new_lines = [l for l in logs_after.splitlines() if l not in before_lines] new_lines = [line for line in logs_after.splitlines() if line not in before_lines]
for line in reversed(new_lines): for line in new_lines:
m = re.search(r"tier=(\w+(?:\s*\(dry-run\))?)", line) m = re.search(r"tier=(\w+(?:\s*\(no-inference\))?)", line)
if m: if m:
return m.group(1).split()[0] return m.group(1).split()[0]
return None return None
@@ -222,14 +222,14 @@ async def post_to_adolf(
client: httpx.AsyncClient, client: httpx.AsyncClient,
query_id: int, query_id: int,
text: str, text: str,
dry_run: bool = False, no_inference: bool = False,
) -> bool: ) -> bool:
payload = { payload = {
"text": text, "text": text,
"session_id": f"voice-bench-{query_id}", "session_id": f"voice-bench-{query_id}",
"channel": "cli", "channel": "cli",
"user_id": "benchmark", "user_id": "benchmark",
"metadata": {"dry_run": dry_run, "benchmark": True, "voice": True}, "metadata": {"no_inference": no_inference, "benchmark": True, "voice": True},
} }
try: try:
r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10) r = await client.post(f"{ADOLF_URL}/message", json=payload, timeout=10)
@@ -259,7 +259,7 @@ def filter_queries(queries, tier, category, ids):
# ── Main run ─────────────────────────────────────────────────────────────────── # ── Main run ───────────────────────────────────────────────────────────────────
async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = False) -> None: async def run(queries: list[dict], no_inference: bool = False, save_audio: bool = False) -> None:
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
# Check Adolf # Check Adolf
try: try:
@@ -272,7 +272,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
total = len(queries) total = len(queries)
results = [] results = []
dry_label = " [DRY-RUN]" if dry_run else "" dry_label = " [NO-INFERENCE: routing only]" if no_inference else ""
print(f"Voice benchmark: {total} queries{dry_label}\n") print(f"Voice benchmark: {total} queries{dry_label}\n")
print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}") print(f"{'ID':>3} {'EXP':8} {'ACT':8} {'OK':3} {'WER':5} {'TRANSCRIPT'}")
print("─" * 100) print("─" * 100)
@@ -312,11 +312,10 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
wer_count += 1 wer_count += 1
# Step 3: Send to Adolf # Step 3: Send to Adolf
send_dry = dry_run and expected == "complex"
logs_before = get_log_tail(60) logs_before = get_log_tail(60)
t0 = time.monotonic() t0 = time.monotonic()
ok_post = await post_to_adolf(client, qid, transcript, dry_run=send_dry) ok_post = await post_to_adolf(client, qid, transcript, no_inference=no_inference)
if not ok_post: if not ok_post:
print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}") print(f"{'?':8} {'ERR':3} {wer:4.2f} {transcript[:50]}")
results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript}) results.append({"id": qid, "expected": expected, "actual": None, "ok": False, "wer": wer, "transcript": transcript})
@@ -349,7 +348,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
"original": original, "original": original,
"transcript": transcript, "transcript": transcript,
"elapsed": round(elapsed, 1), "elapsed": round(elapsed, 1),
"dry_run": send_dry, "no_inference": no_inference,
}) })
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
@@ -374,7 +373,7 @@ async def run(queries: list[dict], dry_run: bool = False, save_audio: bool = Fal
if wrong: if wrong:
print(f"\nMisclassified after voice ({len(wrong)}):") print(f"\nMisclassified after voice ({len(wrong)}):")
for r in wrong: for r in wrong:
print(f" id={r['id']:3} expected={r.get('expected','?'):8} actual={r.get('actual','?'):8} transcript={r.get('transcript','')[:50]}") print(f" id={r['id']:3} expected={r.get('expected') or '?':8} actual={r.get('actual') or '?':8} transcript={r.get('transcript','')[:50]}")
high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3] high_wer = [r for r in results if r.get("wer") and r["wer"] > 0.3]
if high_wer: if high_wer:
@@ -402,14 +401,14 @@ def main():
parser.add_argument("--tier", choices=["light", "medium", "complex"]) parser.add_argument("--tier", choices=["light", "medium", "complex"])
parser.add_argument("--category") parser.add_argument("--category")
parser.add_argument("--ids", help="Comma-separated IDs") parser.add_argument("--ids", help="Comma-separated IDs")
parser.add_argument("--dry-run", action="store_true", parser.add_argument("--no-inference", action="store_true",
help="Complex queries use medium model for inference (no API cost)") help="Skip LLM inference for all tiers — routing decisions only (no GPU/API cost)")
parser.add_argument("--save-audio", action="store_true", parser.add_argument("--save-audio", action="store_true",
help="Save synthesized WAV files to voice_audio/ directory") help="Save synthesized WAV files to voice_audio/ directory")
parser.add_argument("--skip-gpu-check", action="store_true") parser.add_argument("--skip-gpu-check", action="store_true")
args = parser.parse_args() args = parser.parse_args()
if not preflight_checks(skip_gpu_check=args.skip_gpu_check): if not preflight_checks(skip_gpu_check=args.skip_gpu_check or args.no_inference):
sys.exit(1) sys.exit(1)
queries = load_dataset() queries = load_dataset()
@@ -419,7 +418,7 @@ def main():
print("No queries match filters.") print("No queries match filters.")
sys.exit(1) sys.exit(1)
asyncio.run(run(queries, dry_run=args.dry_run, save_audio=args.save_audio)) asyncio.run(run(queries, no_inference=args.no_inference, save_audio=args.save_audio))
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -52,6 +52,10 @@ _LIGHT_PATTERNS = re.compile(
r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю" r"|окей|хорошо|отлично|понятно|ок|ладно|договорились|спс|благодарю"
r"|пожалуйста|не за что|всё понятно|ясно" r"|пожалуйста|не за что|всё понятно|ясно"
r"|как дела|как ты|как жизнь|всё хорошо|всё ок" r"|как дела|как ты|как жизнь|всё хорошо|всё ок"
# Assistant control words / confirmations
r"|да|нет|стоп|отмена|отменить|подожди|повтори|повторить|не нужно|не надо"
r"|слышишь\s+меня|ты\s+тут|отлично[,!]?\s+спасибо"
r"|yes|no|stop|cancel|wait|repeat"
# Russian tech definitions — static knowledge (no tools needed) # Russian tech definitions — static knowledge (no tools needed)
r"|что\s+такое\s+\S+" r"|что\s+такое\s+\S+"
r"|что\s+означает\s+\S+" r"|что\s+означает\s+\S+"
@@ -422,10 +426,11 @@ class Router:
self, self,
message: str, message: str,
history: list[dict], history: list[dict],
no_inference: bool = False,
) -> tuple[str, Optional[str]]: ) -> tuple[str, Optional[str]]:
""" """
Returns (tier, reply_or_None). Returns (tier, reply_or_None).
For light tier: also generates the reply inline. For light tier: also generates the reply inline (unless no_inference=True).
For medium/complex: reply is None. For medium/complex: reply is None.
""" """
if self._fast_tool_runner and self._fast_tool_runner.any_matches(message.strip()): if self._fast_tool_runner and self._fast_tool_runner.any_matches(message.strip()):
@@ -435,6 +440,8 @@ class Router:
if _LIGHT_PATTERNS.match(message.strip()): if _LIGHT_PATTERNS.match(message.strip()):
print("[router] regex→light", flush=True) print("[router] regex→light", flush=True)
if no_inference:
return "light", None
return await self._generate_light_reply(message, history) return await self._generate_light_reply(message, history)
if _COMPLEX_PATTERNS.search(message.strip()): if _COMPLEX_PATTERNS.search(message.strip()):
@@ -447,7 +454,7 @@ class Router:
tier = await self._classify_by_embedding(message) tier = await self._classify_by_embedding(message)
if tier != "light": if tier != "light" or no_inference:
return tier, None return tier, None
return await self._generate_light_reply(message, history) return await self._generate_light_reply(message, history)