Files
adolf/tests/integration/common.py
alvis abf792a2ec Remove Bifrost: replace test 4 with LiteLLM health check
- Remove BIFROST constant and fetch_bifrost_logs() from common.py
- Add LITELLM constant (localhost:4000)
- Replace test_memory.py test 4 (Bifrost pass-through) with LiteLLM health check

Fixes #5

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 02:46:01 +00:00

261 lines
9.6 KiB
Python

"""
Shared config, helpers, and utilities for Adolf integration tests.
"""
import http.client
import json
import re
import subprocess
import time
import urllib.request
# ── config ────────────────────────────────────────────────────────────────────
# Base endpoints for the locally-running services under test.
DEEPAGENTS = "http://localhost:8000"
LITELLM = "http://localhost:4000"     # LiteLLM proxy (replaced Bifrost per commit msg)
OPENMEMORY = "http://localhost:8765"
GRAMMY_HOST = "localhost"
GRAMMY_PORT = 3001
OLLAMA_GPU = "http://localhost:11436"
OLLAMA_CPU = "http://localhost:11435"
QDRANT = "http://localhost:6333"      # vector store queried by qdrant_count()
SEARXNG = "http://localhost:11437"
COMPOSE_FILE = "/home/alvis/adolf/docker-compose.yml"  # used by fetch_logs()
DEFAULT_CHAT_ID = "346967270"
# NOTE(review): usage not visible in this file — presumably distinctive first
# names used as memorable fixture data by the test modules; confirm at call sites.
NAMES = [
    "Maximilian", "Cornelius", "Zephyr", "Archibald", "Balthazar",
    "Ignatius", "Lysander", "Octavian", "Reginald", "Sylvester",
]
# Benchmark prompts grouped by expected difficulty tier.  Judging by content:
# "easy" needs no tools, "medium" implies web search / memory recall, and every
# "hard" prompt is prefixed with "/think" (presumably a reasoning-mode trigger —
# confirm against the agent's command handling).
BENCHMARK = {
    "easy": [
        "hi",
        "what is 2+2?",
        "what is the capital of France?",
        "tell me a short joke",
        "how are you doing today?",
        "thanks!",
        "what day comes after Wednesday?",
        "name the three primary colors",
        "is the sky blue?",
        "what does CPU stand for?",
    ],
    "medium": [
        "what is the current weather in Berlin?",
        "find the latest news about artificial intelligence",
        "what is the current price of Bitcoin?",
        "search for a good pasta carbonara recipe",
        "what movies are in theaters this week?",
        "find Python tutorials for beginners",
        "who won the last FIFA World Cup?",
        "do you remember what we talked about before?",
        "search for the best coffee shops in Tokyo",
        "what is happening in the tech industry this week?",
        "what's the weather like today?",
    ],
    "hard": [
        "/think compare the top 3 Python web frameworks (Django, FastAPI, Flask) and recommend one for a production REST API",
        "/think research the history of artificial intelligence and create a timeline of key milestones",
        "/think plan a 7-day trip to Japan with daily itinerary, accommodation suggestions, and budget breakdown",
        "/think analyze microservices vs monolithic architecture: pros, cons, and when to choose each",
        "/think write a Python script that reads a CSV file, cleans the data, and generates summary statistics",
        "/think research quantum computing: explain the key concepts and how it differs from classical computing",
        "/think compare PostgreSQL, MongoDB, and Redis — when to use each and what are the trade-offs?",
        "/think create a comprehensive Docker deployment guide covering best practices for production",
        "/think research climate change: summarize the latest IPCC findings and key data points",
        "/think design a REST API with authentication, rate limiting, and proper error handling — provide architecture and code outline",
    ],
}
# ── terminal colours ──────────────────────────────────────────────────────────
# ANSI-coloured status tags (reset with \033[0m after each).
PASS = "\033[32mPASS\033[0m"  # green
FAIL = "\033[31mFAIL\033[0m"  # red
INFO = "\033[36mINFO\033[0m"  # cyan
WARN = "\033[33mWARN\033[0m"  # yellow
# ── result helpers ────────────────────────────────────────────────────────────
def report(results: list, name: str, ok: bool, detail: str = ""):
    """Print a coloured PASS/FAIL line for one check and append (name, ok) to *results*."""
    status = PASS if ok else FAIL
    # detail (if any) is appended verbatim — callers supply their own separator.
    print(f" [{status}] {name}{detail}")
    results.append((name, ok))
def print_summary(results: list):
    """
    Print a summary footer for a list of (name, ok) result tuples: a horizontal
    rule, the pass/fail counts, and the names of any failed checks.
    """
    # BUG FIX: the original printed f"\n{''*55}" — the EMPTY string repeated 55
    # times, i.e. no rule at all.  The rule character was evidently lost in
    # transit; restored using the box-drawing dash this file's section headers use.
    print(f"\n{'─' * 55}")
    total = len(results)
    passed = sum(1 for _, ok in results if ok)
    failed = total - passed
    print(f"Results: {passed}/{total} passed", end="")
    if failed:
        print(f" ({failed} failed)\n")
        print("Failed checks:")
        for name, ok in results:
            if not ok:
                print(f" - {name}")
    else:
        print(" — all good")
    print()
def tf(v):
    """Format a timing value in seconds; fixed-width placeholder when v is None."""
    if v is None:
        return " n/a"
    return f"{v:6.2f}s"
# ── HTTP helpers ──────────────────────────────────────────────────────────────
def get(url, timeout=5):
    """HTTP GET *url*; return (status_code, decoded response body)."""
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.status, resp.read().decode()
def post_json(url, payload, timeout=10):
    """POST *payload* as a JSON body to *url*; return (status_code, parsed JSON reply)."""
    body = json.dumps(payload).encode()
    headers = {"Content-Type": "application/json"}
    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.status, json.loads(resp.read().decode())
def check_sse(host, port, path):
    """
    Probe an SSE endpoint with a GET request.

    Returns (ok, detail): ok is True iff the server answered HTTP 200;
    detail is "HTTP <status>" or the exception text on failure.
    """
    try:
        conn = http.client.HTTPConnection(host, port, timeout=5)
        conn.request("GET", path, headers={"Accept": "text/event-stream"})
        resp = conn.getresponse()
        conn.close()
    except Exception as exc:
        return False, str(exc)
    return resp.status == 200, f"HTTP {resp.status}"
def qdrant_count():
    """Best-effort point count of the adolf_memories Qdrant collection (0 on any failure)."""
    try:
        _, payload = get(f"{QDRANT}/collections/adolf_memories")
        result = json.loads(payload).get("result", {})
        return result.get("points_count", 0)
    except Exception:
        # deliberate best-effort: any network/parse failure reads as "0 points"
        return 0
# ── log helpers ───────────────────────────────────────────────────────────────
def fetch_logs(since_s=600):
    """Return deepagents container log lines from the last *since_s* seconds ([] on any failure)."""
    cmd = [
        "docker", "compose", "-f", COMPOSE_FILE, "logs", "deepagents",
        f"--since={int(since_s)}s", "--no-log-prefix",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
    except Exception:
        # best-effort: docker missing / timeout / etc. reads as "no logs"
        return []
    return proc.stdout.splitlines()
def parse_run_block(lines, msg_prefix):
    """
    Scan log lines for the LAST '[agent] running: <msg_prefix>' block and
    extract reply timing, tier, and memory timing from that block.

    Args:
        lines: raw log lines (e.g. from fetch_logs()).
        msg_prefix: start of the user message; only its first 50 chars are matched.

    Returns:
        None if no matching run block exists, or the reply has not appeared yet.
        Otherwise a dict with keys:
            reply_total, llm, send — seconds, from "[agent] replied in ..."
            tier                   — "tier=<name>" on the reply line ("unknown" if absent)
            reply_text             — last "AIMessage:" text before the reply line;
                                     overridden by an "[agent] reply_text:" line if
                                     one follows within the next two lines
            memory_s               — seconds from "[memory] stored in ...", or None
            memory_error           — True if "[memory] error" is seen first
    """
    search = msg_prefix[:50]
    start_idx = None
    for i, line in enumerate(lines):
        if "[agent] running:" in line and search in line:
            start_idx = i  # keep updating — we want the LAST occurrence
    if start_idx is None:
        return None
    block = lines[start_idx:]
    last_ai_text = None
    reply_data = None
    for j, line in enumerate(block):
        # BUG FIX: the original condition was
        #   "AIMessage:" in line and "" not in line
        # but `"" not in line` is ALWAYS False (the empty string is a substring
        # of every string), so this branch never ran and reply_text stayed None
        # unless an "[agent] reply_text:" line happened to follow.  A filter
        # marker was presumably lost from the second operand; the dead clause
        # is removed so AIMessage text is captured again.
        if "AIMessage:" in line:
            txt = line.split("AIMessage:", 1)[-1].strip()
            if txt:
                last_ai_text = txt
        m = re.search(r"replied in ([\d.]+)s \(llm=([\d.]+)s, send=([\d.]+)s\)", line)
        if m:
            tier_m = re.search(r"\btier=(\w+)", line)
            tier = tier_m.group(1) if tier_m else "unknown"
            reply_data = {
                "reply_total": float(m.group(1)),
                "llm": float(m.group(2)),
                "send": float(m.group(3)),
                "tier": tier,
                "reply_text": last_ai_text,
                "memory_s": None,
                "memory_error": False,
                "_j": j,  # index of the reply line within `block`
            }
            break
    if reply_data is not None:
        # An explicit "[agent] reply_text:" line just after the reply line
        # overrides the AIMessage-derived text.
        next_lines = block[reply_data["_j"] + 1: reply_data["_j"] + 3]
        for line in next_lines:
            if line.startswith("[agent] reply_text:"):
                reply_data["reply_text"] = line[len("[agent] reply_text:"):].strip()
                break
    if reply_data is None:
        return None
    # Scan forward from the reply line for the first memory outcome.
    for line in block[reply_data["_j"] + 1:]:
        mm = re.search(r"\[memory\] stored in ([\d.]+)s", line)
        if mm:
            reply_data["memory_s"] = float(mm.group(1))
            break
        if "[memory] error" in line:
            reply_data["memory_error"] = True
            break
    return reply_data
def wait_for(label, msg_prefix, timeout_s=200, need_memory=True):
    """
    Poll deepagents logs until the message is fully processed.
    Shows a live progress line. Returns timing dict or None on timeout.

    Args:
        label: short tag shown in the progress line.
        msg_prefix: passed through to parse_run_block() to find the run block.
        timeout_s: overall deadline in seconds.
        need_memory: if True, also wait until a memory store time or a
            memory error has been observed for this run.

    Returns:
        The dict from parse_run_block(), or None if the deadline passes.
    """
    t_start = time.monotonic()
    deadline = t_start + timeout_s
    tick = 0
    last_result = None
    while time.monotonic() < deadline:
        # Fetch slightly more history than the elapsed wait (90s slack) so the
        # "[agent] running:" line is still within the log window.
        since = int(time.monotonic() - t_start) + 90
        lines = fetch_logs(since_s=since)
        result = parse_run_block(lines, msg_prefix)
        if result:
            last_result = result
            # Memory phase is done once either a stored-time or an error appears.
            has_mem = result["memory_s"] is not None or result["memory_error"]
            if (not need_memory) or has_mem:
                elapsed = time.monotonic() - t_start
                # Trailing spaces blank out the previous \r progress line.
                print(f"\r [{label}] done after {elapsed:.0f}s{' ' * 30}")
                return result
        time.sleep(4)  # poll interval; tick*4 below approximates elapsed seconds
        tick += 1
        rem = int(deadline - time.monotonic())
        if last_result:
            phase = "waiting for memory..." if need_memory else "done"
        else:
            phase = "waiting for LLM reply..."
        print(f"\r [{label}] {tick*4}s elapsed, {rem}s left — {phase} ", end="", flush=True)
    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None