Add Rich token streaming: server SSE + CLI live display + CLI container
Server (agent.py):
- _stream_queues: per-session asyncio.Queue for token chunks
- _push_stream_chunk() / _end_stream() helpers
- Medium tier: astream() with <think> block filtering — real token streaming
- Light tier: full reply pushed as single chunk then [DONE]
- Complex tier: full reply pushed after agent completes then [DONE]
- GET /stream/{session_id} SSE endpoint (data: <chunk>\n\n, data: [DONE]\n\n)
- medium_model promoted to module-level global for astream() access
CLI (cli.py):
- stream_reply(): reads /stream/ SSE, renders tokens live with Rich Live (transient)
- Final reply rendered as Markdown after stream completes
- os.getlogin() replaced with os.getenv("USER") for container compatibility
Dockerfile.cli + docker-compose cli service (profiles: tools):
- Run: docker compose --profile tools run --rm -it cli
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
9
Dockerfile.cli
Normal file
9
Dockerfile.cli
Normal file
@@ -0,0 +1,9 @@
|
||||
# Minimal image for the interactive Adolf CLI client.
# Run with: docker compose --profile tools run --rm -it cli
FROM python:3.12-slim

WORKDIR /app

# Rich powers the live token-streaming display and Markdown rendering.
RUN pip install --no-cache-dir rich

COPY cli.py .

CMD ["python3", "cli.py"]
|
||||
91
agent.py
91
agent.py
@@ -41,6 +41,19 @@ CRAWL4AI_URL = os.getenv("CRAWL4AI_URL", "http://crawl4ai:11235")
|
||||
MAX_HISTORY_TURNS = 5
|
||||
_conversation_buffers: dict[str, list] = {}
|
||||
|
||||
# Per-session streaming queues — filled during inference, read by /stream/{session_id}
|
||||
_stream_queues: dict[str, asyncio.Queue] = {}
|
||||
|
||||
|
||||
async def _push_stream_chunk(session_id: str, chunk: str) -> None:
    """Enqueue one token chunk onto the session's stream queue.

    The queue is created lazily, so the producer never has to care
    whether an SSE consumer has attached to this session yet.
    """
    queue = _stream_queues.get(session_id)
    if queue is None:
        queue = _stream_queues.setdefault(session_id, asyncio.Queue())
    await queue.put(chunk)
|
||||
|
||||
|
||||
async def _end_stream(session_id: str) -> None:
    """Mark the session's stream as finished.

    Pushes the literal "[DONE]" sentinel that the SSE endpoint and the
    CLI both recognise as end-of-stream.
    """
    await _stream_queues.setdefault(session_id, asyncio.Queue()).put("[DONE]")
|
||||
|
||||
|
||||
async def _crawl4ai_fetch_async(url: str) -> str:
|
||||
"""Async fetch via Crawl4AI — JS-rendered, bot-bypass, returns clean markdown."""
|
||||
@@ -95,6 +108,7 @@ COMPLEX_SYSTEM_PROMPT = (
|
||||
"NEVER invent URLs. End with: **Sources checked: N**"
|
||||
)
|
||||
|
||||
medium_model = None
|
||||
medium_agent = None
|
||||
complex_agent = None
|
||||
router: Router = None
|
||||
@@ -109,7 +123,7 @@ _reply_semaphore = asyncio.Semaphore(1)
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
global medium_agent, complex_agent, router, vram_manager, mcp_client, \
|
||||
global medium_model, medium_agent, complex_agent, router, vram_manager, mcp_client, \
|
||||
_memory_add_tool, _memory_search_tool
|
||||
|
||||
# Register channel adapters
|
||||
@@ -263,6 +277,7 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
yield
|
||||
|
||||
medium_model = None
|
||||
medium_agent = None
|
||||
complex_agent = None
|
||||
router = None
|
||||
@@ -394,6 +409,8 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
final_text = light_reply
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
print(f"[agent] light path: answered by router", flush=True)
|
||||
await _push_stream_chunk(session_id, final_text)
|
||||
await _end_stream(session_id)
|
||||
|
||||
elif tier == "medium":
|
||||
system_prompt = MEDIUM_SYSTEM_PROMPT
|
||||
@@ -401,16 +418,39 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
system_prompt = system_prompt + "\n\n" + memories
|
||||
if url_context:
|
||||
system_prompt = system_prompt + "\n\n" + url_context
|
||||
result = await medium_agent.ainvoke({
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
*history,
|
||||
{"role": "user", "content": clean_message},
|
||||
]
|
||||
})
|
||||
|
||||
# Stream tokens directly — filter out qwen3 <think> blocks
|
||||
in_think = False
|
||||
response_parts = []
|
||||
async for chunk in medium_model.astream([
|
||||
{"role": "system", "content": system_prompt},
|
||||
*history,
|
||||
{"role": "user", "content": clean_message},
|
||||
]):
|
||||
token = chunk.content or ""
|
||||
if not token:
|
||||
continue
|
||||
if in_think:
|
||||
if "</think>" in token:
|
||||
in_think = False
|
||||
after = token.split("</think>", 1)[1]
|
||||
if after:
|
||||
await _push_stream_chunk(session_id, after)
|
||||
response_parts.append(after)
|
||||
else:
|
||||
if "<think>" in token:
|
||||
in_think = True
|
||||
before = token.split("<think>", 1)[0]
|
||||
if before:
|
||||
await _push_stream_chunk(session_id, before)
|
||||
response_parts.append(before)
|
||||
else:
|
||||
await _push_stream_chunk(session_id, token)
|
||||
response_parts.append(token)
|
||||
|
||||
await _end_stream(session_id)
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
_log_messages(result)
|
||||
final_text = _extract_final_text(result)
|
||||
final_text = "".join(response_parts).strip() or None
|
||||
|
||||
else: # complex
|
||||
ok = await vram_manager.enter_complex_mode()
|
||||
@@ -432,7 +472,6 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
else:
|
||||
system_prompt = COMPLEX_SYSTEM_PROMPT.format(user_id=session_id)
|
||||
if url_context:
|
||||
# Inject pre-fetched content — complex agent can still re-fetch or follow links
|
||||
system_prompt = system_prompt + "\n\n[Pre-fetched URL content from user's message:]\n" + url_context
|
||||
result = await complex_agent.ainvoke({
|
||||
"messages": [
|
||||
@@ -446,12 +485,16 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
_log_messages(result)
|
||||
final_text = _extract_final_text(result)
|
||||
if final_text:
|
||||
await _push_stream_chunk(session_id, final_text)
|
||||
await _end_stream(session_id)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
print(f"[agent] error after {llm_elapsed:.1f}s for chat {session_id}: {e}", flush=True)
|
||||
traceback.print_exc()
|
||||
await _end_stream(session_id)
|
||||
|
||||
# Deliver reply through the originating channel
|
||||
if final_text:
|
||||
@@ -521,6 +564,32 @@ async def reply_stream(session_id: str):
|
||||
return StreamingResponse(event_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.get("/stream/{session_id}")
async def stream_reply(session_id: str):
    """
    SSE endpoint — streams reply tokens as they are generated.
    Each chunk: data: <token>\\n\\n
    Signals completion: data: [DONE]\\n\\n

    Medium tier: real token-by-token streaming (think blocks filtered out).
    Light and complex tiers: full reply delivered as one chunk then [DONE].
    """
    q = _stream_queues.setdefault(session_id, asyncio.Queue())

    async def event_generator():
        try:
            while True:
                # 900s cap guards against a producer that crashed without
                # ever pushing the [DONE] sentinel.
                chunk = await asyncio.wait_for(q.get(), timeout=900)
                # A raw newline would break SSE framing (blank line ends an
                # event), so newlines are escaped; the CLI reverses this.
                # NOTE(review): a content chunk that is exactly "[DONE]"
                # would be mistaken for the sentinel — acceptable here.
                escaped = chunk.replace("\n", "\\n").replace("\r", "")
                yield f"data: {escaped}\n\n"
                if chunk == "[DONE]":
                    break
        except asyncio.TimeoutError:
            yield "data: [DONE]\n\n"
        finally:
            # Fix: drop the per-session queue once the stream is over
            # (done, timed out, or client disconnected). Without this,
            # _stream_queues grows without bound across sessions.
            _stream_queues.pop(session_id, None)

    return StreamingResponse(event_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: reports whether the medium-tier agent is loaded."""
    ready = medium_agent is not None
    return {"status": "ok", "agent_ready": ready}
|
||||
|
||||
61
cli.py
61
cli.py
@@ -1,9 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Adolf CLI — interactive REPL for the multi-channel gateway.
|
||||
Adolf CLI — interactive REPL with Rich streaming display.
|
||||
|
||||
Usage:
|
||||
python3 cli.py [--url http://localhost:8000] [--session cli-alvis]
|
||||
python3 cli.py [--url http://deepagents:8000] [--session cli-alvis]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -12,7 +12,13 @@ import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
GATEWAY = "http://localhost:8000"
|
||||
from rich.console import Console
|
||||
from rich.live import Live
|
||||
from rich.markdown import Markdown
|
||||
from rich.text import Text
|
||||
|
||||
GATEWAY = "http://deepagents:8000"
|
||||
console = Console()
|
||||
|
||||
|
||||
def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
@@ -20,7 +26,7 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
"text": text,
|
||||
"session_id": session_id,
|
||||
"channel": "cli",
|
||||
"user_id": os.getlogin(),
|
||||
"user_id": os.getenv("USER", "user"),
|
||||
}).encode()
|
||||
req = urllib.request.Request(
|
||||
f"{gateway}/message",
|
||||
@@ -30,33 +36,49 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=10) as r:
|
||||
if r.status != 202:
|
||||
print(f"[error] gateway returned {r.status}", file=sys.stderr)
|
||||
console.print(f"[red][error] gateway returned {r.status}[/red]")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def wait_for_reply(gateway: str, session_id: str, timeout: int = 400) -> str:
|
||||
"""Open SSE stream and return first data event."""
|
||||
def stream_reply(gateway: str, session_id: str, timeout: int = 400) -> str:
|
||||
"""
|
||||
Open the /stream/{session_id} SSE endpoint and display tokens live with
|
||||
Rich. Returns the full assembled reply text.
|
||||
"""
|
||||
req = urllib.request.Request(
|
||||
f"{gateway}/reply/{session_id}",
|
||||
f"{gateway}/stream/{session_id}",
|
||||
headers={"Accept": "text/event-stream"},
|
||||
)
|
||||
buffer = ""
|
||||
with urllib.request.urlopen(req, timeout=timeout + 5) as r:
|
||||
for raw_line in r:
|
||||
line = raw_line.decode("utf-8").rstrip("\n")
|
||||
if line.startswith("data:"):
|
||||
return line[5:].strip().replace("\\n", "\n")
|
||||
return ""
|
||||
with Live(Text(""), console=console, refresh_per_second=20, transient=True) as live:
|
||||
for raw_line in r:
|
||||
line = raw_line.decode("utf-8").rstrip("\n")
|
||||
if not line.startswith("data:"):
|
||||
continue
|
||||
chunk = line[5:].strip()
|
||||
if chunk == "[DONE]":
|
||||
break
|
||||
chunk = chunk.replace("\\n", "\n")
|
||||
buffer += chunk
|
||||
live.update(Text(buffer))
|
||||
|
||||
# Render the complete reply as Markdown once streaming is done
|
||||
console.print(Markdown(buffer))
|
||||
return buffer
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Adolf CLI")
|
||||
parser.add_argument("--url", default=GATEWAY, help="Gateway URL")
|
||||
parser.add_argument("--session", default=f"cli-{os.getlogin()}", help="Session ID")
|
||||
parser.add_argument("--session", default=f"cli-{os.getenv('USER', 'user')}",
|
||||
help="Session ID")
|
||||
parser.add_argument("--timeout", type=int, default=400, help="Reply timeout (seconds)")
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Adolf CLI (session={args.session}, gateway={args.url})")
|
||||
print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n")
|
||||
console.print(f"[bold]Adolf CLI[/bold] (session=[cyan]{args.session}[/cyan], "
|
||||
f"gateway=[cyan]{args.url}[/cyan])")
|
||||
console.print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n")
|
||||
|
||||
try:
|
||||
while True:
|
||||
@@ -68,12 +90,11 @@ def main():
|
||||
continue
|
||||
|
||||
post_message(args.url, text, args.session)
|
||||
print("...", end="", flush=True)
|
||||
reply = wait_for_reply(args.url, args.session, timeout=args.timeout)
|
||||
print(f"\r{reply}\n")
|
||||
stream_reply(args.url, args.session, timeout=args.timeout)
|
||||
console.print()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nbye")
|
||||
console.print("\n[dim]bye[/dim]")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -65,6 +65,20 @@ services:
|
||||
- DEEPAGENTS_URL=http://deepagents:8000
|
||||
restart: unless-stopped
|
||||
|
||||
cli:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile.cli
|
||||
container_name: cli
|
||||
environment:
|
||||
- DEEPAGENTS_URL=http://deepagents:8000
|
||||
depends_on:
|
||||
- deepagents
|
||||
stdin_open: true
|
||||
tty: true
|
||||
profiles:
|
||||
- tools
|
||||
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:latest
|
||||
container_name: crawl4ai
|
||||
|
||||
@@ -13,17 +13,17 @@ curl -s -X POST http://localhost:8000/message \
|
||||
-d '{"text": "/think what is the best recipe for an apple pie?", "session_id": "use-case-apple-pie", "channel": "cli", "user_id": "claude"}'
|
||||
```
|
||||
|
||||
**2. Wait for the reply** via SSE (complex tier can take up to 5 minutes):
|
||||
**2. Wait for the streaming reply** (complex tier can take up to 5 minutes):
|
||||
|
||||
```bash
|
||||
curl -s -N --max-time 300 "http://localhost:8000/reply/use-case-apple-pie"
|
||||
curl -s -N --max-time 300 "http://localhost:8000/stream/use-case-apple-pie"
|
||||
```
|
||||
|
||||
**3. Confirm tier and tool usage in agent logs:**
|
||||
|
||||
```bash
|
||||
docker compose -f /home/alvis/adolf/docker-compose.yml logs deepagents \
|
||||
--since=600s --no-log-prefix | grep -E "tier=complex|web_search|fetch_url|crawl4ai"
|
||||
--since=600s | grep -E "tier=complex|web_search|fetch_url|crawl4ai"
|
||||
```
|
||||
|
||||
## Evaluate (use your judgment)
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# Use Case: CLI Startup
|
||||
|
||||
Verify the Adolf CLI starts cleanly and exits without error when the user closes input.
|
||||
Verify the Adolf CLI container starts cleanly, shows the welcome banner,
|
||||
and exits without error when the user closes input.
|
||||
|
||||
## Steps
|
||||
|
||||
Run the CLI with empty stdin (simulates user pressing Ctrl+D immediately):
|
||||
|
||||
```bash
|
||||
echo "" | python3 /home/alvis/adolf/cli.py --session use-case-cli-startup
|
||||
echo "" | docker compose --profile tools run --rm -T cli \
|
||||
python3 cli.py --url http://deepagents:8000 --session use-case-cli-startup
|
||||
echo "exit code: $?"
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user