Add Rich token streaming: server SSE + CLI live display + CLI container

Server (agent.py):
- _stream_queues: per-session asyncio.Queue for token chunks
- _push_stream_chunk() / _end_stream() helpers
- Medium tier: astream() with <think> block filtering — real token streaming
- Light tier: full reply pushed as single chunk then [DONE]
- Complex tier: full reply pushed after agent completes then [DONE]
- GET /stream/{session_id} SSE endpoint (data: <chunk>\n\n, data: [DONE]\n\n)
- medium_model promoted to module-level global for astream() access

CLI (cli.py):
- stream_reply(): reads /stream/ SSE, renders tokens live with Rich Live (transient)
- Final reply rendered as Markdown after stream completes
- os.getlogin() replaced with os.getenv("USER") for container compatibility

Dockerfile.cli + docker-compose cli service (profiles: tools):
- Run: docker compose --profile tools run --rm -it cli

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Alvis
2026-03-12 17:26:52 +00:00
parent edc9a96f7a
commit b04e8a0925
6 changed files with 151 additions and 38 deletions

9
Dockerfile.cli Normal file
View File

@@ -0,0 +1,9 @@
# Minimal image for the streaming CLI — only dependency is Rich (live token
# display + Markdown rendering); the CLI itself uses stdlib urllib for HTTP.
FROM python:3.12-slim
WORKDIR /app
# --no-cache-dir keeps the layer small; pinning is deliberate-free here since
# rich's Live/Markdown APIs used by cli.py are long-stable.
RUN pip install --no-cache-dir rich
COPY cli.py .
CMD ["python3", "cli.py"]

View File

@@ -41,6 +41,19 @@ CRAWL4AI_URL = os.getenv("CRAWL4AI_URL", "http://crawl4ai:11235")
MAX_HISTORY_TURNS = 5 MAX_HISTORY_TURNS = 5
_conversation_buffers: dict[str, list] = {} _conversation_buffers: dict[str, list] = {}
# Per-session streaming queues — filled during inference, read by /stream/{session_id}
_stream_queues: dict[str, asyncio.Queue] = {}
async def _push_stream_chunk(session_id: str, chunk: str) -> None:
q = _stream_queues.setdefault(session_id, asyncio.Queue())
await q.put(chunk)
async def _end_stream(session_id: str) -> None:
q = _stream_queues.setdefault(session_id, asyncio.Queue())
await q.put("[DONE]")
async def _crawl4ai_fetch_async(url: str) -> str: async def _crawl4ai_fetch_async(url: str) -> str:
"""Async fetch via Crawl4AI — JS-rendered, bot-bypass, returns clean markdown.""" """Async fetch via Crawl4AI — JS-rendered, bot-bypass, returns clean markdown."""
@@ -95,6 +108,7 @@ COMPLEX_SYSTEM_PROMPT = (
"NEVER invent URLs. End with: **Sources checked: N**" "NEVER invent URLs. End with: **Sources checked: N**"
) )
medium_model = None
medium_agent = None medium_agent = None
complex_agent = None complex_agent = None
router: Router = None router: Router = None
@@ -109,7 +123,7 @@ _reply_semaphore = asyncio.Semaphore(1)
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
global medium_agent, complex_agent, router, vram_manager, mcp_client, \ global medium_model, medium_agent, complex_agent, router, vram_manager, mcp_client, \
_memory_add_tool, _memory_search_tool _memory_add_tool, _memory_search_tool
# Register channel adapters # Register channel adapters
@@ -263,6 +277,7 @@ async def lifespan(app: FastAPI):
yield yield
medium_model = None
medium_agent = None medium_agent = None
complex_agent = None complex_agent = None
router = None router = None
@@ -394,6 +409,8 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
final_text = light_reply final_text = light_reply
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
print(f"[agent] light path: answered by router", flush=True) print(f"[agent] light path: answered by router", flush=True)
await _push_stream_chunk(session_id, final_text)
await _end_stream(session_id)
elif tier == "medium": elif tier == "medium":
system_prompt = MEDIUM_SYSTEM_PROMPT system_prompt = MEDIUM_SYSTEM_PROMPT
@@ -401,16 +418,39 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
system_prompt = system_prompt + "\n\n" + memories system_prompt = system_prompt + "\n\n" + memories
if url_context: if url_context:
system_prompt = system_prompt + "\n\n" + url_context system_prompt = system_prompt + "\n\n" + url_context
result = await medium_agent.ainvoke({
"messages": [ # Stream tokens directly — filter out qwen3 <think> blocks
in_think = False
response_parts = []
async for chunk in medium_model.astream([
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
*history, *history,
{"role": "user", "content": clean_message}, {"role": "user", "content": clean_message},
] ]):
}) token = chunk.content or ""
if not token:
continue
if in_think:
if "</think>" in token:
in_think = False
after = token.split("</think>", 1)[1]
if after:
await _push_stream_chunk(session_id, after)
response_parts.append(after)
else:
if "<think>" in token:
in_think = True
before = token.split("<think>", 1)[0]
if before:
await _push_stream_chunk(session_id, before)
response_parts.append(before)
else:
await _push_stream_chunk(session_id, token)
response_parts.append(token)
await _end_stream(session_id)
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
_log_messages(result) final_text = "".join(response_parts).strip() or None
final_text = _extract_final_text(result)
else: # complex else: # complex
ok = await vram_manager.enter_complex_mode() ok = await vram_manager.enter_complex_mode()
@@ -432,7 +472,6 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
else: else:
system_prompt = COMPLEX_SYSTEM_PROMPT.format(user_id=session_id) system_prompt = COMPLEX_SYSTEM_PROMPT.format(user_id=session_id)
if url_context: if url_context:
# Inject pre-fetched content — complex agent can still re-fetch or follow links
system_prompt = system_prompt + "\n\n[Pre-fetched URL content from user's message:]\n" + url_context system_prompt = system_prompt + "\n\n[Pre-fetched URL content from user's message:]\n" + url_context
result = await complex_agent.ainvoke({ result = await complex_agent.ainvoke({
"messages": [ "messages": [
@@ -446,12 +485,16 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
_log_messages(result) _log_messages(result)
final_text = _extract_final_text(result) final_text = _extract_final_text(result)
if final_text:
await _push_stream_chunk(session_id, final_text)
await _end_stream(session_id)
except Exception as e: except Exception as e:
import traceback import traceback
llm_elapsed = time.monotonic() - t0 llm_elapsed = time.monotonic() - t0
print(f"[agent] error after {llm_elapsed:.1f}s for chat {session_id}: {e}", flush=True) print(f"[agent] error after {llm_elapsed:.1f}s for chat {session_id}: {e}", flush=True)
traceback.print_exc() traceback.print_exc()
await _end_stream(session_id)
# Deliver reply through the originating channel # Deliver reply through the originating channel
if final_text: if final_text:
@@ -521,6 +564,32 @@ async def reply_stream(session_id: str):
return StreamingResponse(event_generator(), media_type="text/event-stream") return StreamingResponse(event_generator(), media_type="text/event-stream")
@app.get("/stream/{session_id}")
async def stream_reply(session_id: str):
    """
    SSE endpoint — streams reply tokens as they are generated.

    Each chunk:          data: <chunk>\\n\\n
    Signals completion:  data: [DONE]\\n\\n

    Medium tier: real token-by-token streaming (think blocks filtered out).
    Light and complex tiers: full reply delivered as one chunk then [DONE].
    """
    q = _stream_queues.setdefault(session_id, asyncio.Queue())

    async def event_generator():
        try:
            while True:
                # 900 s cap: complex tier can take minutes; beyond that the
                # producer has almost certainly died, so close the stream.
                chunk = await asyncio.wait_for(q.get(), timeout=900)
                # Raw newlines would terminate the SSE data field early;
                # escape them here — the CLI reverses this with replace("\\n", "\n").
                escaped = chunk.replace("\n", "\\n").replace("\r", "")
                yield f"data: {escaped}\n\n"
                if chunk == "[DONE]":
                    break
        except asyncio.TimeoutError:
            # No tokens within the window — tell the client we're done anyway.
            yield "data: [DONE]\n\n"
        finally:
            # Fix: drop the per-session queue once the stream closes. The
            # original left every queue in _stream_queues forever, leaking
            # memory for each session ever streamed. A later reply simply
            # recreates the queue via setdefault in _push_stream_chunk.
            _stream_queues.pop(session_id, None)

    return StreamingResponse(event_generator(), media_type="text/event-stream")
@app.get("/health") @app.get("/health")
async def health(): async def health():
return {"status": "ok", "agent_ready": medium_agent is not None} return {"status": "ok", "agent_ready": medium_agent is not None}

57
cli.py
View File

@@ -1,9 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Adolf CLI — interactive REPL for the multi-channel gateway. Adolf CLI — interactive REPL with Rich streaming display.
Usage: Usage:
python3 cli.py [--url http://localhost:8000] [--session cli-alvis] python3 cli.py [--url http://deepagents:8000] [--session cli-alvis]
""" """
import argparse import argparse
@@ -12,7 +12,13 @@ import os
import sys import sys
import urllib.request import urllib.request
GATEWAY = "http://localhost:8000" from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown
from rich.text import Text
GATEWAY = "http://deepagents:8000"
console = Console()
def post_message(gateway: str, text: str, session_id: str) -> None: def post_message(gateway: str, text: str, session_id: str) -> None:
@@ -20,7 +26,7 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
"text": text, "text": text,
"session_id": session_id, "session_id": session_id,
"channel": "cli", "channel": "cli",
"user_id": os.getlogin(), "user_id": os.getenv("USER", "user"),
}).encode() }).encode()
req = urllib.request.Request( req = urllib.request.Request(
f"{gateway}/message", f"{gateway}/message",
@@ -30,33 +36,49 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
) )
with urllib.request.urlopen(req, timeout=10) as r: with urllib.request.urlopen(req, timeout=10) as r:
if r.status != 202: if r.status != 202:
print(f"[error] gateway returned {r.status}", file=sys.stderr) console.print(f"[red][error] gateway returned {r.status}[/red]")
sys.exit(1) sys.exit(1)
def wait_for_reply(gateway: str, session_id: str, timeout: int = 400) -> str: def stream_reply(gateway: str, session_id: str, timeout: int = 400) -> str:
"""Open SSE stream and return first data event.""" """
Open the /stream/{session_id} SSE endpoint and display tokens live with
Rich. Returns the full assembled reply text.
"""
req = urllib.request.Request( req = urllib.request.Request(
f"{gateway}/reply/{session_id}", f"{gateway}/stream/{session_id}",
headers={"Accept": "text/event-stream"}, headers={"Accept": "text/event-stream"},
) )
buffer = ""
with urllib.request.urlopen(req, timeout=timeout + 5) as r: with urllib.request.urlopen(req, timeout=timeout + 5) as r:
with Live(Text(""), console=console, refresh_per_second=20, transient=True) as live:
for raw_line in r: for raw_line in r:
line = raw_line.decode("utf-8").rstrip("\n") line = raw_line.decode("utf-8").rstrip("\n")
if line.startswith("data:"): if not line.startswith("data:"):
return line[5:].strip().replace("\\n", "\n") continue
return "" chunk = line[5:].strip()
if chunk == "[DONE]":
break
chunk = chunk.replace("\\n", "\n")
buffer += chunk
live.update(Text(buffer))
# Render the complete reply as Markdown once streaming is done
console.print(Markdown(buffer))
return buffer
def main(): def main():
parser = argparse.ArgumentParser(description="Adolf CLI") parser = argparse.ArgumentParser(description="Adolf CLI")
parser.add_argument("--url", default=GATEWAY, help="Gateway URL") parser.add_argument("--url", default=GATEWAY, help="Gateway URL")
parser.add_argument("--session", default=f"cli-{os.getlogin()}", help="Session ID") parser.add_argument("--session", default=f"cli-{os.getenv('USER', 'user')}",
help="Session ID")
parser.add_argument("--timeout", type=int, default=400, help="Reply timeout (seconds)") parser.add_argument("--timeout", type=int, default=400, help="Reply timeout (seconds)")
args = parser.parse_args() args = parser.parse_args()
print(f"Adolf CLI (session={args.session}, gateway={args.url})") console.print(f"[bold]Adolf CLI[/bold] (session=[cyan]{args.session}[/cyan], "
print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n") f"gateway=[cyan]{args.url}[/cyan])")
console.print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n")
try: try:
while True: while True:
@@ -68,12 +90,11 @@ def main():
continue continue
post_message(args.url, text, args.session) post_message(args.url, text, args.session)
print("...", end="", flush=True) stream_reply(args.url, args.session, timeout=args.timeout)
reply = wait_for_reply(args.url, args.session, timeout=args.timeout) console.print()
print(f"\r{reply}\n")
except KeyboardInterrupt: except KeyboardInterrupt:
print("\nbye") console.print("\n[dim]bye[/dim]")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -65,6 +65,20 @@ services:
- DEEPAGENTS_URL=http://deepagents:8000 - DEEPAGENTS_URL=http://deepagents:8000
restart: unless-stopped restart: unless-stopped
cli:
build:
context: .
dockerfile: Dockerfile.cli
container_name: cli
environment:
- DEEPAGENTS_URL=http://deepagents:8000
depends_on:
- deepagents
stdin_open: true
tty: true
profiles:
- tools
crawl4ai: crawl4ai:
image: unclecode/crawl4ai:latest image: unclecode/crawl4ai:latest
container_name: crawl4ai container_name: crawl4ai

View File

@@ -13,17 +13,17 @@ curl -s -X POST http://localhost:8000/message \
-d '{"text": "/think what is the best recipe for an apple pie?", "session_id": "use-case-apple-pie", "channel": "cli", "user_id": "claude"}' -d '{"text": "/think what is the best recipe for an apple pie?", "session_id": "use-case-apple-pie", "channel": "cli", "user_id": "claude"}'
``` ```
**2. Wait for the reply** via SSE (complex tier can take up to 5 minutes): **2. Wait for the streaming reply** (complex tier can take up to 5 minutes):
```bash ```bash
curl -s -N --max-time 300 "http://localhost:8000/reply/use-case-apple-pie" curl -s -N --max-time 300 "http://localhost:8000/stream/use-case-apple-pie"
``` ```
**3. Confirm tier and tool usage in agent logs:** **3. Confirm tier and tool usage in agent logs:**
```bash ```bash
docker compose -f /home/alvis/adolf/docker-compose.yml logs deepagents \ docker compose -f /home/alvis/adolf/docker-compose.yml logs deepagents \
--since=600s --no-log-prefix | grep -E "tier=complex|web_search|fetch_url|crawl4ai" --since=600s | grep -E "tier=complex|web_search|fetch_url|crawl4ai"
``` ```
## Evaluate (use your judgment) ## Evaluate (use your judgment)

View File

@@ -1,13 +1,13 @@
# Use Case: CLI Startup # Use Case: CLI Startup
Verify the Adolf CLI starts cleanly and exits without error when the user closes input. Verify the Adolf CLI container starts cleanly, shows the welcome banner,
and exits without error when the user closes input.
## Steps ## Steps
Run the CLI with empty stdin (simulates user pressing Ctrl+D immediately):
```bash ```bash
echo "" | python3 /home/alvis/adolf/cli.py --session use-case-cli-startup echo "" | docker compose --profile tools run --rm -T cli \
python3 cli.py --url http://deepagents:8000 --session use-case-cli-startup
echo "exit code: $?" echo "exit code: $?"
``` ```