Add Rich token streaming: server SSE + CLI live display + CLI container
Server (agent.py):
- _stream_queues: per-session asyncio.Queue for token chunks
- _push_stream_chunk() / _end_stream() helpers
- Medium tier: astream() with <think> block filtering — real token streaming
- Light tier: full reply pushed as single chunk then [DONE]
- Complex tier: full reply pushed after agent completes then [DONE]
- GET /stream/{session_id} SSE endpoint (data: <chunk>\n\n, data: [DONE]\n\n)
- medium_model promoted to module-level global for astream() access
CLI (cli.py):
- stream_reply(): reads /stream/ SSE, renders tokens live with Rich Live (transient)
- Final reply rendered as Markdown after stream completes
- os.getlogin() replaced with os.getenv("USER") for container compatibility
Dockerfile.cli + docker-compose cli service (profiles: tools):
- Run: docker compose --profile tools run --rm -it cli
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
9
Dockerfile.cli
Normal file
9
Dockerfile.cli
Normal file
@@ -0,0 +1,9 @@
|
||||
# Minimal image for the interactive Adolf CLI client.
# Run with: docker compose --profile tools run --rm -it cli
FROM python:3.12-slim

WORKDIR /app

# Rich powers the live token-streaming display and Markdown rendering.
RUN pip install --no-cache-dir rich

COPY cli.py .

CMD ["python3", "cli.py"]
|
||||
91
agent.py
91
agent.py
@@ -41,6 +41,19 @@ CRAWL4AI_URL = os.getenv("CRAWL4AI_URL", "http://crawl4ai:11235")
|
||||
MAX_HISTORY_TURNS = 5
|
||||
_conversation_buffers: dict[str, list] = {}
|
||||
|
||||
# Per-session streaming queues — filled during inference, read by /stream/{session_id}
|
||||
_stream_queues: dict[str, asyncio.Queue] = {}
|
||||
|
||||
|
||||
async def _push_stream_chunk(session_id: str, chunk: str) -> None:
    """Enqueue one token chunk onto the session's stream queue.

    The queue is created lazily, so the producer never has to care
    whether an SSE consumer has attached to this session yet.
    """
    queue = _stream_queues.get(session_id)
    if queue is None:
        queue = _stream_queues.setdefault(session_id, asyncio.Queue())
    await queue.put(chunk)
|
||||
|
||||
|
||||
async def _end_stream(session_id: str) -> None:
    """Mark the session's stream as finished.

    Pushes the literal "[DONE]" sentinel that the SSE endpoint and the
    CLI both recognise as end-of-stream.
    """
    await _stream_queues.setdefault(session_id, asyncio.Queue()).put("[DONE]")
|
||||
|
||||
|
||||
async def _crawl4ai_fetch_async(url: str) -> str:
|
||||
"""Async fetch via Crawl4AI — JS-rendered, bot-bypass, returns clean markdown."""
|
||||
@@ -95,6 +108,7 @@ COMPLEX_SYSTEM_PROMPT = (
|
||||
"NEVER invent URLs. End with: **Sources checked: N**"
|
||||
)
|
||||
|
||||
medium_model = None
|
||||
medium_agent = None
|
||||
complex_agent = None
|
||||
router: Router = None
|
||||
@@ -109,7 +123,7 @@ _reply_semaphore = asyncio.Semaphore(1)
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
global medium_agent, complex_agent, router, vram_manager, mcp_client, \
|
||||
global medium_model, medium_agent, complex_agent, router, vram_manager, mcp_client, \
|
||||
_memory_add_tool, _memory_search_tool
|
||||
|
||||
# Register channel adapters
|
||||
@@ -263,6 +277,7 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
yield
|
||||
|
||||
medium_model = None
|
||||
medium_agent = None
|
||||
complex_agent = None
|
||||
router = None
|
||||
@@ -394,6 +409,8 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
final_text = light_reply
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
print(f"[agent] light path: answered by router", flush=True)
|
||||
await _push_stream_chunk(session_id, final_text)
|
||||
await _end_stream(session_id)
|
||||
|
||||
elif tier == "medium":
|
||||
system_prompt = MEDIUM_SYSTEM_PROMPT
|
||||
@@ -401,16 +418,39 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
system_prompt = system_prompt + "\n\n" + memories
|
||||
if url_context:
|
||||
system_prompt = system_prompt + "\n\n" + url_context
|
||||
result = await medium_agent.ainvoke({
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
*history,
|
||||
{"role": "user", "content": clean_message},
|
||||
]
|
||||
})
|
||||
|
||||
# Stream tokens directly — filter out qwen3 <think> blocks
|
||||
in_think = False
|
||||
response_parts = []
|
||||
async for chunk in medium_model.astream([
|
||||
{"role": "system", "content": system_prompt},
|
||||
*history,
|
||||
{"role": "user", "content": clean_message},
|
||||
]):
|
||||
token = chunk.content or ""
|
||||
if not token:
|
||||
continue
|
||||
if in_think:
|
||||
if "</think>" in token:
|
||||
in_think = False
|
||||
after = token.split("</think>", 1)[1]
|
||||
if after:
|
||||
await _push_stream_chunk(session_id, after)
|
||||
response_parts.append(after)
|
||||
else:
|
||||
if "<think>" in token:
|
||||
in_think = True
|
||||
before = token.split("<think>", 1)[0]
|
||||
if before:
|
||||
await _push_stream_chunk(session_id, before)
|
||||
response_parts.append(before)
|
||||
else:
|
||||
await _push_stream_chunk(session_id, token)
|
||||
response_parts.append(token)
|
||||
|
||||
await _end_stream(session_id)
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
_log_messages(result)
|
||||
final_text = _extract_final_text(result)
|
||||
final_text = "".join(response_parts).strip() or None
|
||||
|
||||
else: # complex
|
||||
ok = await vram_manager.enter_complex_mode()
|
||||
@@ -432,7 +472,6 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
else:
|
||||
system_prompt = COMPLEX_SYSTEM_PROMPT.format(user_id=session_id)
|
||||
if url_context:
|
||||
# Inject pre-fetched content — complex agent can still re-fetch or follow links
|
||||
system_prompt = system_prompt + "\n\n[Pre-fetched URL content from user's message:]\n" + url_context
|
||||
result = await complex_agent.ainvoke({
|
||||
"messages": [
|
||||
@@ -446,12 +485,16 @@ async def run_agent_task(message: str, session_id: str, channel: str = "telegram
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
_log_messages(result)
|
||||
final_text = _extract_final_text(result)
|
||||
if final_text:
|
||||
await _push_stream_chunk(session_id, final_text)
|
||||
await _end_stream(session_id)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
llm_elapsed = time.monotonic() - t0
|
||||
print(f"[agent] error after {llm_elapsed:.1f}s for chat {session_id}: {e}", flush=True)
|
||||
traceback.print_exc()
|
||||
await _end_stream(session_id)
|
||||
|
||||
# Deliver reply through the originating channel
|
||||
if final_text:
|
||||
@@ -521,6 +564,32 @@ async def reply_stream(session_id: str):
|
||||
return StreamingResponse(event_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.get("/stream/{session_id}")
async def stream_reply(session_id: str):
    """
    SSE endpoint — streams reply tokens as they are generated.
    Each chunk: data: <token>\\n\\n
    Signals completion: data: [DONE]\\n\\n

    Medium tier: real token-by-token streaming (think blocks filtered out).
    Light and complex tiers: full reply delivered as one chunk then [DONE].
    """
    q = _stream_queues.setdefault(session_id, asyncio.Queue())

    async def event_generator():
        try:
            while True:
                # 900s cap guards against a producer that crashed without
                # ever pushing the [DONE] sentinel.
                chunk = await asyncio.wait_for(q.get(), timeout=900)
                # A raw newline would break SSE framing (blank line ends an
                # event), so newlines are escaped; the CLI reverses this.
                # NOTE(review): a content chunk that is exactly "[DONE]"
                # would be mistaken for the sentinel — acceptable here.
                escaped = chunk.replace("\n", "\\n").replace("\r", "")
                yield f"data: {escaped}\n\n"
                if chunk == "[DONE]":
                    break
        except asyncio.TimeoutError:
            yield "data: [DONE]\n\n"
        finally:
            # Fix: drop the per-session queue once the stream is over
            # (done, timed out, or client disconnected). Without this,
            # _stream_queues grows without bound across sessions.
            _stream_queues.pop(session_id, None)

    return StreamingResponse(event_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: reports whether the medium-tier agent is loaded."""
    ready = medium_agent is not None
    return {"status": "ok", "agent_ready": ready}
|
||||
|
||||
61
cli.py
61
cli.py
@@ -1,9 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Adolf CLI — interactive REPL for the multi-channel gateway.
|
||||
Adolf CLI — interactive REPL with Rich streaming display.
|
||||
|
||||
Usage:
|
||||
python3 cli.py [--url http://localhost:8000] [--session cli-alvis]
|
||||
python3 cli.py [--url http://deepagents:8000] [--session cli-alvis]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -12,7 +12,13 @@ import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
GATEWAY = "http://localhost:8000"
|
||||
from rich.console import Console
|
||||
from rich.live import Live
|
||||
from rich.markdown import Markdown
|
||||
from rich.text import Text
|
||||
|
||||
GATEWAY = "http://deepagents:8000"
|
||||
console = Console()
|
||||
|
||||
|
||||
def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
@@ -20,7 +26,7 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
"text": text,
|
||||
"session_id": session_id,
|
||||
"channel": "cli",
|
||||
"user_id": os.getlogin(),
|
||||
"user_id": os.getenv("USER", "user"),
|
||||
}).encode()
|
||||
req = urllib.request.Request(
|
||||
f"{gateway}/message",
|
||||
@@ -30,33 +36,49 @@ def post_message(gateway: str, text: str, session_id: str) -> None:
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=10) as r:
|
||||
if r.status != 202:
|
||||
print(f"[error] gateway returned {r.status}", file=sys.stderr)
|
||||
console.print(f"[red][error] gateway returned {r.status}[/red]")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def wait_for_reply(gateway: str, session_id: str, timeout: int = 400) -> str:
|
||||
"""Open SSE stream and return first data event."""
|
||||
def stream_reply(gateway: str, session_id: str, timeout: int = 400) -> str:
|
||||
"""
|
||||
Open the /stream/{session_id} SSE endpoint and display tokens live with
|
||||
Rich. Returns the full assembled reply text.
|
||||
"""
|
||||
req = urllib.request.Request(
|
||||
f"{gateway}/reply/{session_id}",
|
||||
f"{gateway}/stream/{session_id}",
|
||||
headers={"Accept": "text/event-stream"},
|
||||
)
|
||||
buffer = ""
|
||||
with urllib.request.urlopen(req, timeout=timeout + 5) as r:
|
||||
for raw_line in r:
|
||||
line = raw_line.decode("utf-8").rstrip("\n")
|
||||
if line.startswith("data:"):
|
||||
return line[5:].strip().replace("\\n", "\n")
|
||||
return ""
|
||||
with Live(Text(""), console=console, refresh_per_second=20, transient=True) as live:
|
||||
for raw_line in r:
|
||||
line = raw_line.decode("utf-8").rstrip("\n")
|
||||
if not line.startswith("data:"):
|
||||
continue
|
||||
chunk = line[5:].strip()
|
||||
if chunk == "[DONE]":
|
||||
break
|
||||
chunk = chunk.replace("\\n", "\n")
|
||||
buffer += chunk
|
||||
live.update(Text(buffer))
|
||||
|
||||
# Render the complete reply as Markdown once streaming is done
|
||||
console.print(Markdown(buffer))
|
||||
return buffer
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Adolf CLI")
|
||||
parser.add_argument("--url", default=GATEWAY, help="Gateway URL")
|
||||
parser.add_argument("--session", default=f"cli-{os.getlogin()}", help="Session ID")
|
||||
parser.add_argument("--session", default=f"cli-{os.getenv('USER', 'user')}",
|
||||
help="Session ID")
|
||||
parser.add_argument("--timeout", type=int, default=400, help="Reply timeout (seconds)")
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"Adolf CLI (session={args.session}, gateway={args.url})")
|
||||
print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n")
|
||||
console.print(f"[bold]Adolf CLI[/bold] (session=[cyan]{args.session}[/cyan], "
|
||||
f"gateway=[cyan]{args.url}[/cyan])")
|
||||
console.print("Type your message and press Enter. Ctrl+C or Ctrl+D to exit.\n")
|
||||
|
||||
try:
|
||||
while True:
|
||||
@@ -68,12 +90,11 @@ def main():
|
||||
continue
|
||||
|
||||
post_message(args.url, text, args.session)
|
||||
print("...", end="", flush=True)
|
||||
reply = wait_for_reply(args.url, args.session, timeout=args.timeout)
|
||||
print(f"\r{reply}\n")
|
||||
stream_reply(args.url, args.session, timeout=args.timeout)
|
||||
console.print()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nbye")
|
||||
console.print("\n[dim]bye[/dim]")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -65,6 +65,20 @@ services:
|
||||
- DEEPAGENTS_URL=http://deepagents:8000
|
||||
restart: unless-stopped
|
||||
|
||||
cli:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile.cli
|
||||
container_name: cli
|
||||
environment:
|
||||
- DEEPAGENTS_URL=http://deepagents:8000
|
||||
depends_on:
|
||||
- deepagents
|
||||
stdin_open: true
|
||||
tty: true
|
||||
profiles:
|
||||
- tools
|
||||
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:latest
|
||||
container_name: crawl4ai
|
||||
|
||||
@@ -13,17 +13,17 @@ curl -s -X POST http://localhost:8000/message \
|
||||
-d '{"text": "/think what is the best recipe for an apple pie?", "session_id": "use-case-apple-pie", "channel": "cli", "user_id": "claude"}'
|
||||
```
|
||||
|
||||
**2. Wait for the reply** via SSE (complex tier can take up to 5 minutes):
|
||||
**2. Wait for the streaming reply** (complex tier can take up to 5 minutes):
|
||||
|
||||
```bash
|
||||
curl -s -N --max-time 300 "http://localhost:8000/reply/use-case-apple-pie"
|
||||
curl -s -N --max-time 300 "http://localhost:8000/stream/use-case-apple-pie"
|
||||
```
|
||||
|
||||
**3. Confirm tier and tool usage in agent logs:**
|
||||
|
||||
```bash
|
||||
docker compose -f /home/alvis/adolf/docker-compose.yml logs deepagents \
|
||||
--since=600s --no-log-prefix | grep -E "tier=complex|web_search|fetch_url|crawl4ai"
|
||||
--since=600s | grep -E "tier=complex|web_search|fetch_url|crawl4ai"
|
||||
```
|
||||
|
||||
## Evaluate (use your judgment)
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# Use Case: CLI Startup
|
||||
|
||||
Verify the Adolf CLI starts cleanly and exits without error when the user closes input.
|
||||
Verify the Adolf CLI container starts cleanly, shows the welcome banner,
|
||||
and exits without error when the user closes input.
|
||||
|
||||
## Steps
|
||||
|
||||
Run the CLI with empty stdin (simulates user pressing Ctrl+D immediately):
|
||||
|
||||
```bash
|
||||
echo "" | python3 /home/alvis/adolf/cli.py --session use-case-cli-startup
|
||||
echo "" | docker compose --profile tools run --rm -T cli \
|
||||
python3 cli.py --url http://deepagents:8000 --session use-case-cli-startup
|
||||
echo "exit code: $?"
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user