wiki search people tested pipeline
This commit is contained in:
278
wiki_research.py
Normal file
278
wiki_research.py
Normal file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Wiki Research Pipeline — searches the web for each person/place in the family wiki.
|
||||
|
||||
Uses Adolf's complex agent (/think prefix → qwen3:8b + web_search) to research
|
||||
each subject and aggregates findings into research.md.
|
||||
|
||||
Usage:
|
||||
python3 wiki_research.py [--subject "Name"] [--dry-run] [--timeout 300]
|
||||
[--output PATH]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ── config ─────────────────────────────────────────────────────────────────────
# Base URL of the local agent gateway that performs the research requests.
GATEWAY = "http://localhost:8000"
# Root of the family wiki checkout; subjects are read from its "люди" and
# "места" subdirectories.
WIKI_ROOT = Path("/mnt/ssd/dbs/otter/app-data/repository")
# Aggregated research findings are appended to this markdown file by default.
DEFAULT_OUTPUT = WIKI_ROOT / "research.md"

# ANSI-colored status tags for terminal output (green / red / cyan).
PASS = "\033[32mPASS\033[0m"
FAIL = "\033[31mFAIL\033[0m"
INFO = "\033[36mINFO\033[0m"
|
||||
|
||||
|
||||
# ── helpers ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def post_message(text: str, session_id: str, timeout: int = 10) -> int:
    """POST one message to the gateway's /message endpoint.

    Returns the HTTP status code of the response (the caller expects 202
    for an accepted message).
    """
    body = {
        "text": text,
        "session_id": session_id,
        "channel": "cli",
        "user_id": "wiki-pipeline",
    }
    request = urllib.request.Request(
        f"{GATEWAY}/message",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        return resp.status
|
||||
|
||||
|
||||
def wait_for_reply(label: str, session_id: str, timeout_s: int = 300) -> str | None:
    """Open SSE stream on /reply/{session_id} and return reply text, or None on timeout.

    The first ``data:`` line is treated as the reply; the server's literal
    "[timeout]" payload is mapped to None. Escaped "\\n" sequences in the
    payload are expanded back into real newlines.
    """
    req = urllib.request.Request(
        f"{GATEWAY}/reply/{urllib.parse.quote(session_id, safe='')}",
        headers={"Accept": "text/event-stream"},
    )
    t0 = time.monotonic()
    deadline = t0 + timeout_s

    # Show progress while waiting (SSE blocks until reply is ready)
    print(f"\r [{label}] waiting... ", end="", flush=True)

    try:
        with urllib.request.urlopen(req, timeout=timeout_s + 30) as r:
            for raw_line in r:
                elapsed = time.monotonic() - t0
                # BUGFIX: enforce the *total* deadline here. The urlopen
                # timeout above is a per-read socket timeout, so a stream that
                # keeps sending keepalive/comment lines would otherwise never
                # time out at timeout_s.
                if time.monotonic() > deadline:
                    break
                line = raw_line.decode("utf-8").rstrip("\n")
                if line.startswith("data:"):
                    text = line[5:].strip().replace("\\n", "\n")
                    print(f"\r [{label}] done after {elapsed:.0f}s{' ' * 30}")
                    if text == "[timeout]":
                        return None
                    return text
                # Non-data line (keepalive): refresh the progress display.
                rem = int(deadline - time.monotonic())
                print(f"\r [{label}] {elapsed:.0f}s elapsed, {rem}s left — waiting... ",
                      end="", flush=True)
    except Exception as e:
        # Network/stream failure is reported but treated like a timeout.
        print(f"\r [{label}] SSE error: {e}{' ' * 30}")

    print(f"\r [{label}] TIMEOUT after {timeout_s}s{' ' * 30}")
    return None
|
||||
|
||||
|
||||
# ── wiki parsing ───────────────────────────────────────────────────────────────
|
||||
|
||||
def slugify(name: str) -> str:
    """Turn *name* into a lowercase, hyphen-separated slug capped at 60 chars."""
    cleaned = re.sub(r"[^\w\s-]", "", name.lower())
    hyphenated = re.sub(r"\s+", "-", cleaned.strip())
    return hyphenated[:60]
|
||||
|
||||
|
||||
def parse_wiki_file(path: Path):
    """Extract (title, context) from a wiki markdown page.

    The title is the first "# " heading within the first 50 lines; the
    context is up to 20 subsequent non-empty, non-image lines joined with
    newlines. Returns None when the file cannot be read or has no heading.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except Exception:
        return None

    title = None
    snippets: list[str] = []

    for raw in content.splitlines()[:50]:
        line = raw.strip()
        if not title:
            if line.startswith("# "):
                title = line[2:].strip()
            continue
        # Skip markdown image embeds, plain or link-wrapped.
        if line.startswith(("[![", "![")):
            continue
        if line:
            snippets.append(line)
        if len(snippets) >= 20:
            break

    if not title:
        return None
    return title, "\n".join(snippets)
|
||||
|
||||
|
||||
def discover_subjects(wiki_root: Path):
    """Collect (name, context, subdir) triples from the people/places folders."""
    found = []
    for category in ("люди", "места"):
        directory = wiki_root / category
        if not directory.exists():
            continue
        for page in sorted(directory.glob("*.md")):
            parsed = parse_wiki_file(page)
            if parsed:
                title, context = parsed
                found.append((title, context, category))
    return found
|
||||
|
||||
|
||||
# ── output ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def load_existing_names(output_path: Path) -> set:
    """Return the set of '## Name' headings already present in the output file."""
    if not output_path.exists():
        return set()
    content = output_path.read_text(encoding="utf-8")
    return set(re.findall(r"^## (.+)$", content, re.MULTILINE))
|
||||
|
||||
|
||||
def init_output(output_path: Path, total: int):
    """Write the results-file header, unless the file already exists."""
    if output_path.exists():
        return
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    header = (
        f"# Wiki Research Results\n\n"
        f"Generated: {stamp}\n"
        f"Subjects: {total}\n\n---\n\n"
    )
    output_path.write_text(header, encoding="utf-8")
|
||||
|
||||
|
||||
def append_result(output_path: Path, name: str, elapsed: float, reply_text: str):
    """Append one '## name' research section to the aggregate markdown file."""
    today = datetime.now().strftime("%Y-%m-%d")
    body = reply_text or '_No reply captured._'
    section = (
        f"## {name}\n\n"
        f"**Searched**: {today} **Elapsed**: {elapsed:.0f}s\n\n"
        f"{body}\n\n---\n\n"
    )
    with open(output_path, "a", encoding="utf-8") as fh:
        fh.write(section)
|
||||
|
||||
|
||||
# ── research prompt ────────────────────────────────────────────────────────────
|
||||
|
||||
def build_prompt(name: str, context: str, subdir: str) -> str:
    """Assemble the /think research prompt for one wiki subject.

    Subjects from the "люди" folder are researched as a person, everything
    else as a place.
    """
    kind = "person" if subdir == "люди" else "place"
    parts = [
        f"/think You are researching a {kind} for a private family wiki. ",
        "Find everything publicly available. Be thorough and specific.\n\n",
        f"**Subject**: {name}\n",
        f"**Known context** (from the family wiki — do NOT just repeat this):\n{context}\n\n",
        "**Research instructions** (MUST follow exactly):\n",
        "1. Call web_search, then IMMEDIATELY call fetch_url on every URL found in results.\n",
        "2. You MUST call fetch_url at least 5 times — do not write the report until you have.\n",
        "3. Priority URLs to fetch: Google Scholar profile, ResearchGate, IEEE Xplore, LinkedIn, employer page.\n",
        "4. Run searches in English AND Russian/Latvian.\n",
        "5. After fetching pages, derive follow-up searches from what you find.\n\n",
        "**Output format** (required):\n",
        "- Use markdown with sections: Overview, Education, Career, Publications, ",
        "Online Presence, Interesting Findings, Not Found\n",
        "- Every fact must have a source link: [fact](url)\n",
        "- Include actual URLs to profiles, papers, articles found\n",
        "- 'Interesting Findings': non-trivial facts not in the wiki context above\n",
        "- Last line must be: **Sources checked: N** (count of URLs you fetched with fetch_url)\n\n",
        'If truly nothing is found publicly, say "No public information found." ',
        "but only after exhausting all search angles.",
    ]
    return "".join(parts)
|
||||
|
||||
|
||||
# ── main ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run the research pipeline: discover subjects, query the agent, aggregate results.

    Flow: discover wiki subjects → optionally filter by --subject → (dry-run:
    print prompts and stop) → initialize/append to the output file, skipping
    subjects already recorded there → per subject, POST the prompt and wait
    for the SSE reply → print a final summary.
    """
    parser = argparse.ArgumentParser(description="Wiki research pipeline")
    parser.add_argument("--subject", help="Single subject (substring match)")
    parser.add_argument("--dry-run", action="store_true", help="Print prompts, don't send")
    parser.add_argument("--timeout", type=int, default=300, help="Per-subject timeout (s)")
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT, help="Output file")
    args = parser.parse_args()

    subjects = discover_subjects(WIKI_ROOT)
    if not subjects:
        print(f"[{FAIL}] No subjects found in {WIKI_ROOT}")
        sys.exit(1)

    print(f"[{INFO}] Discovered {len(subjects)} subjects")

    # Optional case-insensitive substring filter on subject names.
    if args.subject:
        needle = args.subject.lower()
        subjects = [(n, c, s) for n, c, s in subjects if needle in n.lower()]
        if not subjects:
            print(f"[{FAIL}] No subject matching '{args.subject}'")
            sys.exit(1)
        print(f"[{INFO}] Filtered to {len(subjects)} subject(s)")

    # Dry run: show what would be sent, touch nothing, exit.
    if args.dry_run:
        for name, context, subdir in subjects:
            print(f"\n{'='*60}\nSUBJECT: {name} ({subdir})")
            print(f"PROMPT:\n{build_prompt(name, context, subdir)}")
        return

    # Output file is append-only; already-recorded names are skipped below,
    # which makes re-runs resumable.
    init_output(args.output, len(subjects))
    existing = load_existing_names(args.output)
    print(f"[{INFO}] Output: {args.output} ({len(existing)} already done)")

    total = len(subjects)
    done = 0
    failed = []

    for idx, (name, context, subdir) in enumerate(subjects, 1):
        if name in existing:
            print(f"[{idx}/{total}] SKIP {name} (already in output)")
            done += 1
            continue

        prompt = build_prompt(name, context, subdir)
        # One gateway session per subject, keyed by slug, so replies don't mix.
        session_id = f"wiki-{slugify(name)}"
        label = f"{idx}/{total}"

        print(f"\n[{label}] {name}")

        try:
            status = post_message(prompt, session_id, timeout=10)
            # Gateway acknowledges accepted messages with 202.
            if status != 202:
                print(f" [{FAIL}] Unexpected status {status}")
                failed.append(name)
                continue
        except Exception as e:
            print(f" [{FAIL}] POST failed: {e}")
            failed.append(name)
            continue

        t0 = time.monotonic()
        reply_text = wait_for_reply(label, session_id, timeout_s=args.timeout)
        elapsed = time.monotonic() - t0

        if reply_text is None:
            # Timed out: record a placeholder so the subject is not retried
            # on the next run (it will appear in `existing`).
            print(f" [{FAIL}] Timeout")
            failed.append(name)
            append_result(args.output, name, elapsed, "_Research timed out._")
            continue

        print(f" [{PASS}] {elapsed:.0f}s — {len(reply_text)} chars")
        append_result(args.output, name, elapsed, reply_text)
        done += 1

    # Final summary (done includes skipped subjects).
    print(f"\n{'='*60}")
    print(f"Done: {done}/{total}")
    if failed:
        print(f"Failed ({len(failed)}): {', '.join(failed)}")
    print(f"Output: {args.output}")
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user