chore: remove Airflow completely from the stack

Drop all four Airflow containers (db, init, webserver, scheduler) from the
mlops compose profile, leaving MLflow as the sole mlops service. Remove
AIRFLOW_* env vars, config fields, health-check entries, DAG trigger code
in admin/bench routes, the airflow_dag_run_id schema column, Airflow nav
links and DAG-run links in the admin UI, the two Airflow DAG files
(bench_dag.py, sim_dag.py), and all related docs/ADR references.
Simulations now run exclusively via the subprocess path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-03 16:38:46 +00:00
parent ce1c8bde57
commit f8d66aa01f
27 changed files with 663 additions and 719 deletions

View File

@@ -1,168 +0,0 @@
"""
Airflow DAG: bench_collect
Runs the tip-generation benchmark (model × prompt evaluation). Triggered
on-demand from the admin UI or manually, collects candidates per cell,
exports for Claude Code judgment, and generates a leaderboard.
Mirrors the manual flow:
1. collect.py → generates candidates, logs to MLflow with judge_pending=true
2. (human: judge_cli.py --export, Claude Code scores, judge_cli.py --apply)
3. compare.py → leaderboard
For now, step 2 is manual. Future: add a webhook to trigger the human
judge from the admin UI or set up an async task queue.
Required conf keys (passed via dag_run.conf):
models str — comma-separated model tags (e.g. "qwen2.5:0.5b,qwen2.5:1.5b")
prompts str — comma-separated prompt versions (default: "v1,v2-mentor,v3-few-shot")
n_tips int — candidates to generate per scenario (default: 5)
n_scenarios int — cap scenario count; 0 = all (default: 0)
temperature float — LLM generation temperature (default: 0.7)
experiment str — MLflow experiment name (default: "tip-bench-auto")
max_model_b float — reject models larger than this (default: 4.0)
ollama_url str — Ollama endpoint (default: http://localhost:11434)
mlflow_url str — MLflow tracking URI (env MLFLOW_TRACKING_URI or http://localhost:5000)
"""
from __future__ import annotations
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
from airflow import DAG
from airflow.operators.python import PythonOperator
def _collect(**context: object) -> dict:
    """Run the candidate-collection step by calling ``collect.main``.

    Parameters are read from ``dag_run.conf`` (with the defaults shown
    below), rendered as a ``collect.py`` command line and exposed through
    ``sys.argv`` for the duration of the call, because ``collect.main`` is
    invoked without arguments and parses ``sys.argv`` itself.

    Returns:
        A dict with ``status`` ("success" when ``main`` returned 0, "failed"
        otherwise), the raw ``exit_code`` and the ``experiment`` name.
    """
    settings: dict = context["dag_run"].conf or {}

    model_tags = str(settings.get("models", "qwen2.5:0.5b,qwen2.5:1.5b,gemma3:1b,llama3.2:3b"))
    prompt_versions = str(settings.get("prompts", "v1,v2-mentor,v3-few-shot"))
    tips_per_scenario = int(settings.get("n_tips", 5))
    scenario_cap = int(settings.get("n_scenarios", 0))
    sampling_temp = float(settings.get("temperature", 0.7))
    experiment_name = str(settings.get("experiment", "tip-bench-auto"))
    size_limit_b = float(settings.get("max_model_b", 4.0))
    ollama_endpoint = str(
        settings.get("ollama_url", os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    )
    tracking_uri = str(
        settings.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
    )

    # The bench scripts live outside the DAG folder; make them importable.
    sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
    from collect import main as collect_main  # type: ignore

    cli_args = [
        "--models", model_tags,
        "--prompts", prompt_versions,
        "--experiment", experiment_name,
        "--n-tips", str(tips_per_scenario),
        "--temperature", str(sampling_temp),
        "--max-model-b", str(size_limit_b),
        "--ollama-url", ollama_endpoint,
        "--mlflow-url", tracking_uri,
    ]
    # A cap of 0 means "all scenarios", so the flag is only passed when positive.
    if scenario_cap > 0:
        cli_args += ["--n-scenarios", str(scenario_cap)]

    # Swap sys.argv for the call and always restore the caller's value.
    saved_argv = sys.argv
    sys.argv = ["collect.py", *cli_args]
    try:
        exit_code = collect_main()
    finally:
        sys.argv = saved_argv

    if exit_code == 0:
        status = "success"
    else:
        status = "failed"
    return {
        "status": status,
        "exit_code": exit_code,
        "experiment": experiment_name,
    }
def _compare(**context: object) -> dict:
    """Build the leaderboard by calling ``compare.main``.

    The experiment name and MLflow URL come from ``dag_run.conf`` (or their
    defaults) and are handed to ``compare.main`` through a temporary
    ``sys.argv``, since ``main`` is invoked without arguments.

    Returns:
        A dict with ``status`` ("success" when ``main`` returned 0, "failed"
        otherwise), the raw ``exit_code`` and the ``experiment`` name.
    """
    settings: dict = context["dag_run"].conf or {}
    experiment_name = str(settings.get("experiment", "tip-bench-auto"))
    tracking_uri = str(
        settings.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
    )

    # The bench scripts live outside the DAG folder; make them importable.
    sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
    from compare import main as compare_main  # type: ignore

    # Swap sys.argv for the call and always restore the caller's value.
    saved_argv = sys.argv
    sys.argv = ["compare.py", "--experiment", experiment_name, "--mlflow-url", tracking_uri]
    try:
        exit_code = compare_main()
    finally:
        sys.argv = saved_argv

    outcome = {"status": "failed", "exit_code": exit_code, "experiment": experiment_name}
    if exit_code == 0:
        outcome["status"] = "success"
    return outcome
def _export_for_judge(**context: object) -> str:
    """Export pending runs for Claude Code judgment.

    Builds an ``MLflowClient`` from ``dag_run.conf`` / environment settings,
    calls ``judge_cli.export`` to write the export file under ``/tmp``, and
    publishes the file path via XCom under the key ``export_path``.

    The experiment name is supplied through ``dag_run.conf``, so it is
    reduced to filename-safe characters before being embedded in the path;
    names made only of letters, digits, ``.``, ``_`` and ``-`` produce the
    same path as before. The unmodified name is still what gets passed to
    ``export``.

    Returns:
        The path of the export file.
    """
    import re  # local import: only this task needs it

    conf: dict = context["dag_run"].conf or {}
    experiment = str(conf.get("experiment", "tip-bench-auto"))
    mlflow_url = str(conf.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")))

    # Collapse anything that could act as a path separator or otherwise be
    # unsafe in a file name, so the export always lands directly in /tmp.
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", experiment)
    started_at = int(context["ti"].start_date.timestamp())
    export_path = f"/tmp/oo-bench-{safe_name}-{started_at}.json"

    sys.path.insert(0, "/opt/airflow/ml/experiments/bench")
    from judge_cli import export  # type: ignore
    from mlflow_client import MLflowClient  # type: ignore

    # NOTE(review): the fallback credentials below are hard-coded; consider
    # requiring the environment variables instead.
    client = MLflowClient(
        tracking_uri=mlflow_url,
        username=os.environ.get("MLFLOW_TRACKING_USERNAME") or "admin",
        password=os.environ.get("MLFLOW_TRACKING_PASSWORD") or "password",
    )
    export(client, experiment, export_path)

    # XCom: push path so next task can find it
    context["ti"].xcom_push(key="export_path", value=export_path)
    return export_path
# bench_collect: unscheduled DAG (schedule_interval=None) that runs
# collect -> export_for_judge -> compare. Each task gets one retry after a
# five-minute delay.
with DAG(
    dag_id="bench_collect",
    description="Tip-generation benchmark: model & prompt evaluation via MLflow",
    schedule_interval=None,
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=["bench", "ml", "evaluation"],
    default_args={
        "retries": 1,
        "retry_delay": timedelta(minutes=5),
    },
) as dag:
    # provide_context is intentionally not passed: on Airflow 2 the task
    # context is always supplied to callables that accept **kwargs, and the
    # flag only triggers a deprecation warning.
    collect = PythonOperator(
        task_id="collect",
        python_callable=_collect,
    )
    export_judge = PythonOperator(
        task_id="export_for_judge",
        python_callable=_export_for_judge,
    )
    compare = PythonOperator(
        task_id="compare",
        python_callable=_compare,
    )

    collect >> export_judge >> compare

View File

@@ -1,124 +0,0 @@
"""
Airflow DAG: bandit_sim
Runs a bandit policy simulation and logs results to MLflow.
Triggered on-demand from the oO admin panel or manually from the Airflow UI.
Required conf keys (passed via dag_run.conf):
sim_run_id str — oO SQLite run ID for callback correlation
n_users int — number of synthetic users
n_rounds int — rounds per user
tasks_per_round int — candidate pool size per round
policies list — policy names to compare
judge_mode str — "rule" | "llm"
ml_url str — ml/serving URL (e.g. http://ml-serving:8000)
mlflow_url str — MLflow tracking URI (e.g. http://mlflow:5000/mlflow)
callback_url str — oO API callback endpoint
internal_token str — x-internal-token header value
"""
from __future__ import annotations
import json
import os
import sys
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
def _run_sim(**context: object) -> dict:
    """Run the bandit simulation with parameters taken from ``dag_run.conf``.

    Conf keys read here (all optional): ``n_users``, ``n_rounds``,
    ``tasks_per_round``, ``policies``, ``judge_mode``, ``ml_url``,
    ``mlflow_url`` and ``seed``.

    ``policies`` may be a list of policy names or a single comma-separated
    string; a string is split on commas rather than being broken into
    individual characters. ``seed`` defaults to 42.

    Returns:
        Whatever ``runner.run_simulation`` returns, unchanged.
    """
    conf: dict = context["dag_run"].conf or {}
    n_users = int(conf.get("n_users", 5))
    n_rounds = int(conf.get("n_rounds", 20))
    tasks_per_round = int(conf.get("tasks_per_round", 8))

    raw_policies = conf.get("policies", ["linucb-v1", "egreedy-v1"])
    if isinstance(raw_policies, str):
        # list("a,b") would yield single characters; split on commas instead.
        policies = [name.strip() for name in raw_policies.split(",") if name.strip()]
    else:
        policies = list(raw_policies)

    judge_mode = str(conf.get("judge_mode", "rule"))
    ml_url = str(conf.get("ml_url", "http://ml-serving:8000"))
    mlflow_url = str(conf.get("mlflow_url", os.environ.get("MLFLOW_TRACKING_URI", "")))
    seed = int(conf.get("seed", 42))
    mlflow_experiment = "bandit_simulation"

    sys.path.insert(0, "/opt/airflow/ml/experiments/sim")
    from runner import run_simulation  # type: ignore[import]

    return run_simulation(
        n_users=n_users,
        n_rounds=n_rounds,
        tasks_per_round=tasks_per_round,
        ml_url=ml_url,
        policies=policies,
        use_llm=judge_mode == "llm",
        seed=seed,
        # An empty URL is passed on as None.
        mlflow_url=mlflow_url or None,
        mlflow_experiment=mlflow_experiment,
    )
def _callback(**context: object) -> None:
    """POST the ``run_sim`` result to the callback URL from ``dag_run.conf``.

    Nothing is sent when ``callback_url`` or ``internal_token`` is missing
    from the conf, or when the ``run_sim`` task left no XCom result; both
    cases only print a notice. A failed request is printed and re-raised.
    """
    import httpx

    settings: dict = context["dag_run"].conf or {}
    target_url: str = str(settings.get("callback_url", ""))
    token: str = str(settings.get("internal_token", ""))
    if not (target_url and token):
        print("No callback_url or internal_token — skipping result push.", flush=True)
        return

    sim_result: dict = context["ti"].xcom_pull(task_ids="run_sim")
    if not sim_result:
        print("No result from run_sim task — callback skipped.", flush=True)
        return

    # Forward only the known fields, with a fallback for each missing one.
    field_defaults = (
        ("summary", {}),
        ("winner", ""),
        ("persona_breakdown", {}),
        ("events", []),
        ("mlflow_run_id", None),
    )
    body = {field: sim_result.get(field, fallback) for field, fallback in field_defaults}

    try:
        response = httpx.post(
            target_url,
            json=body,
            headers={"x-internal-token": token},
            timeout=30.0,
        )
        response.raise_for_status()
        print(f"Callback OK: {response.status_code}", flush=True)
    except Exception as err:
        print(f"Callback failed: {err}", flush=True)
        raise
# bandit_sim: unscheduled DAG (schedule_interval=None) that runs the
# simulation and then pushes its result to the callback endpoint. Each task
# gets one retry after a two-minute delay.
with DAG(
    dag_id="bandit_sim",
    description="On-demand bandit policy simulation with MLflow tracking",
    schedule_interval=None,
    start_date=datetime(2025, 1, 1),
    catchup=False,
    tags=["bandit", "simulation", "ml"],
    default_args={
        "retries": 1,
        "retry_delay": timedelta(minutes=2),
    },
) as dag:
    # provide_context is intentionally not passed: on Airflow 2 the task
    # context is always supplied to callables that accept **kwargs, and the
    # flag only triggers a deprecation warning.
    run_sim = PythonOperator(
        task_id="run_sim",
        python_callable=_run_sim,
    )
    push_results = PythonOperator(
        task_id="push_results",
        python_callable=_callback,
    )

    run_sim >> push_results