fix(infra): mlflow image tag + python-based healthchecks for ml-serving/mlflow

- Corrects mlflow image tag (2.14.3 → v2.14.3); the former tag does not exist
  on ghcr.io/mlflow/mlflow and caused a manifest-unknown error on pull.
- Replaces wget/curl healthchecks with inline Python urllib.request calls —
  the python:3.12-slim (ml-serving) and ghcr.io/mlflow/mlflow images ship
  neither wget nor curl, so the healthcheck command itself failed to run
  and both containers were reported unhealthy even though /health was
  returning 200.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 15:04:18 +00:00
parent bb879c5f0f
commit d7a2423940

View File

@@ -70,7 +70,7 @@ services:
ports:
- "127.0.0.1:8000:8000"
healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8000/health"]
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5
@@ -219,7 +219,7 @@ services:
retries: 5
mlflow:
image: ghcr.io/mlflow/mlflow:2.14.3
image: ghcr.io/mlflow/mlflow:v2.14.3
profiles: [mlops]
command: >
mlflow server
@@ -237,7 +237,7 @@ services:
ports:
- "127.0.0.1:5000:5000"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:5000/health"]
test: ["CMD", "python", "-c", "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:5000/health',timeout=3).status==200 else 1)"]
interval: 10s
timeout: 5s
retries: 5