Files
oO/ml/features/test_profile_schema.py
alvis 17b9516903 feat(features): mirror invalidatedBy into Python ProfileFeature (#61)
Adds invalidated_by: tuple[str, ...] to ProfileFeature, mirroring the
invalidatedBy bus subjects from registry.ts. Adds a test that parses the
TS source and asserts Python stays in sync — same drift-detection pattern
used for names and ttlSec.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 07:10:36 +00:00

150 lines
5.2 KiB
Python

"""Smoke test for profile_schema mirror (#81 phase A, #61 freshness spec).
The TS registry in services/api/src/profile/registry.ts is the source of truth.
This test checks that the feature names declared in ml.features.profile_schema
match the registry by reading the TS file and grepping for `name: '...'`.
Crude but cheap, and it catches the common rename/add-without-mirror failure mode.
Also verifies invalidated_by subjects mirror the TS invalidatedBy arrays (#61).
"""
from __future__ import annotations
import re
from pathlib import Path
from ml.features.profile_schema import PROFILE_FEATURES, feature_names
# Location of the TypeScript feature registry, resolved relative to this file:
# two directories above the one that contains this test module.
REGISTRY_PATH = Path(__file__).resolve().parents[2].joinpath(
    "services", "api", "src", "profile", "registry.ts"
)

# Durations in seconds, used to spell out the TTL table below.
_HOUR = 60 * 60
_DAY = 24 * _HOUR

# Expected ttl_sec per feature, mirrored from registry.ts so the two stay in sync.
_EXPECTED_TTL: dict[str, int] = {
    "completion_rate_30d": 6 * _HOUR,
    "dismiss_rate_30d": 6 * _HOUR,
    "mean_dwell_ms_30d": 6 * _HOUR,
    "preferred_hour": _DAY,
    "tip_volume_30d": _HOUR,
}
def _ts_registry_names() -> set[str]:
    """Return every feature name found in the TS registry file.

    Reads REGISTRY_PATH as UTF-8 and collects the identifier from each
    single-quoted ``name: '...'`` occurrence; repeated names collapse
    because the result is a set.
    """
    source = REGISTRY_PATH.read_text(encoding="utf-8")
    name_pattern = re.compile(r"name:\s*'([a-zA-Z0-9_]+)'")
    return {match.group(1) for match in name_pattern.finditer(source)}
def _ts_registry_ttls() -> dict[str, int]:
    """Parse ``ttlSec`` values from registry.ts (crude, but enough for drift detection).

    Both ``const`` declarations and ``ttlSec`` values may be an integer
    literal (``3600``, ``86_400``), a previously declared constant (``HOUR``),
    or a product of any number of those (``6 * HOUR``, ``7 * 24 * HOUR``).

    Returns:
        Mapping of feature name to TTL in seconds, one entry for every
        ``{``-delimited chunk of the file that contains both a ``name`` and a
        ``ttlSec``.

    Raises:
        ValueError: if a ``ttlSec`` expression contains a factor that is
            neither an integer literal nor a known constant.
    """
    text = REGISTRY_PATH.read_text(encoding="utf-8")

    # A product of identifiers / integer literals: `A`, `6 * A`, `7 * 24 * A`, ...
    product = r"[A-Za-z0-9_]+(?:\s*\*\s*[A-Za-z0-9_]+)*"

    consts: dict[str, int] = {}

    def _eval_product(expr: str) -> int:
        value = 1
        for factor in expr.split("*"):
            factor = factor.strip()
            # A declared constant wins; anything else must be an integer
            # literal (numeric separators such as 86_400 are stripped first).
            value *= consts[factor] if factor in consts else int(factor.replace("_", ""))
        return value

    # Constants are evaluated in file order so that later ones can build on
    # earlier ones, e.g. `const DAY = 24 * HOUR;`.
    for const_m in re.finditer(rf"const\s+([A-Z_]+)\s*=\s*({product})", text):
        try:
            consts[const_m.group(1)] = _eval_product(const_m.group(2))
        except ValueError:
            # Not a numeric constant this parser can evaluate; leave it out.
            continue

    ttls: dict[str, int] = {}
    for block in text.split("{"):
        name_m = re.search(r"name:\s*'([a-zA-Z0-9_]+)'", block)
        # ttlSec may be a constant name, a number, or a product of them.
        ttl_m = re.search(rf"ttlSec:\s*({product})", block)
        if not (name_m and ttl_m):
            continue
        name, expr = name_m.group(1), ttl_m.group(1)
        try:
            ttls[name] = _eval_product(expr)
        except ValueError as err:
            raise ValueError(
                f"cannot evaluate ttlSec expression {expr!r} for feature {name!r}"
            ) from err
    return ttls
def test_python_mirror_matches_ts_registry():
    """The Python mirror and the TS registry must declare the same feature names."""
    mirrored = feature_names()
    registry = _ts_registry_names()
    # The set differences are only computed when the assertion fails.
    assert mirrored == registry, "\n".join(
        (
            "Profile feature names drifted between TS registry and Python mirror.",
            f" in Python only: {sorted(mirrored - registry)}",
            f" in TS only: {sorted(registry - mirrored)}",
        )
    )
def test_profile_schema_no_duplicates():
    """No feature name may appear more than once in PROFILE_FEATURES."""
    names = [feature.name for feature in PROFILE_FEATURES]
    distinct = set(names)
    # A duplicate makes the set strictly smaller than the list.
    assert len(distinct) == len(names), f"duplicate names: {names}"
def test_profile_schema_dtypes_known():
    """Every feature's dtype must be one of the dtypes this test knows about."""
    known_dtypes = {"numeric", "categorical"}
    for f in PROFILE_FEATURES:
        # Name the offending feature, as the sibling per-feature tests do.
        assert f.dtype in known_dtypes, (
            f"{f.name}: unknown dtype {f.dtype!r}, expected one of {sorted(known_dtypes)}"
        )
def test_all_profile_features_are_batched():
    """Every feature in PROFILE_FEATURES must have freshness 'batched'."""
    for feature in PROFILE_FEATURES:
        freshness = feature.freshness
        assert freshness == "batched", (
            f"{feature.name}: expected freshness='batched', got {freshness!r}"
        )
def test_profile_feature_ttl_matches_ts_registry():
    """Each feature's ttl_sec must equal the ttlSec parsed from the TS registry."""
    registry_ttls = _ts_registry_ttls()
    for feature in PROFILE_FEATURES:
        name = feature.name
        # A missing entry means the parser found no ttlSec for this feature.
        assert name in registry_ttls, f"{name} not found in TS registry ttlSec parse"
        ts_ttl = registry_ttls[name]
        assert feature.ttl_sec == ts_ttl, (
            f"{name}: Python ttl_sec={feature.ttl_sec} != TS ttlSec={ts_ttl}"
        )
def test_profile_feature_ttl_matches_expected():
    """Each feature's ttl_sec must equal its entry in the _EXPECTED_TTL table."""
    for f in PROFILE_FEATURES:
        # Fail with a readable message, not a bare KeyError, when a feature
        # has been added to the schema but not to the expectation table.
        assert f.name in _EXPECTED_TTL, (
            f"{f.name}: no entry in _EXPECTED_TTL; add its expected ttl_sec to this test"
        )
        expected = _EXPECTED_TTL[f.name]
        assert f.ttl_sec == expected, (
            f"{f.name}: ttl_sec={f.ttl_sec}, expected {expected}"
        )
def test_profile_feature_source_is_profile_store():
    """Every feature in PROFILE_FEATURES must have source 'profile_store'."""
    for feature in PROFILE_FEATURES:
        source = feature.source
        assert source == "profile_store", f"{feature.name}: unexpected source {source!r}"
def test_profile_feature_fallback_set():
    """Every feature in PROFILE_FEATURES must carry a truthy fallback."""
    for feature in PROFILE_FEATURES:
        fallback = feature.fallback
        assert fallback, f"{feature.name}: fallback must not be empty"
def _ts_registry_invalidated_by() -> dict[str, list[str]]:
    """Collect the ``invalidatedBy`` subjects per feature from registry.ts.

    The file is split on ``{``; each chunk that contains a single-quoted
    ``name: '...'`` contributes one entry. Subjects are the single-quoted
    strings inside that chunk's array, e.g.:
        invalidatedBy: ['signals.tip.feedback'],

    Returns {feature_name: [subject, ...]}; a feature whose chunk has no
    ``invalidatedBy`` array maps to [].
    """
    source = REGISTRY_PATH.read_text(encoding="utf-8")
    name_re = re.compile(r"name:\s*'([a-zA-Z0-9_]+)'")
    array_re = re.compile(r"invalidatedBy:\s*\[([^\]]*)\]")
    subject_re = re.compile(r"'([^']+)'")

    subjects_by_feature: dict[str, list[str]] = {}
    for chunk in source.split("{"):
        name_match = name_re.search(chunk)
        if name_match is None:
            continue
        array_match = array_re.search(chunk)
        # With no array, search an empty body so the result is [].
        array_body = array_match.group(1) if array_match else ""
        subjects_by_feature[name_match.group(1)] = subject_re.findall(array_body)
    return subjects_by_feature
def test_invalidated_by_matches_ts_registry():
    """Each feature's invalidated_by must match the TS invalidatedBy subjects."""
    registry_subjects = _ts_registry_invalidated_by()
    for feature in PROFILE_FEATURES:
        name = feature.name
        assert name in registry_subjects, f"{name} not found in TS registry invalidatedBy parse"
        # Both sides are sorted, so ordering differences do not count as drift.
        expected = tuple(sorted(registry_subjects[name]))
        actual = tuple(sorted(feature.invalidated_by))
        assert actual == expected, (
            f"{name}: Python invalidated_by={actual} != TS invalidatedBy={expected}"
        )