"""Smoke test for profile_schema mirror (#81 phase A, #61 freshness spec). The TS registry in services/api/src/profile/registry.ts is the source of truth. This test checks the names listed here match the registry by reading the TS file and grepping for `name: '...'`. Crude but cheap, and it catches the common rename/add-without-mirror failure mode. Also verifies invalidated_by subjects mirror the TS invalidatedBy arrays (#61). """ from __future__ import annotations import re from pathlib import Path from ml.features.profile_schema import PROFILE_FEATURES, feature_names REGISTRY_PATH = Path(__file__).resolve().parents[2] / "services" / "api" / "src" / "profile" / "registry.ts" _HOUR = 3600 _DAY = 86_400 # Expected ttl_sec values mirrored from registry.ts — keeps the two in sync. _EXPECTED_TTL: dict[str, int] = { "completion_rate_30d": 6 * _HOUR, "dismiss_rate_30d": 6 * _HOUR, "mean_dwell_ms_30d": 6 * _HOUR, "preferred_hour": _DAY, "tip_volume_30d": _HOUR, } def _ts_registry_names() -> set[str]: text = REGISTRY_PATH.read_text(encoding="utf-8") # Each FEATURES entry has `name: 'something_30d',`. Extract every match. return set(re.findall(r"name:\s*'([a-zA-Z0-9_]+)'", text)) def _ts_registry_ttls() -> dict[str, int]: """Parse ttlSec values from registry.ts (crude but sufficient for drift detection). Handles TS symbolic constants (HOUR, DAY) and expressions like ``6 * HOUR``. """ text = REGISTRY_PATH.read_text(encoding="utf-8") # Extract numeric constants: `const HOUR = 3600;` or `const DAY = 86_400;` consts: dict[str, int] = {} for m in re.finditer(r"const\s+([A-Z_]+)\s*=\s*([\d_]+)", text): consts[m.group(1)] = int(m.group(2).replace("_", "")) def _eval_expr(expr: str) -> int: tokens = [t.strip() for t in expr.split("*")] result = 1 for t in tokens: result *= consts[t] if t in consts else int(t) return result result: dict[str, int] = {} for block in re.split(r"\{", text): name_m = re.search(r"name:\s*'([a-zA-Z0-9_]+)'", block) # ttlSec may be a constant name, a number, or `N * CONST` ttl_m = re.search(r"ttlSec:\s*([A-Za-z0-9_]+(?:\s*\*\s*[A-Za-z0-9_]+)?)", block) if name_m and ttl_m: result[name_m.group(1)] = _eval_expr(ttl_m.group(1)) return result def test_python_mirror_matches_ts_registry(): py_names = feature_names() ts_names = _ts_registry_names() assert py_names == ts_names, ( f"Profile feature names drifted between TS registry and Python mirror.\n" f" in Python only: {sorted(py_names - ts_names)}\n" f" in TS only: {sorted(ts_names - py_names)}" ) def test_profile_schema_no_duplicates(): names = [f.name for f in PROFILE_FEATURES] assert len(names) == len(set(names)), f"duplicate names: {names}" def test_profile_schema_dtypes_known(): for f in PROFILE_FEATURES: assert f.dtype in {"numeric", "categorical"} def test_all_profile_features_are_batched(): for f in PROFILE_FEATURES: assert f.freshness == "batched", f"{f.name}: expected freshness='batched', got {f.freshness!r}" def test_profile_feature_ttl_matches_ts_registry(): ts_ttls = _ts_registry_ttls() for f in PROFILE_FEATURES: assert f.name in ts_ttls, f"{f.name} not found in TS registry ttlSec parse" assert f.ttl_sec == ts_ttls[f.name], ( f"{f.name}: Python ttl_sec={f.ttl_sec} != TS ttlSec={ts_ttls[f.name]}" ) def test_profile_feature_ttl_matches_expected(): for f in PROFILE_FEATURES: assert f.ttl_sec == _EXPECTED_TTL[f.name], ( f"{f.name}: ttl_sec={f.ttl_sec}, expected {_EXPECTED_TTL[f.name]}" ) def test_profile_feature_source_is_profile_store(): for f in PROFILE_FEATURES: assert f.source == "profile_store", f"{f.name}: unexpected source {f.source!r}" def test_profile_feature_fallback_set(): for f in PROFILE_FEATURES: assert f.fallback, f"{f.name}: fallback must not be empty" def _ts_registry_invalidated_by() -> dict[str, list[str]]: """Parse invalidatedBy arrays from registry.ts. Extracts subjects from blocks like: invalidatedBy: ['signals.tip.feedback'], Returns {feature_name: [subject, ...]}; features with no invalidatedBy get []. """ text = REGISTRY_PATH.read_text(encoding="utf-8") result: dict[str, list[str]] = {} for block in re.split(r"\{", text): name_m = re.search(r"name:\s*'([a-zA-Z0-9_]+)'", block) if not name_m: continue name = name_m.group(1) inv_m = re.search(r"invalidatedBy:\s*\[([^\]]*)\]", block) if inv_m: subjects = re.findall(r"'([^']+)'", inv_m.group(1)) else: subjects = [] result[name] = subjects return result def test_invalidated_by_matches_ts_registry(): ts_inv = _ts_registry_invalidated_by() for f in PROFILE_FEATURES: assert f.name in ts_inv, f"{f.name} not found in TS registry invalidatedBy parse" expected = tuple(sorted(ts_inv[f.name])) actual = tuple(sorted(f.invalidated_by)) assert actual == expected, ( f"{f.name}: Python invalidated_by={actual} != TS invalidatedBy={expected}" )