# ml/features/profile_schema.py

"""Profile-feature schema mirror (#81 phase A).

The TypeScript registry in ``services/api/src/profile/registry.ts`` is the
*source of truth* — features are computed there because the data lives in the
TS-owned SQLite DB. This module is a documentation/typing mirror so Python
code (ml/serving, eval harnesses, notebooks) knows what fields to expect on
``profile_features`` payloads without round-tripping the API.

Update this file whenever you add or rename a feature in the TS registry.
The accompanying test asserts the two stay in sync at the name level.

Feature-spec fields (issue #61):
  freshness — "batched": value cached in profile store, recomputed on TTL/event.
  ttl_sec   — cache lifetime in seconds; mirrors ``ttlSec`` in registry.ts.
  source    — where the value originates.
  fallback  — raw value returned when the feature is unavailable (null stored).
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Literal


# Closed vocabularies for the ``dtype`` and ``freshness`` spec fields.
Dtype = Literal["numeric", "categorical"]
Freshness = Literal["jit", "batched"]

# Durations in seconds, so ``ttl_sec`` values can be spelled in hours/days.
_HOUR = 60 * 60
_DAY = 24 * _HOUR


@dataclass(frozen=True)
class ProfileFeature:
    """Immutable spec for a single entry of the profile-feature schema mirror.

    Instances are frozen, so a spec cannot be mutated after construction.
    Field meanings follow the feature-spec notes in the module docstring.
    """

    # Feature identifier; the module docstring says the sync test compares
    # these names against the TS registry.
    name: str
    # Value kind: "numeric" or "categorical".
    dtype: Dtype
    # Human-readable explanation of what the feature measures.
    description: str
    # "batched": value cached in the profile store, recomputed on TTL/event.
    # NOTE(review): "jit" is accepted by the type but not described in this
    # file — confirm its meaning against registry.ts.
    freshness: Freshness
    # Cache lifetime in seconds; mirrors ``ttlSec`` in registry.ts.
    ttl_sec: int
    # Where the value originates.
    source: str
    # Textual description of the raw value returned when the feature is
    # unavailable.
    fallback: str

# Every feature listed here is a numeric, batched value sourced from the
# profile store, so only the per-feature columns are tabulated:
# (name, description, ttl_sec, fallback). Row order is the order of the
# resulting tuple.
PROFILE_FEATURES: tuple[ProfileFeature, ...] = tuple(
    ProfileFeature(
        name=feature_name,
        dtype="numeric",
        description=summary,
        freshness="batched",
        ttl_sec=lifetime,
        source="profile_store",
        fallback=missing_value,
    )
    for feature_name, summary, lifetime, missing_value in (
        (
            "completion_rate_30d",
            'Fraction of tips served in the last 30 days that received a "done" reaction.',
            6 * _HOUR,
            "0.0",
        ),
        (
            "dismiss_rate_30d",
            'Fraction of tips served in the last 30 days that received a "dismiss" reaction.',
            6 * _HOUR,
            "0.0",
        ),
        (
            "mean_dwell_ms_30d",
            "Average dwell time (ms between served and reacted) over the last 30 days.",
            6 * _HOUR,
            "null — serving normalises to 0.0",
        ),
        (
            "preferred_hour",
            'Hour-of-day with the most "done" reactions in the last 30 days (0–23).',
            _DAY,
            "null — serving normalises to 0.5 (neutral alignment)",
        ),
        (
            "tip_volume_30d",
            "Number of tips served to the user in the last 30 days.",
            _HOUR,
            "0",
        ),
    )
)


def feature_names() -> set[str]:
    """Return the set of ``name`` values declared in ``PROFILE_FEATURES``.

    A fresh set is built on every call, so callers may mutate the result.
    """
    names: set[str] = set()
    for feature in PROFILE_FEATURES:
        names.add(feature.name)
    return names