"""Schema dataclasses for EpcMlTransform — the cross-repo ML data contract. Consumed by the AutoGluon training repo (and by anything else that reads the transform's parquet output) to know each column's dtype, nullability, and meaning. """ from dataclasses import dataclass @dataclass(frozen=True) class ColumnSpec: """Specification of a single column in the EPC ML training dataset. `categorical=True` signals that the column carries a categorical value (raw strings emitted by the transform) and should be cast to `pd.Categorical` at parquet write time. The schema module stays pandas-free; the cast happens at the I/O boundary in `services/ml_training_data/`. """ dtype: type nullable: bool = True description: str = "" categorical: bool = False @dataclass(frozen=True) class TransformSchema: """The cross-repo ML data contract emitted by EpcMlTransform.schema().""" transform_version: str feature_columns: dict[str, ColumnSpec] target_columns: dict[str, ColumnSpec]