From 9c8aa754699ad4d1e019a14e5efd9b151526520f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 May 2026 15:14:30 +0000 Subject: [PATCH] slice 7: flat categoricals + ColumnSpec.categorical flag Adds seven flat categorical features (dwelling_type, tenure, transaction_type, property_type, built_form, region_code, country_code) emitted as raw strings. New ColumnSpec.categorical bool tells the parquet writer to cast these to pd.Categorical at the I/O boundary, keeping pandas out of the domain/schema module. Co-Authored-By: Claude Opus 4.7 --- packages/domain/src/domain/ml/schema.py | 9 ++- .../domain/src/domain/ml/tests/_fixtures.py | 17 ++++- .../src/domain/ml/tests/test_transform.py | 64 +++++++++++++++++++ packages/domain/src/domain/ml/transform.py | 51 +++++++++++++++ 4 files changed, 137 insertions(+), 4 deletions(-) diff --git a/packages/domain/src/domain/ml/schema.py b/packages/domain/src/domain/ml/schema.py index e2a0a08f..5850a899 100644 --- a/packages/domain/src/domain/ml/schema.py +++ b/packages/domain/src/domain/ml/schema.py @@ -9,11 +9,18 @@ from dataclasses import dataclass @dataclass(frozen=True) class ColumnSpec: - """Specification of a single column in the EPC ML training dataset.""" + """Specification of a single column in the EPC ML training dataset. + + `categorical=True` signals that the column carries a categorical value (raw + strings emitted by the transform) and should be cast to `pd.Categorical` at + parquet write time. The schema module stays pandas-free; the cast happens at + the I/O boundary in `services/ml_training_data/`. + """ dtype: type nullable: bool = True description: str = "" + categorical: bool = False @dataclass(frozen=True) diff --git a/packages/domain/src/domain/ml/tests/_fixtures.py b/packages/domain/src/domain/ml/tests/_fixtures.py index bdab655c..ecc45241 100644 --- a/packages/domain/src/domain/ml/tests/_fixtures.py +++ b/packages/domain/src/domain/ml/tests/_fixtures.py @@ -41,13 +41,20 @@ def make_minimal_sap10_epc( percent_draughtproofed: Optional[int] = None, energy_rating_average: Optional[int] = None, environmental_impact_current: Optional[int] = None, + dwelling_type: str = "Mid-terrace house", + tenure: str = "1", + transaction_type: str = "1", + property_type: Optional[str] = None, + built_form: Optional[str] = None, + region_code: Optional[str] = None, + country_code: Optional[str] = None, ) -> EpcPropertyData: """Construct a minimal valid SAP10 EpcPropertyData with parametrisable targets.""" return EpcPropertyData( - dwelling_type="Mid-terrace house", + dwelling_type=dwelling_type, inspection_date=date(2025, 6, 1), - tenure="1", - transaction_type="1", + tenure=tenure, + transaction_type=transaction_type, address_line_1="1 Test Street", postcode="A1 1AA", post_town="Testtown", @@ -93,6 +100,10 @@ def make_minimal_sap10_epc( percent_draughtproofed=percent_draughtproofed, energy_rating_average=energy_rating_average, environmental_impact_current=environmental_impact_current, + property_type=property_type, + built_form=built_form, + region_code=region_code, + country_code=country_code, renewable_heat_incentive=RenewableHeatIncentive( space_heating_kwh=space_heating_kwh, water_heating_kwh=water_heating_kwh, diff --git a/packages/domain/src/domain/ml/tests/test_transform.py b/packages/domain/src/domain/ml/tests/test_transform.py index a7c667fb..f92c280e 100644 --- a/packages/domain/src/domain/ml/tests/test_transform.py +++ b/packages/domain/src/domain/ml/tests/test_transform.py @@ -239,3 +239,67 @@ def test_to_row_extracts_boolean_and_optional_int_features() -> None: assert row["percent_draughtproofed"] == 100 assert row["energy_rating_average"] == 60 assert row["environmental_impact_current"] == 72 + + +_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = ( + "property_type", + "built_form", + "region_code", + "country_code", +) + + +_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = ( + "dwelling_type", + "tenure", + "transaction_type", +) + + +def test_schema_advertises_categorical_features() -> None: + # Arrange + transform = EpcMlTransform() + + # Act + schema = transform.schema() + + # Assert + for feature_name in _NULLABLE_CATEGORICAL_FEATURES: + assert feature_name in schema.feature_columns, feature_name + column = schema.feature_columns[feature_name] + assert column.dtype is str + assert column.categorical is True + assert column.nullable is True + for feature_name in _NON_NULLABLE_CATEGORICAL_FEATURES: + assert feature_name in schema.feature_columns, feature_name + column = schema.feature_columns[feature_name] + assert column.dtype is str + assert column.categorical is True + assert column.nullable is False + + +def test_to_row_extracts_categorical_features() -> None: + # Arrange + epc = make_minimal_sap10_epc( + energy_rating_current=82, + dwelling_type="End-terrace house", + tenure="3", + transaction_type="8", + property_type="0", + built_form="2", + region_code="6", + country_code="ENG", + ) + transform = EpcMlTransform() + + # Act + row = transform.to_row(epc) + + # Assert + assert row["dwelling_type"] == "End-terrace house" + assert row["tenure"] == "3" + assert row["transaction_type"] == "8" + assert row["property_type"] == "0" + assert row["built_form"] == "2" + assert row["region_code"] == "6" + assert row["country_code"] == "ENG" diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py index 058053a0..9e36b3b2 100644 --- a/packages/domain/src/domain/ml/transform.py +++ b/packages/domain/src/domain/ml/transform.py @@ -92,6 +92,49 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = { nullable=True, description="Environmental impact rating; separate from energy efficiency SAP score.", ), + # Categoricals — emitted as raw strings; downstream casts to pd.Categorical + "dwelling_type": ColumnSpec( + dtype=str, + nullable=False, + categorical=True, + description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.", + ), + "tenure": ColumnSpec( + dtype=str, + nullable=False, + categorical=True, + description="SAP tenure code, stringified int (e.g. '1' owner-occupied).", + ), + "transaction_type": ColumnSpec( + dtype=str, + nullable=False, + categorical=True, + description="SAP transaction type code, stringified int.", + ), + "property_type": ColumnSpec( + dtype=str, + nullable=True, + categorical=True, + description="SAP property type code, stringified int.", + ), + "built_form": ColumnSpec( + dtype=str, + nullable=True, + categorical=True, + description="SAP built-form code, stringified int.", + ), + "region_code": ColumnSpec( + dtype=str, + nullable=True, + categorical=True, + description="SAP region code (stringified int) — coarse climate / fuel-rate proxy.", + ), + "country_code": ColumnSpec( + dtype=str, + nullable=True, + categorical=True, + description="ISO-style country code, e.g. 'ENG', 'WAL', 'EAW'.", + ), } @@ -188,6 +231,14 @@ class EpcMlTransform: "percent_draughtproofed": epc.percent_draughtproofed, "energy_rating_average": epc.energy_rating_average, "environmental_impact_current": epc.environmental_impact_current, + # Features — categoricals (raw strings; cast at parquet write time) + "dwelling_type": epc.dwelling_type, + "tenure": epc.tenure, + "transaction_type": epc.transaction_type, + "property_type": epc.property_type, + "built_form": epc.built_form, + "region_code": epc.region_code, + "country_code": epc.country_code, # Targets "sap_score": epc.energy_rating_current, "co2_emissions": epc.co2_emissions_current,