From 6aa3ddfbf489448381d2bafd97bb19a962ade7b5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 May 2026 12:37:52 +0000 Subject: [PATCH] drop tenure + transaction_type from features (v2.0.0) Neither field physically affects SAP rating; they're dataset-side metadata (owner-occupied vs rented, sale vs marketed) and any correlation with sap_score is confounded with age/condition that the model already sees through built_form / property_type / construction_age_band. Dropping reduces feature count and removes a source of spurious split-gain. MAJOR per ADR-0007 versioning policy (column removal): 1.0.0 -> 2.0.0. --- .../domain/src/domain/ml/tests/test_transform.py | 10 +++------- packages/domain/src/domain/ml/transform.py | 16 +--------------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/packages/domain/src/domain/ml/tests/test_transform.py b/packages/domain/src/domain/ml/tests/test_transform.py index 8ef9fcee..a84a45b1 100644 --- a/packages/domain/src/domain/ml/tests/test_transform.py +++ b/packages/domain/src/domain/ml/tests/test_transform.py @@ -36,7 +36,7 @@ def test_transform_advertises_version_and_target_columns() -> None: # Assert assert isinstance(schema, TransformSchema) - assert schema.transform_version == "1.0.0" + assert schema.transform_version == "2.0.0" assert schema.transform_version == EpcMlTransform.VERSION assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys()) for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items(): @@ -257,8 +257,6 @@ _NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = ( _NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = ( "dwelling_type", - "tenure", - "transaction_type", ) @@ -289,8 +287,6 @@ def test_to_row_extracts_categorical_features() -> None: epc = make_minimal_sap10_epc( energy_rating_current=82, dwelling_type="End-terrace house", - tenure="3", - transaction_type="8", property_type="0", built_form="2", region_code="6", @@ -303,8 +299,8 @@ def test_to_row_extracts_categorical_features() -> None: # Assert assert row["dwelling_type"] == "End-terrace house" - assert row["tenure"] == "3" - assert row["transaction_type"] == "8" + assert "tenure" not in row + assert "transaction_type" not in row assert row["property_type"] == "0" assert row["built_form"] == "2" assert row["region_code"] == "6" diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py index ddb8e84f..666fea96 100644 --- a/packages/domain/src/domain/ml/transform.py +++ b/packages/domain/src/domain/ml/transform.py @@ -133,18 +133,6 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = { categorical=True, description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.", ), - "tenure": ColumnSpec( - dtype=str, - nullable=False, - categorical=True, - description="SAP tenure code, stringified int (e.g. '1' owner-occupied).", - ), - "transaction_type": ColumnSpec( - dtype=str, - nullable=False, - categorical=True, - description="SAP transaction type code, stringified int.", - ), "property_type": ColumnSpec( dtype=str, nullable=True, @@ -897,7 +885,7 @@ class EpcMlTransform: Version 0.1.0 — schema contract only; feature columns added in subsequent slices. """ - VERSION: str = "1.0.0" + VERSION: str = "2.0.0" def schema(self) -> TransformSchema: """The cross-repo ML data contract. @@ -1011,8 +999,6 @@ class EpcMlTransform: "percent_draughtproofed": epc.percent_draughtproofed, # Features — categoricals (raw strings; cast at parquet write time) "dwelling_type": epc.dwelling_type, - "tenure": epc.tenure, - "transaction_type": epc.transaction_type, "property_type": epc.property_type, "built_form": epc.built_form, "region_code": epc.region_code,