drop tenure + transaction_type from features (v2.0.0)

Neither field physically affects the SAP rating; both are dataset-side metadata
(tenure: owner-occupied vs rented; transaction_type: sale vs marketed), and any
correlation with sap_score is confounded with age/condition, which the model
already sees through built_form / property_type / construction_age_band.

Dropping them reduces the feature count and removes a source of spurious split gain.
This is a MAJOR bump per the ADR-0007 versioning policy (column removal): 1.0.0 -> 2.0.0.
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-17 12:37:52 +00:00
parent e8b6f19a3a
commit 6aa3ddfbf4
2 changed files with 4 additions and 22 deletions

View file

@ -36,7 +36,7 @@ def test_transform_advertises_version_and_target_columns() -> None:
# Assert
assert isinstance(schema, TransformSchema)
assert schema.transform_version == "1.0.0"
assert schema.transform_version == "2.0.0"
assert schema.transform_version == EpcMlTransform.VERSION
assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys())
for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items():
@ -257,8 +257,6 @@ _NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
"dwelling_type",
"tenure",
"transaction_type",
)
@ -289,8 +287,6 @@ def test_to_row_extracts_categorical_features() -> None:
epc = make_minimal_sap10_epc(
energy_rating_current=82,
dwelling_type="End-terrace house",
tenure="3",
transaction_type="8",
property_type="0",
built_form="2",
region_code="6",
@ -303,8 +299,8 @@ def test_to_row_extracts_categorical_features() -> None:
# Assert
assert row["dwelling_type"] == "End-terrace house"
assert row["tenure"] == "3"
assert row["transaction_type"] == "8"
assert "tenure" not in row
assert "transaction_type" not in row
assert row["property_type"] == "0"
assert row["built_form"] == "2"
assert row["region_code"] == "6"

View file

@ -133,18 +133,6 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
categorical=True,
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
),
"tenure": ColumnSpec(
dtype=str,
nullable=False,
categorical=True,
description="SAP tenure code, stringified int (e.g. '1' owner-occupied).",
),
"transaction_type": ColumnSpec(
dtype=str,
nullable=False,
categorical=True,
description="SAP transaction type code, stringified int.",
),
"property_type": ColumnSpec(
dtype=str,
nullable=True,
@ -897,7 +885,7 @@ class EpcMlTransform:
Version 0.1.0 schema contract only; feature columns added in subsequent slices.
"""
VERSION: str = "1.0.0"
VERSION: str = "2.0.0"
def schema(self) -> TransformSchema:
"""The cross-repo ML data contract.
@ -1011,8 +999,6 @@ class EpcMlTransform:
"percent_draughtproofed": epc.percent_draughtproofed,
# Features — categoricals (raw strings; cast at parquet write time)
"dwelling_type": epc.dwelling_type,
"tenure": epc.tenure,
"transaction_type": epc.transaction_type,
"property_type": epc.property_type,
"built_form": epc.built_form,
"region_code": epc.region_code,