mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
drop tenure + transaction_type from features (v2.0.0)
Neither field physically affects the SAP rating; both are dataset-side metadata (owner-occupied vs rented, sale vs marketed), and any correlation with sap_score is confounded with age/condition, which the model already sees through built_form / property_type / construction_age_band. Dropping them reduces the feature count and removes a source of spurious split-gain. This is a MAJOR change per the ADR-0007 versioning policy (column removal): 1.0.0 -> 2.0.0.
This commit is contained in:
parent
e8b6f19a3a
commit
6aa3ddfbf4
2 changed files with 4 additions and 22 deletions
|
|
@ -36,7 +36,7 @@ def test_transform_advertises_version_and_target_columns() -> None:
|
|||
|
||||
# Assert
|
||||
assert isinstance(schema, TransformSchema)
|
||||
assert schema.transform_version == "1.0.0"
|
||||
assert schema.transform_version == "2.0.0"
|
||||
assert schema.transform_version == EpcMlTransform.VERSION
|
||||
assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys())
|
||||
for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items():
|
||||
|
|
@ -257,8 +257,6 @@ _NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
|
|||
|
||||
_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
|
||||
"dwelling_type",
|
||||
"tenure",
|
||||
"transaction_type",
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -289,8 +287,6 @@ def test_to_row_extracts_categorical_features() -> None:
|
|||
epc = make_minimal_sap10_epc(
|
||||
energy_rating_current=82,
|
||||
dwelling_type="End-terrace house",
|
||||
tenure="3",
|
||||
transaction_type="8",
|
||||
property_type="0",
|
||||
built_form="2",
|
||||
region_code="6",
|
||||
|
|
@ -303,8 +299,8 @@ def test_to_row_extracts_categorical_features() -> None:
|
|||
|
||||
# Assert
|
||||
assert row["dwelling_type"] == "End-terrace house"
|
||||
assert row["tenure"] == "3"
|
||||
assert row["transaction_type"] == "8"
|
||||
assert "tenure" not in row
|
||||
assert "transaction_type" not in row
|
||||
assert row["property_type"] == "0"
|
||||
assert row["built_form"] == "2"
|
||||
assert row["region_code"] == "6"
|
||||
|
|
|
|||
|
|
@ -133,18 +133,6 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
|
|||
categorical=True,
|
||||
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
|
||||
),
|
||||
"tenure": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=False,
|
||||
categorical=True,
|
||||
description="SAP tenure code, stringified int (e.g. '1' owner-occupied).",
|
||||
),
|
||||
"transaction_type": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=False,
|
||||
categorical=True,
|
||||
description="SAP transaction type code, stringified int.",
|
||||
),
|
||||
"property_type": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=True,
|
||||
|
|
@ -897,7 +885,7 @@ class EpcMlTransform:
|
|||
Version 0.1.0 — schema contract only; feature columns added in subsequent slices.
|
||||
"""
|
||||
|
||||
VERSION: str = "1.0.0"
|
||||
VERSION: str = "2.0.0"
|
||||
|
||||
def schema(self) -> TransformSchema:
|
||||
"""The cross-repo ML data contract.
|
||||
|
|
@ -1011,8 +999,6 @@ class EpcMlTransform:
|
|||
"percent_draughtproofed": epc.percent_draughtproofed,
|
||||
# Features — categoricals (raw strings; cast at parquet write time)
|
||||
"dwelling_type": epc.dwelling_type,
|
||||
"tenure": epc.tenure,
|
||||
"transaction_type": epc.transaction_type,
|
||||
"property_type": epc.property_type,
|
||||
"built_form": epc.built_form,
|
||||
"region_code": epc.region_code,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue