mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
drop tenure + transaction_type from features (v2.0.0)
Neither field physically affects the SAP rating; both are dataset-side metadata (owner-occupied vs rented, sale vs marketed), and any correlation with sap_score is confounded with age/condition, which the model already sees through built_form / property_type / construction_age_band. Dropping them reduces the feature count and removes a source of spurious split-gain. This is a MAJOR change per the ADR-0007 versioning policy (column removal): 1.0.0 -> 2.0.0.
This commit is contained in:
parent
e8b6f19a3a
commit
6aa3ddfbf4
2 changed files with 4 additions and 22 deletions
|
|
@ -36,7 +36,7 @@ def test_transform_advertises_version_and_target_columns() -> None:
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert isinstance(schema, TransformSchema)
|
assert isinstance(schema, TransformSchema)
|
||||||
assert schema.transform_version == "1.0.0"
|
assert schema.transform_version == "2.0.0"
|
||||||
assert schema.transform_version == EpcMlTransform.VERSION
|
assert schema.transform_version == EpcMlTransform.VERSION
|
||||||
assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys())
|
assert set(schema.target_columns.keys()) == set(_EXPECTED_TARGET_DTYPES.keys())
|
||||||
for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items():
|
for target_name, expected_dtype in _EXPECTED_TARGET_DTYPES.items():
|
||||||
|
|
@ -257,8 +257,6 @@ _NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
|
||||||
|
|
||||||
_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
|
_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
|
||||||
"dwelling_type",
|
"dwelling_type",
|
||||||
"tenure",
|
|
||||||
"transaction_type",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -289,8 +287,6 @@ def test_to_row_extracts_categorical_features() -> None:
|
||||||
epc = make_minimal_sap10_epc(
|
epc = make_minimal_sap10_epc(
|
||||||
energy_rating_current=82,
|
energy_rating_current=82,
|
||||||
dwelling_type="End-terrace house",
|
dwelling_type="End-terrace house",
|
||||||
tenure="3",
|
|
||||||
transaction_type="8",
|
|
||||||
property_type="0",
|
property_type="0",
|
||||||
built_form="2",
|
built_form="2",
|
||||||
region_code="6",
|
region_code="6",
|
||||||
|
|
@ -303,8 +299,8 @@ def test_to_row_extracts_categorical_features() -> None:
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert row["dwelling_type"] == "End-terrace house"
|
assert row["dwelling_type"] == "End-terrace house"
|
||||||
assert row["tenure"] == "3"
|
assert "tenure" not in row
|
||||||
assert row["transaction_type"] == "8"
|
assert "transaction_type" not in row
|
||||||
assert row["property_type"] == "0"
|
assert row["property_type"] == "0"
|
||||||
assert row["built_form"] == "2"
|
assert row["built_form"] == "2"
|
||||||
assert row["region_code"] == "6"
|
assert row["region_code"] == "6"
|
||||||
|
|
|
||||||
|
|
@ -133,18 +133,6 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
|
||||||
categorical=True,
|
categorical=True,
|
||||||
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
|
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
|
||||||
),
|
),
|
||||||
"tenure": ColumnSpec(
|
|
||||||
dtype=str,
|
|
||||||
nullable=False,
|
|
||||||
categorical=True,
|
|
||||||
description="SAP tenure code, stringified int (e.g. '1' owner-occupied).",
|
|
||||||
),
|
|
||||||
"transaction_type": ColumnSpec(
|
|
||||||
dtype=str,
|
|
||||||
nullable=False,
|
|
||||||
categorical=True,
|
|
||||||
description="SAP transaction type code, stringified int.",
|
|
||||||
),
|
|
||||||
"property_type": ColumnSpec(
|
"property_type": ColumnSpec(
|
||||||
dtype=str,
|
dtype=str,
|
||||||
nullable=True,
|
nullable=True,
|
||||||
|
|
@ -897,7 +885,7 @@ class EpcMlTransform:
|
||||||
Version 0.1.0 — schema contract only; feature columns added in subsequent slices.
|
Version 0.1.0 — schema contract only; feature columns added in subsequent slices.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
VERSION: str = "1.0.0"
|
VERSION: str = "2.0.0"
|
||||||
|
|
||||||
def schema(self) -> TransformSchema:
|
def schema(self) -> TransformSchema:
|
||||||
"""The cross-repo ML data contract.
|
"""The cross-repo ML data contract.
|
||||||
|
|
@ -1011,8 +999,6 @@ class EpcMlTransform:
|
||||||
"percent_draughtproofed": epc.percent_draughtproofed,
|
"percent_draughtproofed": epc.percent_draughtproofed,
|
||||||
# Features — categoricals (raw strings; cast at parquet write time)
|
# Features — categoricals (raw strings; cast at parquet write time)
|
||||||
"dwelling_type": epc.dwelling_type,
|
"dwelling_type": epc.dwelling_type,
|
||||||
"tenure": epc.tenure,
|
|
||||||
"transaction_type": epc.transaction_type,
|
|
||||||
"property_type": epc.property_type,
|
"property_type": epc.property_type,
|
||||||
"built_form": epc.built_form,
|
"built_form": epc.built_form,
|
||||||
"region_code": epc.region_code,
|
"region_code": epc.region_code,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue