mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
slice 7: flat categoricals + ColumnSpec.categorical flag
Adds seven flat categorical features (dwelling_type, tenure, transaction_type, property_type, built_form, region_code, country_code) emitted as raw strings. New ColumnSpec.categorical bool tells the parquet writer to cast these to pd.Categorical at the I/O boundary, keeping pandas out of the domain/schema module. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
e4f9e9e1db
commit
9c8aa75469
4 changed files with 137 additions and 4 deletions
|
|
@ -9,11 +9,18 @@ from dataclasses import dataclass
|
|||
|
||||
@dataclass(frozen=True)
class ColumnSpec:
    """Specification of a single column in the EPC ML training dataset.

    `categorical=True` signals that the column carries a categorical value (raw
    strings emitted by the transform) and should be cast to `pd.Categorical` at
    parquet write time. The schema module stays pandas-free; the cast happens at
    the I/O boundary in `services/ml_training_data/`.
    """

    # Python type of the values stored in the column (e.g. `str`, `int`).
    dtype: type
    # Whether the column is allowed to contain missing values.
    nullable: bool = True
    # Free-text explanation of what the column holds.
    description: str = ""
    # When True, the parquet writer should cast the column to `pd.Categorical`.
    categorical: bool = False
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
|
|||
|
|
@ -41,13 +41,20 @@ def make_minimal_sap10_epc(
|
|||
percent_draughtproofed: Optional[int] = None,
|
||||
energy_rating_average: Optional[int] = None,
|
||||
environmental_impact_current: Optional[int] = None,
|
||||
dwelling_type: str = "Mid-terrace house",
|
||||
tenure: str = "1",
|
||||
transaction_type: str = "1",
|
||||
property_type: Optional[str] = None,
|
||||
built_form: Optional[str] = None,
|
||||
region_code: Optional[str] = None,
|
||||
country_code: Optional[str] = None,
|
||||
) -> EpcPropertyData:
|
||||
"""Construct a minimal valid SAP10 EpcPropertyData with parametrisable targets."""
|
||||
return EpcPropertyData(
|
||||
dwelling_type="Mid-terrace house",
|
||||
dwelling_type=dwelling_type,
|
||||
inspection_date=date(2025, 6, 1),
|
||||
tenure="1",
|
||||
transaction_type="1",
|
||||
tenure=tenure,
|
||||
transaction_type=transaction_type,
|
||||
address_line_1="1 Test Street",
|
||||
postcode="A1 1AA",
|
||||
post_town="Testtown",
|
||||
|
|
@ -93,6 +100,10 @@ def make_minimal_sap10_epc(
|
|||
percent_draughtproofed=percent_draughtproofed,
|
||||
energy_rating_average=energy_rating_average,
|
||||
environmental_impact_current=environmental_impact_current,
|
||||
property_type=property_type,
|
||||
built_form=built_form,
|
||||
region_code=region_code,
|
||||
country_code=country_code,
|
||||
renewable_heat_incentive=RenewableHeatIncentive(
|
||||
space_heating_kwh=space_heating_kwh,
|
||||
water_heating_kwh=water_heating_kwh,
|
||||
|
|
|
|||
|
|
@ -239,3 +239,67 @@ def test_to_row_extracts_boolean_and_optional_int_features() -> None:
|
|||
assert row["percent_draughtproofed"] == 100
|
||||
assert row["energy_rating_average"] == 60
|
||||
assert row["environmental_impact_current"] == 72
|
||||
|
||||
|
||||
# Categorical feature names, grouped by the nullability the schema test below
# asserts for them.
_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
    "property_type", "built_form", "region_code", "country_code",
)


_NON_NULLABLE_CATEGORICAL_FEATURES: tuple[str, ...] = (
    "dwelling_type", "tenure", "transaction_type",
)
|
||||
|
||||
|
||||
def test_schema_advertises_categorical_features() -> None:
    """Each flat categorical is declared as a `str` categorical column with the expected nullability."""
    # Arrange
    transform = EpcMlTransform()
    groups = (
        (_NULLABLE_CATEGORICAL_FEATURES, True),
        (_NON_NULLABLE_CATEGORICAL_FEATURES, False),
    )

    # Act
    schema = transform.schema()

    # Assert
    for feature_names, expected_nullable in groups:
        for feature_name in feature_names:
            assert feature_name in schema.feature_columns, feature_name
            spec = schema.feature_columns[feature_name]
            assert spec.dtype is str
            assert spec.categorical is True
            assert spec.nullable is expected_nullable
|
||||
|
||||
|
||||
def test_to_row_extracts_categorical_features() -> None:
    """`to_row` emits each categorical EPC field unchanged, as the raw string it was given."""
    # Arrange
    expected = {
        "dwelling_type": "End-terrace house",
        "tenure": "3",
        "transaction_type": "8",
        "property_type": "0",
        "built_form": "2",
        "region_code": "6",
        "country_code": "ENG",
    }
    epc = make_minimal_sap10_epc(energy_rating_current=82, **expected)
    transform = EpcMlTransform()

    # Act
    row = transform.to_row(epc)

    # Assert
    for feature_name, raw_value in expected.items():
        assert row[feature_name] == raw_value
|
||||
|
|
|
|||
|
|
@ -92,6 +92,49 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
|
|||
nullable=True,
|
||||
description="Environmental impact rating; separate from energy efficiency SAP score.",
|
||||
),
|
||||
# Categoricals — emitted as raw strings; downstream casts to pd.Categorical
|
||||
"dwelling_type": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=False,
|
||||
categorical=True,
|
||||
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
|
||||
),
|
||||
"tenure": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=False,
|
||||
categorical=True,
|
||||
description="SAP tenure code, stringified int (e.g. '1' owner-occupied).",
|
||||
),
|
||||
"transaction_type": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=False,
|
||||
categorical=True,
|
||||
description="SAP transaction type code, stringified int.",
|
||||
),
|
||||
"property_type": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=True,
|
||||
categorical=True,
|
||||
description="SAP property type code, stringified int.",
|
||||
),
|
||||
"built_form": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=True,
|
||||
categorical=True,
|
||||
description="SAP built-form code, stringified int.",
|
||||
),
|
||||
"region_code": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=True,
|
||||
categorical=True,
|
||||
description="SAP region code (stringified int) — coarse climate / fuel-rate proxy.",
|
||||
),
|
||||
"country_code": ColumnSpec(
|
||||
dtype=str,
|
||||
nullable=True,
|
||||
categorical=True,
|
||||
description="ISO-style country code, e.g. 'ENG', 'WAL', 'EAW'.",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -188,6 +231,14 @@ class EpcMlTransform:
|
|||
"percent_draughtproofed": epc.percent_draughtproofed,
|
||||
"energy_rating_average": epc.energy_rating_average,
|
||||
"environmental_impact_current": epc.environmental_impact_current,
|
||||
# Features — categoricals (raw strings; cast at parquet write time)
|
||||
"dwelling_type": epc.dwelling_type,
|
||||
"tenure": epc.tenure,
|
||||
"transaction_type": epc.transaction_type,
|
||||
"property_type": epc.property_type,
|
||||
"built_form": epc.built_form,
|
||||
"region_code": epc.region_code,
|
||||
"country_code": epc.country_code,
|
||||
# Targets
|
||||
"sap_score": epc.energy_rating_current,
|
||||
"co2_emissions": epc.co2_emissions_current,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue