diff --git a/packages/domain/src/domain/ml/tests/test_transform.py b/packages/domain/src/domain/ml/tests/test_transform.py
index bce6c397..27d48654 100644
--- a/packages/domain/src/domain/ml/tests/test_transform.py
+++ b/packages/domain/src/domain/ml/tests/test_transform.py
@@ -415,6 +415,91 @@ def test_to_row_returns_window_zeros_for_property_with_no_windows() -> None:
     assert row["window_avg_solar_transmittance"] is None
 
 
+_GLAZED_TYPE_CODES: tuple[int, ...] = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+
+
+def test_schema_advertises_window_categorical_share_features() -> None:
+    # Arrange
+    transform = EpcMlTransform()
+
+    # Act
+    schema = transform.schema()
+
+    # Assert — one float share per known glazed_type code + `_other`, plus pvc_frame share
+    for code in _GLAZED_TYPE_CODES:
+        name = f"window_pct_glazed_type_{code}"
+        assert name in schema.feature_columns, name
+        column = schema.feature_columns[name]
+        assert column.dtype is float
+        assert column.nullable is False
+        assert column.categorical is False
+    assert "window_pct_glazed_type_other" in schema.feature_columns
+    assert "window_pct_pvc_frame" in schema.feature_columns
+    assert schema.feature_columns["window_pct_pvc_frame"].dtype is float
+    assert schema.feature_columns["window_pct_pvc_frame"].nullable is True
+
+
+def test_to_row_aggregates_glazed_type_and_pvc_frame_shares() -> None:
+    # Arrange — three windows: 3.0 m² glazed_type=2 PVC, 1.5 m² glazed_type=13 PVC,
+    # 0.5 m² glazed_type=5 (single, no PVC). Total area = 5.0 m².
+    sap_windows = [
+        make_window(width=1.5, height=2.0, glazing_type=2, frame_material="PVC"),
+        make_window(width=1.0, height=1.5, glazing_type=13, frame_material="PVC"),
+        make_window(width=0.5, height=1.0, glazing_type=5, frame_material=None),
+    ]
+    epc = make_minimal_sap10_epc(energy_rating_current=82, sap_windows=sap_windows)
+    transform = EpcMlTransform()
+
+    # Act
+    row = transform.to_row(epc)
+
+    # Assert
+    # Shares (area-weighted) — 3.0/5.0=0.6 type 2; 1.5/5.0=0.3 type 13; 0.5/5.0=0.1 type 5.
+    assert row["window_pct_glazed_type_2"] == pytest.approx(0.6)
+    assert row["window_pct_glazed_type_13"] == pytest.approx(0.3)
+    assert row["window_pct_glazed_type_5"] == pytest.approx(0.1)
+    # All other known glazed_type codes are zero.
+    for code in _GLAZED_TYPE_CODES:
+        if code not in (2, 5, 13):
+            assert row[f"window_pct_glazed_type_{code}"] == 0.0
+    assert row["window_pct_glazed_type_other"] == 0.0
+    # PVC frame area share: (3.0 + 1.5) / 5.0 = 0.9
+    assert row["window_pct_pvc_frame"] == pytest.approx(0.9)
+
+
+def test_to_row_routes_unknown_glazed_type_to_other_bucket() -> None:
+    # Arrange — one window has glazing_type=99 (not in the SAP10 enum 1-15)
+    sap_windows = [
+        make_window(width=2.0, height=1.0, glazing_type=2, frame_material="PVC"),
+        make_window(width=1.0, height=1.0, glazing_type=99, frame_material="PVC"),
+    ]
+    epc = make_minimal_sap10_epc(energy_rating_current=82, sap_windows=sap_windows)
+    transform = EpcMlTransform()
+
+    # Act
+    row = transform.to_row(epc)
+
+    # Assert
+    # Total area = 3.0; known type 2 = 2.0/3.0; unknown 99 → _other = 1.0/3.0
+    assert row["window_pct_glazed_type_2"] == pytest.approx(2 / 3)
+    assert row["window_pct_glazed_type_other"] == pytest.approx(1 / 3)
+
+
+def test_to_row_returns_window_share_zeros_for_property_with_no_windows() -> None:
+    # Arrange
+    epc = make_minimal_sap10_epc(energy_rating_current=82)
+    transform = EpcMlTransform()
+
+    # Act
+    row = transform.to_row(epc)
+
+    # Assert
+    for code in _GLAZED_TYPE_CODES:
+        assert row[f"window_pct_glazed_type_{code}"] == 0.0
+    assert row["window_pct_glazed_type_other"] == 0.0
+    assert row["window_pct_pvc_frame"] is None
+
+
 def test_to_row_area_weights_window_u_value_and_solar_transmittance() -> None:
     # Arrange — two windows with transmission details; one without.
     sap_windows = [
diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py
index f249b5db..919df345 100644
--- a/packages/domain/src/domain/ml/transform.py
+++ b/packages/domain/src/domain/ml/transform.py
@@ -32,6 +32,12 @@ _OCTANT_NAMES: dict[int, str] = {
     8: "NW",
 }
 
+# SAP10 glazed_type enumeration (codes 1-15 per the gov api /api/codes export at
+# datatypes/epc/domain/epc_codes.csv, schema RdSAP-21.0.x). Anything outside this set
+# (the documentation "ND" sentinel, future codes, or unexpected strings) falls into
+# the `_other` bucket so share columns sum to 1.0 of total window area (all 0.0 if no windows).
+_GLAZED_TYPE_CODES: tuple[int, ...] = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+
 
 _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
     # Geometry
@@ -182,6 +188,25 @@ _FEATURE_COLUMNS: dict[str, ColumnSpec] = {
         nullable=True,
         description="Area-weighted mean window solar transmittance; null when no transmission details.",
     ),
+    # Window glazed_type categorical share columns (sum to 1.0 over total area when any windows present)
+    **{
+        f"window_pct_glazed_type_{code}": ColumnSpec(
+            dtype=float,
+            nullable=False,
+            description=f"Area share of windows with glazed_type {code} (0.0-1.0).",
+        )
+        for code in _GLAZED_TYPE_CODES
+    },
+    "window_pct_glazed_type_other": ColumnSpec(
+        dtype=float,
+        nullable=False,
+        description="Area share of windows with glazed_type outside the SAP10 1-15 enum.",
+    ),
+    "window_pct_pvc_frame": ColumnSpec(
+        dtype=float,
+        nullable=True,
+        description="Area share of windows with PVC frame; null when no windows.",
+    ),
 }
 
 
@@ -312,13 +337,18 @@ def _peui_ucl(epc: EpcPropertyData) -> Optional[float]:
 
 
 def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]:
-    """Aggregate a list of windows into the 13 physics + orientation columns.
+    """Aggregate a list of windows into the 30 window-feature columns.
 
-    With no windows: counts/areas are 0; nullable averages are None.
-    Windows whose `orientation` isn't an integer in 1-8 contribute to count and
-    total area but to no octant — they're treated as unrecorded.
+    With no windows: counts/areas/shares are 0; nullable averages and the pvc_frame
+    share are None. Windows whose `orientation` isn't an integer in 1-8 contribute to
+    count and total area but to no octant; a `glazing_type` outside the SAP10 1-15
+    enum goes to `_other`. glazed_type/pvc_frame shares are 0.0-1.0 fractions.
     """
     octant_areas: dict[str, float] = {name: 0.0 for name in _OCTANT_NAMES.values()}
+    glazed_type_areas: dict[str, float] = {
+        f"window_pct_glazed_type_{code}": 0.0 for code in _GLAZED_TYPE_CODES
+    }
+    glazed_type_areas["window_pct_glazed_type_other"] = 0.0
     aggregates: dict[str, Any] = {
         "window_count": len(windows),
         "window_total_area_m2": 0.0,
@@ -326,12 +356,15 @@ def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]:
         "window_pct_draught_proofed": None,
         "window_avg_u_value": None,
         "window_avg_solar_transmittance": None,
+        **glazed_type_areas,
+        "window_pct_pvc_frame": None,
     }
     if not windows:
         return aggregates
 
     total_area = 0.0
     draught_proofed_area = 0.0
+    pvc_frame_area = 0.0
     transmission_area = 0.0
     weighted_u_value = 0.0
     weighted_solar_transmittance = 0.0
@@ -340,8 +373,14 @@ def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]:
         total_area += area
         if w.draught_proofed is True or w.draught_proofed == "true":
             draught_proofed_area += area
+        if w.frame_material == "PVC":
+            pvc_frame_area += area
         if isinstance(w.orientation, int) and w.orientation in _OCTANT_NAMES:
             octant_areas[_OCTANT_NAMES[w.orientation]] += area
+        if isinstance(w.glazing_type, int) and w.glazing_type in _GLAZED_TYPE_CODES:
+            glazed_type_areas[f"window_pct_glazed_type_{w.glazing_type}"] += area
+        else:
+            glazed_type_areas["window_pct_glazed_type_other"] += area
         if w.window_transmission_details is not None:
             transmission_area += area
             weighted_u_value += w.window_transmission_details.u_value * area
@@ -356,6 +395,9 @@ def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]:
         aggregates["window_pct_draught_proofed"] = (
             draught_proofed_area / total_area * 100.0
         )
+        aggregates["window_pct_pvc_frame"] = pvc_frame_area / total_area
+        for column, area in glazed_type_areas.items():
+            aggregates[column] = area / total_area
     if transmission_area > 0:
         aggregates["window_avg_u_value"] = weighted_u_value / transmission_area
         aggregates["window_avg_solar_transmittance"] = (