From 3abcee6a532caa3177e8ad3cf80f4d5508bb4abd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Sat, 16 May 2026 16:43:28 +0000
Subject: [PATCH] slice 13: to_rows(properties) returns pd.DataFrame

pandas becomes a runtime dependency of the domain package. pandas-stubs
is only needed by type checkers, so it lives in the `dev` extra instead
of the runtime dependency list.

Co-Authored-By: Claude Opus 4.7
---
 packages/domain/pyproject.toml                |  9 ++-
 .../src/domain/ml/tests/test_transform.py     | 61 +++++++++++++++++++
 packages/domain/src/domain/ml/transform.py    | 24 +++++++-
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/packages/domain/pyproject.toml b/packages/domain/pyproject.toml
index 5e820371..19786eed 100644
--- a/packages/domain/pyproject.toml
+++ b/packages/domain/pyproject.toml
@@ -3,7 +3,14 @@ name = "domna-domain"
 version = "0.1.0"
 description = "Shared domain types for the Ara modelling pipeline and sibling Domna services."
 requires-python = ">=3.11"
-dependencies = []
+dependencies = [
+    "pandas>=2.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pandas-stubs",
+]
 
 [build-system]
 requires = ["hatchling"]
diff --git a/packages/domain/src/domain/ml/tests/test_transform.py b/packages/domain/src/domain/ml/tests/test_transform.py
index 66116fbe..14fe2404 100644
--- a/packages/domain/src/domain/ml/tests/test_transform.py
+++ b/packages/domain/src/domain/ml/tests/test_transform.py
@@ -1,5 +1,6 @@
 """Tests for EpcMlTransform v0.1.0 — schema-contract surface and target extraction."""
 
+import pandas as pd
 import pytest
 
 from datatypes.epc.domain.epc_property_data import SapRoomInRoof, WindowTransmissionDetails
@@ -1087,6 +1088,66 @@ def test_to_row_extracts_ventilation_features() -> None:
     assert row["pressure_test"] == 4
 
 
+def test_to_rows_returns_dataframe_with_one_row_per_property() -> None:
+    # Arrange — two properties with different floor areas + SAP scores
+    epcs = [
+        make_minimal_sap10_epc(energy_rating_current=82, total_floor_area_m2=70.0),
+        make_minimal_sap10_epc(energy_rating_current=45, total_floor_area_m2=120.0),
+    ]
+    transform = EpcMlTransform()
+
+    # Act
+    df = transform.to_rows(epcs)
+
+    # Assert
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 2
+    assert df.loc[0, "sap_score"] == 82
+    assert df.loc[1, "sap_score"] == 45
+    assert df.loc[0, "total_floor_area_m2"] == 70.0
+    assert df.loc[1, "total_floor_area_m2"] == 120.0
+
+
+def test_to_rows_returns_empty_dataframe_for_empty_input() -> None:
+    # Arrange
+    transform = EpcMlTransform()
+
+    # Act
+    df = transform.to_rows([])
+
+    # Assert
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 0
+    # Every advertised column appears as an output column even for empty input.
+    schema = transform.schema()
+    for name in schema.feature_columns:
+        assert name in df.columns
+    for name in schema.target_columns:
+        assert name in df.columns
+
+
+def test_to_rows_casts_categorical_columns_to_pd_categorical_dtype() -> None:
+    # Arrange — minimal property with a categorical feature populated
+    epcs = [
+        make_minimal_sap10_epc(
+            energy_rating_current=82, dwelling_type="Mid-terrace house"
+        ),
+        make_minimal_sap10_epc(
+            energy_rating_current=45, dwelling_type="Detached house"
+        ),
+    ]
+    transform = EpcMlTransform()
+
+    # Act
+    df = transform.to_rows(epcs)
+
+    # Assert — every column flagged ColumnSpec.categorical=True is a pd.Categorical
+    schema = transform.schema()
+    for name, spec in schema.feature_columns.items():
+        if spec.categorical:
+            assert isinstance(df[name].dtype, pd.CategoricalDtype), name
+
+
 def test_to_row_area_weights_window_u_value_and_solar_transmittance() -> None:
     # Arrange — two windows with transmission details; one without.
     sap_windows = [
diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py
index 9e1fc662..bfba7a43 100644
--- a/packages/domain/src/domain/ml/transform.py
+++ b/packages/domain/src/domain/ml/transform.py
@@ -10,7 +10,9 @@ are added in subsequent slices. See docs/adr/0007-kwh-as-ml-target.md for the
 target set and rationale.
 """
 
-from typing import Any, Optional
+from typing import Any, Iterable, Optional
+
+import pandas as pd
 
 from datatypes.epc.domain.epc import Epc
 from datatypes.epc.domain.epc_property_data import (
@@ -502,6 +504,26 @@ class EpcMlTransform:
             target_columns=dict(_TARGET_COLUMNS),
         )
 
+    def to_rows(self, properties: Iterable[EpcPropertyData]) -> pd.DataFrame:
+        """Apply `to_row` across many properties and return a typed DataFrame.
+
+        Columns flagged `categorical=True` in the schema (features and
+        targets alike) are cast to the pandas `category` dtype; everything
+        else is left at the pandas-inferred dtype. The DataFrame always
+        carries every advertised column, features first then targets, even
+        when the input is empty.
+
+        Note: categories are inferred from the values present in this call,
+        so calls over different inputs can produce different category sets
+        for the same column.
+        """
+        schema = self.schema()
+        specs = {**schema.feature_columns, **schema.target_columns}
+        rows = [self.to_row(epc) for epc in properties]
+        df = pd.DataFrame(rows, columns=list(specs))
+        categorical = [name for name, spec in specs.items() if spec.categorical]
+        return df.astype(dict.fromkeys(categorical, "category"))
+
     def to_row(self, epc: EpcPropertyData) -> dict[str, Any]:
         """Map an EpcPropertyData to a single row of features + targets.
 