slice 13: to_rows(properties) returns pd.DataFrame

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-16 16:43:28 +00:00
parent ebceb4bf2b
commit 3abcee6a53
3 changed files with 90 additions and 2 deletions

View file

@ -3,7 +3,10 @@ name = "domna-domain"
version = "0.1.0"
description = "Shared domain types for the Ara modelling pipeline and sibling Domna services."
requires-python = ">=3.11"
dependencies = []
dependencies = [
"pandas>=2.0",
"pandas-stubs",
]
[build-system]
requires = ["hatchling"]

View file

@ -1,5 +1,6 @@
"""Tests for EpcMlTransform v0.1.0 — schema-contract surface and target extraction."""
import pandas as pd
import pytest
from datatypes.epc.domain.epc_property_data import SapRoomInRoof, WindowTransmissionDetails
@ -1087,6 +1088,66 @@ def test_to_row_extracts_ventilation_features() -> None:
assert row["pressure_test"] == 4
def test_to_rows_returns_dataframe_with_one_row_per_property() -> None:
    """`to_rows` yields one DataFrame row per input property, in input order."""
    # Arrange — (SAP score, floor area) pairs, one per property
    expected = [(82, 70.0), (45, 120.0)]
    properties = [
        make_minimal_sap10_epc(
            energy_rating_current=sap_score, total_floor_area_m2=floor_area
        )
        for sap_score, floor_area in expected
    ]
    transform = EpcMlTransform()

    # Act
    frame = transform.to_rows(properties)

    # Assert
    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == len(expected)
    # Row i of the frame must carry the values of the i-th input property.
    for position, (sap_score, _) in enumerate(expected):
        assert frame.loc[position, "sap_score"] == sap_score
    for position, (_, floor_area) in enumerate(expected):
        assert frame.loc[position, "total_floor_area_m2"] == floor_area
def test_to_rows_returns_empty_dataframe_for_empty_input() -> None:
    """An empty input still produces a DataFrame carrying every schema column."""
    # Arrange
    transform = EpcMlTransform()

    # Act
    frame = transform.to_rows([])

    # Assert
    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 0

    # Feature columns are checked first, then target columns: each advertised
    # name must be present as an output column despite there being no rows.
    schema = transform.schema()
    for column_group in (schema.feature_columns, schema.target_columns):
        for column_name in column_group:
            assert column_name in frame.columns
def test_to_rows_casts_categorical_columns_to_pd_categorical_dtype() -> None:
    """Feature columns flagged categorical in the schema get a categorical dtype."""
    # Arrange — two minimal properties that differ in dwelling type
    properties = [
        make_minimal_sap10_epc(
            energy_rating_current=sap_score, dwelling_type=dwelling_type
        )
        for sap_score, dwelling_type in (
            (82, "Mid-terrace house"),
            (45, "Detached house"),
        )
    ]
    transform = EpcMlTransform()

    # Act
    frame = transform.to_rows(properties)

    # Assert — each feature column whose spec has categorical=True must be
    # backed by a pd.CategoricalDtype; the column name is the failure message.
    schema = transform.schema()
    categorical_names = [
        column_name
        for column_name, column_spec in schema.feature_columns.items()
        if column_spec.categorical
    ]
    for column_name in categorical_names:
        assert isinstance(frame[column_name].dtype, pd.CategoricalDtype), column_name
def test_to_row_area_weights_window_u_value_and_solar_transmittance() -> None:
# Arrange — two windows with transmission details; one without.
sap_windows = [

View file

@ -10,7 +10,9 @@ are added in subsequent slices.
See docs/adr/0007-kwh-as-ml-target.md for the target set and rationale.
"""
from typing import Any, Optional
from typing import Any, Iterable, Optional
import pandas as pd
from datatypes.epc.domain.epc import Epc
from datatypes.epc.domain.epc_property_data import (
@ -502,6 +504,28 @@ class EpcMlTransform:
target_columns=dict(_TARGET_COLUMNS),
)
def to_rows(self, properties: Iterable[EpcPropertyData]) -> pd.DataFrame:
    """Build a DataFrame with one row per property by delegating to `to_row`.

    The column set is the schema's feature columns followed by its target
    columns, so every advertised column is present even when `properties`
    is empty. Columns whose spec has `categorical=True` are converted to
    the pandas "category" dtype; all other columns keep whatever dtype
    pandas infers.
    """
    schema = self.schema()
    column_groups = (schema.feature_columns, schema.target_columns)

    # Features first, then targets — this fixes the output column order.
    ordered_names = [name for group in column_groups for name in group.keys()]
    frame = pd.DataFrame(
        [self.to_row(item) for item in properties],
        columns=ordered_names,
    )

    # Cast flagged columns in the same features-then-targets order.
    for group in column_groups:
        for name, spec in group.items():
            if spec.categorical:
                frame[name] = frame[name].astype("category")
    return frame
def to_row(self, epc: EpcPropertyData) -> dict[str, Any]:
"""Map an EpcPropertyData to a single row of features + targets.