mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Sibling migration to the sap10_calculator move — `domain.ml` now lives
in the root-level layout (`domain/sap10_ml/`), matching the pattern
already used by `domain.addresses`, `domain.tasks`, `domain.postcode`,
and `domain.sap10_calculator`.
Changes:
- `git mv packages/domain/src/domain/ml → domain/sap10_ml` (19 files;
history preserved).
- Subpackage rename: `domain.ml` → `domain.sap10_ml`. 32 references
rewritten across .py and .md files: 11 internal + 21 external
(datatypes/epc/domain/mapper.py, 14 files in domain/sap10_calculator,
2 backend tests, 2 ADRs, 1 README, 1 design doc).
- Path-string updates: `pytest.ini` testpath
`packages/domain/src/domain/ml/tests` → `domain/sap10_ml/tests` so
ML tests stay in the default auto-discovered sweep. `CONTEXT.md`
also updated.
`packages/domain/src/domain/` is now empty — the workspace `domain/`
tree has been fully migrated. Together with the `domain/__init__.py`
deletions from the sap10_calculator commit (29ac35cc), `domain` is
now a single root-level namespace package with subpackages
{addresses, sap10_calculator, sap10_ml, tasks} + the standalone
`postcode.py` module.
Verified:
- Focused sweep (backend mapper-chain + sap10_calculator worksheet
e2e + golden fixtures): 99 passed / 19 failed — identical to baseline.
- Wider sweep (all sap10_calculator + sap10_ml): 1654 passed / 20
failed (same pre-existing failures).
- domain/sap10_ml/tests: 210/210 PASSED at new path.
- Pyright net-zero: heat_transmission.py 13, cert_to_inputs.py 35,
mapper.py 33, rdsap_uvalues.py 1 (all unchanged from baseline).
Note: `packages/domain/pyproject.toml` still declares
`packages = ["src/domain"]` for the hatchling wheel — that target
directory is now empty and the wheel build is effectively a no-op.
Retiring the workspace package or repointing the wheel is a follow-up.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1734 lines
74 KiB
Python
1734 lines
74 KiB
Python
"""EpcMlTransform — maps EpcPropertyData to ML-ready feature/target columns.
|
||
|
||
The single ML-data contract between this repo and the AutoGluon training repo.
|
||
Versioned semver-style: MAJOR on removing/renaming columns, MINOR on adding.
|
||
|
||
At v0.1.0 the schema contract is fixed and the five directly-extractable targets
|
||
are populated by `to_row()`. The UCL-corrected PEUI target and all feature columns
|
||
are added in subsequent slices.
|
||
|
||
See docs/adr/0007-kwh-as-ml-target.md for the target set and rationale.
|
||
"""
|
||
|
||
from typing import Any, Iterable, Optional
|
||
|
||
import pandas as pd
|
||
|
||
from datatypes.epc.domain.epc import Epc
|
||
from datatypes.epc.domain.epc_property_data import (
|
||
BuildingPartIdentifier,
|
||
EnergyElement,
|
||
EpcPropertyData,
|
||
SapBuildingPart,
|
||
SapEnergySource,
|
||
SapHeating,
|
||
SapWindow,
|
||
)
|
||
from domain.sap10_ml.demand import (
|
||
predicted_hot_water_kwh,
|
||
predicted_lighting_kwh,
|
||
predicted_space_heating_kwh,
|
||
)
|
||
from domain.sap10_ml.ecf import (
|
||
predicted_ecf,
|
||
predicted_log10_ecf,
|
||
predicted_pv_generation_kwh,
|
||
predicted_total_fuel_cost_gbp,
|
||
)
|
||
from domain.sap10_ml.envelope import envelope_heat_loss_w_per_k
|
||
from domain.sap10_ml.ventilation import ventilation_heat_loss_w_per_k
|
||
from domain.sap10_ml.sap_efficiencies import seasonal_efficiency, water_heating_efficiency
|
||
from domain.sap10_ml.schema import ColumnSpec, TransformSchema
|
||
from domain.sap10_ml.ucl import apply_ucl_correction
|
||
|
||
|
||
# Lookup from SAP10 orientation code to compass octant label:
# 1=N, 2=NE, 3=E, 4=SE, 5=S, 6=SW, 7=W, 8=NW.
# Any other value (0, "NR", etc.) counts as unrecorded: it still feeds
# `window_count` and `window_total_area_m2`, but is assigned to no octant.
_OCTANT_NAMES: dict[int, str] = dict(
    # Codes are consecutive from 1, so number the labels in order; insertion
    # order (N first, NW last) is the order `.values()` yields them in.
    enumerate(("N", "NE", "E", "SE", "S", "SW", "W", "NW"), start=1)
)
|
||
|
||
# Recognised SAP10 glazed_type codes: 1-15, per the gov api /api/codes export at
# datatypes/epc/domain/epc_codes.csv (schema RdSAP-21.0.x). Values outside this
# set (the documentation "ND" sentinel, future codes, or unexpected strings) go
# to the `_other` bucket, so the share columns always sum to 1.0 of total
# window area.
_GLAZED_TYPE_CODES: tuple[int, ...] = tuple(range(1, 16))
|
||
|
||
|
||
_FEATURE_COLUMNS: dict[str, ColumnSpec] = {
|
||
# Geometry
|
||
"total_floor_area_m2": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Total floor area in square metres, from `total_floor_area`.",
|
||
),
|
||
# Counts — directly populated by all SAP10 EPCs
|
||
"door_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of external doors."
|
||
),
|
||
"habitable_rooms_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of habitable rooms."
|
||
),
|
||
"heated_rooms_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of heated rooms."
|
||
),
|
||
"wet_rooms_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of wet rooms (bathrooms / WCs)."
|
||
),
|
||
"extensions_count": ColumnSpec(
|
||
dtype=int,
|
||
nullable=False,
|
||
description="Number of extensions beyond the main dwelling.",
|
||
),
|
||
"open_chimneys_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of open chimneys."
|
||
),
|
||
"insulated_door_count": ColumnSpec(
|
||
dtype=int,
|
||
nullable=False,
|
||
description="Number of external doors classed as insulated.",
|
||
),
|
||
"cfl_fixed_lighting_bulbs_count": ColumnSpec(
|
||
dtype=int,
|
||
nullable=False,
|
||
description="Number of CFL bulbs in fixed lighting outlets.",
|
||
),
|
||
"led_fixed_lighting_bulbs_count": ColumnSpec(
|
||
dtype=int,
|
||
nullable=False,
|
||
description="Number of LED bulbs in fixed lighting outlets.",
|
||
),
|
||
"incandescent_fixed_lighting_bulbs_count": ColumnSpec(
|
||
dtype=int,
|
||
nullable=False,
|
||
description="Number of incandescent bulbs in fixed lighting outlets.",
|
||
),
|
||
# Booleans — directly populated by all SAP10 EPCs
|
||
"solar_water_heating": ColumnSpec(
|
||
dtype=bool, nullable=False, description="Solar water heating present."
|
||
),
|
||
"has_hot_water_cylinder": ColumnSpec(
|
||
dtype=bool, nullable=False, description="Hot water cylinder present."
|
||
),
|
||
"has_fixed_air_conditioning": ColumnSpec(
|
||
dtype=bool, nullable=False, description="Fixed air conditioning present."
|
||
),
|
||
# Optional integer indicators — may be absent on older or partial certificates
|
||
"percent_draughtproofed": ColumnSpec(
|
||
dtype=int,
|
||
nullable=True,
|
||
description="Percentage of windows / doors with draught proofing.",
|
||
),
|
||
# Categoricals — emitted as raw strings; downstream casts to pd.Categorical
|
||
"dwelling_type": ColumnSpec(
|
||
dtype=str,
|
||
nullable=False,
|
||
categorical=True,
|
||
description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.",
|
||
),
|
||
"transaction_type": ColumnSpec(
|
||
dtype=str,
|
||
nullable=False,
|
||
categorical=True,
|
||
description="SAP transaction type code, stringified int.",
|
||
),
|
||
"property_type": ColumnSpec(
|
||
dtype=str,
|
||
nullable=True,
|
||
categorical=True,
|
||
description="SAP property type code, stringified int.",
|
||
),
|
||
"built_form": ColumnSpec(
|
||
dtype=str,
|
||
nullable=True,
|
||
categorical=True,
|
||
description="SAP built-form code, stringified int.",
|
||
),
|
||
"region_code": ColumnSpec(
|
||
dtype=str,
|
||
nullable=True,
|
||
categorical=True,
|
||
description="SAP region code (stringified int) — coarse climate / fuel-rate proxy.",
|
||
),
|
||
"country_code": ColumnSpec(
|
||
dtype=str,
|
||
nullable=True,
|
||
categorical=True,
|
||
description="ISO-style country code, e.g. 'ENG', 'WAL', 'EAW'.",
|
||
),
|
||
# Window aggregates — physics + orientation distribution
|
||
"window_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of windows."
|
||
),
|
||
"window_total_area_m2": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Total window area in square metres, summed across all windows.",
|
||
),
|
||
**{
|
||
f"window_area_orientation_{name}": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description=f"Total window area in m² facing {name} (SAP orientation code).",
|
||
)
|
||
for name in _OCTANT_NAMES.values()
|
||
},
|
||
"window_pct_draught_proofed": ColumnSpec(
|
||
dtype=float,
|
||
nullable=True,
|
||
description="Area-weighted percentage of windows with draught proofing (0-100).",
|
||
),
|
||
"window_avg_u_value": ColumnSpec(
|
||
dtype=float,
|
||
nullable=True,
|
||
description="Area-weighted mean window U-value (W/m²K); null when no transmission details.",
|
||
),
|
||
"window_avg_solar_transmittance": ColumnSpec(
|
||
dtype=float,
|
||
nullable=True,
|
||
description="Area-weighted mean window solar transmittance; null when no transmission details.",
|
||
),
|
||
# Window glazed_type categorical share columns (sum to 1.0 over total area when any windows present)
|
||
**{
|
||
f"window_pct_glazed_type_{code}": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description=f"Area share of windows with glazed_type {code} (0.0-1.0).",
|
||
)
|
||
for code in _GLAZED_TYPE_CODES
|
||
},
|
||
"window_pct_glazed_type_other": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Area share of windows with glazed_type outside the SAP10 1-15 enum.",
|
||
),
|
||
"window_pct_pvc_frame": ColumnSpec(
|
||
dtype=float,
|
||
nullable=True,
|
||
description="Area share of windows with PVC frame; null when no windows.",
|
||
),
|
||
# Building parts — cross-all-parts physical aggregates
|
||
"building_parts_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of sap_building_parts."
|
||
),
|
||
"total_heat_loss_perimeter_m": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Total heat-loss perimeter (m), summed across all floor dimensions.",
|
||
),
|
||
"total_party_wall_length_m": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Total party-wall length (m), summed across all floor dimensions.",
|
||
),
|
||
"total_floor_area_from_parts_m2": ColumnSpec(
|
||
dtype=float,
|
||
nullable=False,
|
||
description="Total floor area (m²) summed across sap_building_parts (sanity vs total_floor_area_m2).",
|
||
),
|
||
"avg_room_height_m": ColumnSpec(
|
||
dtype=float,
|
||
nullable=True,
|
||
description="Floor-area-weighted mean room height (m) across all floor dimensions.",
|
||
),
|
||
# Building parts — Main Dwelling carve-out (none of these are populated if the
|
||
# property has no part identified as 'Main Dwelling')
|
||
"main_dwelling_heat_loss_perimeter_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Heat-loss perimeter (m) for the Main Dwelling only.",
|
||
),
|
||
"main_dwelling_party_wall_length_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Party-wall length (m) for the Main Dwelling only.",
|
||
),
|
||
"main_dwelling_total_floor_area_m2": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Total floor area (m²) for the Main Dwelling only.",
|
||
),
|
||
"main_dwelling_avg_room_height_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Floor-area-weighted mean room height (m) for the Main Dwelling.",
|
||
),
|
||
"main_dwelling_has_room_in_roof": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="True if the Main Dwelling carries a sap_room_in_roof block.",
|
||
),
|
||
"main_dwelling_construction_age_band": ColumnSpec(
|
||
dtype=str, nullable=True, categorical=True,
|
||
description="Main Dwelling construction age band (A-M, '0', or 'NR').",
|
||
),
|
||
"main_dwelling_wall_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling wall construction SAP10 code.",
|
||
),
|
||
"main_dwelling_roof_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling roof construction SAP10 code.",
|
||
),
|
||
# Main Dwelling fabric inputs — wall, roof, floor (model retrofit simulation surface).
|
||
"main_dwelling_wall_insulation_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling wall insulation type SAP10 code.",
|
||
),
|
||
"main_dwelling_wall_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Main Dwelling wall insulation thickness in mm. 'NI' (no insulation) maps to 0.",
|
||
),
|
||
"main_dwelling_wall_dry_lined": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Main Dwelling wall_dry_lined flag.",
|
||
),
|
||
"main_dwelling_wall_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Main Dwelling external wall thickness in mm.",
|
||
),
|
||
"main_dwelling_party_wall_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling party wall construction SAP10 code (str sentinels NA/NI -> None).",
|
||
),
|
||
"main_dwelling_roof_insulation_location": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling roof insulation location SAP10 code (str sentinels -> None).",
|
||
),
|
||
"main_dwelling_roof_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Main Dwelling roof insulation thickness in mm. 'NI' -> 0; non-numeric sentinels -> None.",
|
||
),
|
||
"main_dwelling_floor_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling ground-floor construction SAP10 code (from sap_floor_dimensions[floor==0]).",
|
||
),
|
||
"main_dwelling_floor_insulation": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling ground-floor insulation SAP10 code (from sap_floor_dimensions[floor==0]).",
|
||
),
|
||
"main_dwelling_floor_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Main Dwelling floor insulation thickness in mm. 'NI' -> 0; non-numeric sentinels -> None.",
|
||
),
|
||
"main_dwelling_floor_heat_loss": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Main Dwelling floor heat-loss SAP10 code.",
|
||
),
|
||
# Heating — count of main heating systems (usually 1)
|
||
"main_heating_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of main heating systems declared on sap_heating.main_heating_details.",
|
||
),
|
||
# Heating — primary (Top-1) slot from main_heating_details[0]
|
||
"primary_main_fuel_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating main_fuel SAP10 code (per epc_codes.csv main_fuel enum).",
|
||
),
|
||
"primary_heat_emitter_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating heat_emitter_type SAP10 code.",
|
||
),
|
||
"primary_main_heating_control": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating main_heating_control SAP10 code.",
|
||
),
|
||
"primary_main_heating_category": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating main_heating_category SAP10 code.",
|
||
),
|
||
"primary_has_fghrs": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Primary heating has flue gas heat recovery system.",
|
||
),
|
||
"primary_fan_flue_present": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Primary heating boiler has a fan flue.",
|
||
),
|
||
"primary_boiler_flue_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating boiler flue type SAP10 code.",
|
||
),
|
||
"primary_central_heating_pump_age": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating central-heating pump age band (SAP10 enum).",
|
||
),
|
||
# Water heating — on sap_heating directly
|
||
"water_heating_code": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Water heating SAP10 code.",
|
||
),
|
||
"water_heating_fuel": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Water heating fuel SAP10 code (per epc_codes.csv water_heating_fuel enum).",
|
||
),
|
||
"cylinder_size": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Hot water cylinder size SAP10 code (1=small, 2=normal, 3=large).",
|
||
),
|
||
"cylinder_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Hot water cylinder insulation thickness (mm).",
|
||
),
|
||
# Secondary heating — present when secondary_fuel_type is set
|
||
"has_secondary_heating": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="True if sap_heating.secondary_fuel_type is populated.",
|
||
),
|
||
"secondary_fuel_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Secondary heating fuel SAP10 code (shares main_fuel enum).",
|
||
),
|
||
# PV — has-pv + measured-vs-estimated capacity + array aggregates
|
||
"has_pv": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="True if the property has any photovoltaic system (measured or estimated).",
|
||
),
|
||
"pv_capacity_source": ColumnSpec(
|
||
dtype=str, nullable=False, categorical=True,
|
||
description=(
|
||
"How PV capacity is known: 'measured' (per-array peak_power available), "
|
||
"'estimated_from_roof_area' (only percent_roof_area), or 'none'."
|
||
),
|
||
),
|
||
"pv_array_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of measured PV arrays (0 unless capacity_source is 'measured').",
|
||
),
|
||
"pv_total_peak_power_kw": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description="Sum of peak_power (kW) across measured PV arrays.",
|
||
),
|
||
**{
|
||
f"pv_peak_power_kw_{name}": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
f"Sum of peak_power (kW) for measured PV arrays facing {name} "
|
||
"(SAP orientation code)."
|
||
),
|
||
)
|
||
for name in _OCTANT_NAMES.values()
|
||
},
|
||
"pv_avg_pitch": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Peak-power-weighted mean array pitch (SAP code); null when no measured arrays.",
|
||
),
|
||
"pv_avg_overshading": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Peak-power-weighted mean overshading (SAP code); null when no measured arrays.",
|
||
),
|
||
"pv_percent_roof_area": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Percent of roof covered by PV — populated only when capacity_source = 'estimated_from_roof_area'.",
|
||
),
|
||
# PV battery, wind turbine, energy source flags
|
||
"has_pv_battery": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="True if the property has at least one PV battery.",
|
||
),
|
||
"pv_battery_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of PV batteries."
|
||
),
|
||
"pv_battery_capacity_kwh": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description=(
|
||
"Total PV battery capacity (kWh) — pv_battery_count × per-unit capacity "
|
||
"from sap_energy_source.pv_batteries. Null when count=0."
|
||
),
|
||
),
|
||
"has_wind_turbine": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="True if the property has at least one wind turbine.",
|
||
),
|
||
"wind_turbine_count": ColumnSpec(
|
||
dtype=int, nullable=False, description="Number of wind turbines."
|
||
),
|
||
"mains_gas": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="Property is connected to mains gas (strong fuel-deduction signal).",
|
||
),
|
||
"electricity_smart_meter_present": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="Electricity smart meter installed.",
|
||
),
|
||
"gas_smart_meter_present": ColumnSpec(
|
||
dtype=bool, nullable=False, description="Gas smart meter installed."
|
||
),
|
||
"is_dwelling_export_capable": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="Dwelling has an export-capable connection (eligible for SEG).",
|
||
),
|
||
# Ventilation — flat fields direct off EpcPropertyData
|
||
"mechanical_ventilation": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Mechanical ventilation SAP10 code (0=natural, 1-6 per epc_codes.csv enum).",
|
||
),
|
||
"mechanical_vent_duct_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Mechanical ventilation duct type SAP10 code.",
|
||
),
|
||
"blocked_chimneys_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of blocked / capped-off chimneys.",
|
||
),
|
||
"pressure_test": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Air-tightness pressure-test SAP10 code.",
|
||
),
|
||
# Dwelling-level fabric + demand inputs.
|
||
"multiple_glazed_proportion": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Percent of glazed area that is multiple-glazed.",
|
||
),
|
||
"number_baths": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of baths declared on sap_heating (hot-water demand proxy).",
|
||
),
|
||
"number_baths_wwhrs": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of baths served by a WWHRS unit.",
|
||
),
|
||
"extract_fans_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of extract fans (ventilation/heat-loss proxy).",
|
||
),
|
||
# Heating — heating-system identity + flow temp + multi-system fraction.
|
||
"primary_sap_main_heating_code": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="SAP10 main heating type code (canonical heating-system enum).",
|
||
),
|
||
"primary_emitter_temperature": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Primary heating emitter temperature class (0=standard, 1=low-temp).",
|
||
),
|
||
"primary_main_heating_fraction": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Fraction of space heating delivered by the primary main heating system.",
|
||
),
|
||
# Hot water — immersion type + presence of shower outlet block.
|
||
"immersion_heating_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Electric immersion heater type SAP10 code.",
|
||
),
|
||
"shower_outlet_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="1 if any shower_outlet block is declared on sap_heating, else 0.",
|
||
),
|
||
# Windows — per-window-type share aggregates.
|
||
"window_pct_living": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area share of windows with window_type == 1 (living room).",
|
||
),
|
||
"window_pct_external": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area share of windows with window_location == 0 (external).",
|
||
),
|
||
"window_pct_permanent_shutters": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area share of windows with permanent_shutters_present truthy.",
|
||
),
|
||
# Dwelling — conservatory + flat-only block.
|
||
"conservatory_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Conservatory SAP10 code (1=none, 2=heated, 3=unheated, ...).",
|
||
),
|
||
"has_heated_separate_conservatory": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Whether the dwelling has a heated separate conservatory.",
|
||
),
|
||
"flat_level": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Flat-only: floor number on which the flat sits.",
|
||
),
|
||
"flat_top_storey": ColumnSpec(
|
||
dtype=str, nullable=True, categorical=True,
|
||
description="Flat-only: Y/N flag indicating whether this is the top storey.",
|
||
),
|
||
"flat_storey_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Flat-only: storey count of the building containing the flat.",
|
||
),
|
||
"flat_location": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Flat-only: location SAP10 code (corner/middle/...).",
|
||
),
|
||
"flat_heat_loss_corridor": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Flat-only: heat-loss-corridor SAP10 code.",
|
||
),
|
||
# Energy supply categoricals.
|
||
"meter_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Electricity meter type SAP10 code (1=Standard, 2=Off-peak, ...).",
|
||
),
|
||
"pv_connection": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="PV connection topology SAP10 code.",
|
||
),
|
||
"wind_turbines_terrain_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Wind-turbine terrain type SAP10 code.",
|
||
),
|
||
# Doors.
|
||
"draughtproofed_door_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of draught-proofed doors.",
|
||
),
|
||
"insulated_door_u_value": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="U-value of insulated doors (W/m^2K).",
|
||
),
|
||
# Hot water extras.
|
||
"cylinder_insulation_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Cylinder insulation type SAP10 code (string sentinels -> None).",
|
||
),
|
||
"cylinder_thermostat": ColumnSpec(
|
||
dtype=str, nullable=True, categorical=True,
|
||
description="Cylinder-thermostat flag (Y/N/missing).",
|
||
),
|
||
"secondary_heating_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Secondary heating type SAP10 code (distinct from secondary_fuel_type).",
|
||
),
|
||
# Mechanical ventilation extras.
|
||
"mechanical_vent_duct_placement": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Mechanical-vent duct placement SAP10 code.",
|
||
),
|
||
"mechanical_vent_duct_insulation": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Mechanical-vent duct insulation SAP10 code.",
|
||
),
|
||
"mechanical_vent_duct_insulation_level": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Mechanical-vent duct insulation level SAP10 code.",
|
||
),
|
||
"mechanical_vent_measured_installation": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Whether mechanical ventilation was measured at installation.",
|
||
),
|
||
# Lighting extras.
|
||
"low_energy_fixed_lighting_bulbs_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of low-energy fixed-lighting bulbs (separate from CFL/LED).",
|
||
),
|
||
"fixed_lighting_outlets_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Total number of fixed-lighting outlets.",
|
||
),
|
||
"low_energy_fixed_lighting_outlets_count": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Number of low-energy fixed-lighting outlets.",
|
||
),
|
||
# Window extras (per-window scalars area-weighted across windows).
|
||
"window_avg_glazing_gap_mm": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area-weighted average glazing gap in mm (non-numeric sentinels excluded).",
|
||
),
|
||
"window_avg_frame_factor": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area-weighted average frame factor across windows.",
|
||
),
|
||
"window_pct_permanent_shutters_insulated": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Area share of windows with permanent_shutters_insulated == 'Y'.",
|
||
),
|
||
# Main-dwelling extras: room-in-roof + alternative walls + flat-roof + measured flag.
|
||
"main_dwelling_room_in_roof_floor_area_m2": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Floor area of main dwelling room-in-roof block (when present).",
|
||
),
|
||
"main_dwelling_alternative_wall_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of sap_alternative_wall_* blocks on the main dwelling (0-2).",
|
||
),
|
||
"main_dwelling_alternative_wall_area_m2": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description="Sum of sap_alternative_wall_*.wall_area for the main dwelling.",
|
||
),
|
||
"main_dwelling_flat_roof_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Main dwelling flat-roof insulation thickness in mm (rare).",
|
||
),
|
||
"main_dwelling_wall_thickness_measured": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Main dwelling wall_thickness_measured flag.",
|
||
),
|
||
# Element list counts (split-fabric discriminator).
|
||
"wall_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of entries in the top-level walls EnergyElement list.",
|
||
),
|
||
"roof_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of entries in the top-level roofs EnergyElement list.",
|
||
),
|
||
"floor_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of entries in the top-level floors EnergyElement list.",
|
||
),
|
||
"main_heating_count_elements": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of entries in the top-level main_heating EnergyElement list.",
|
||
),
|
||
"main_heating_controls_present": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="Whether the cert carries a main_heating_controls EnergyElement.",
|
||
),
|
||
# Wind turbine geometry.
|
||
"wind_turbine_hub_height_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Hub height of the (first) wind turbine, metres.",
|
||
),
|
||
"wind_turbine_rotor_diameter_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Rotor diameter of the (first) wind turbine, metres.",
|
||
),
|
||
# Flat extras.
|
||
"flat_unheated_corridor_length_m": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Flat-only: length (m) of any unheated corridor adjacent to the dwelling.",
|
||
),
|
||
# Addendum (~43% present).
|
||
"addendum_stone_walls": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Addendum: stone-wall construction flagged by assessor.",
|
||
),
|
||
"addendum_system_build": ColumnSpec(
|
||
dtype=bool, nullable=True,
|
||
description="Addendum: system-build construction flagged by assessor.",
|
||
),
|
||
"addendum_numbers_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of addendum codes flagged.",
|
||
),
|
||
# Low-carbon energy sources.
|
||
"lzc_energy_sources_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of LZC energy-source codes declared (0 if none).",
|
||
),
|
||
# Extension 1 (first non-main building part; ~36% of certs).
|
||
"extension_1_present": ColumnSpec(
|
||
dtype=bool, nullable=False,
|
||
description="True if there is a building part beyond the Main Dwelling.",
|
||
),
|
||
"extension_1_wall_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Extension 1 wall construction SAP10 code.",
|
||
),
|
||
"extension_1_wall_insulation_type": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Extension 1 wall insulation type SAP10 code.",
|
||
),
|
||
"extension_1_wall_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Extension 1 wall insulation thickness in mm.",
|
||
),
|
||
"extension_1_wall_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Extension 1 external wall thickness in mm.",
|
||
),
|
||
"extension_1_roof_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Extension 1 roof construction SAP10 code.",
|
||
),
|
||
"extension_1_roof_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Extension 1 roof insulation thickness in mm.",
|
||
),
|
||
"extension_1_floor_construction": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Extension 1 ground-floor construction SAP10 code.",
|
||
),
|
||
"extension_1_floor_insulation": ColumnSpec(
|
||
dtype=int, nullable=True, categorical=True,
|
||
description="Extension 1 ground-floor insulation SAP10 code.",
|
||
),
|
||
"extension_1_floor_insulation_thickness_mm": ColumnSpec(
|
||
dtype=int, nullable=True,
|
||
description="Extension 1 floor insulation thickness in mm.",
|
||
),
|
||
"extension_1_total_floor_area_m2": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Extension 1 total floor area (sum of its sap_floor_dimensions).",
|
||
),
|
||
"extension_1_heat_loss_perimeter_m": ColumnSpec(
|
||
dtype=float, nullable=True,
|
||
description="Extension 1 heat-loss perimeter (sum of its sap_floor_dimensions).",
|
||
),
|
||
"other_building_parts_count": ColumnSpec(
|
||
dtype=int, nullable=False,
|
||
description="Number of building parts beyond Main Dwelling and the secondary part.",
|
||
),
|
||
"envelope_heat_loss_w_per_k": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Sum of U*A over walls / roof / floor / party walls / windows / doors "
|
||
"plus thermal-bridging factor y times total exposed area, summed across "
|
||
"every sap_building_part. U-values cascade-default per ADR-0008 so the "
|
||
"feature is never null. Approximates the SAP10.2 worksheet's envelope "
|
||
"conduction loss in W/K."
|
||
),
|
||
),
|
||
"ventilation_heat_loss_w_per_k": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"SAP10.2 §C ventilation heat-loss in W/K from structural infiltration "
|
||
"(0.35 ACH masonry / 0.25 ACH timber) plus open chimneys (40 m³/h each) "
|
||
"minus draught-proofing reduction (0.05 max × window DP share), all "
|
||
"multiplied by dwelling volume × 0.33. Captures the infiltration share "
|
||
"of total heat loss that envelope_heat_loss_w_per_k misses. ADR-0008."
|
||
),
|
||
),
|
||
"seasonal_efficiency_main_heating": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Space-heating seasonal efficiency as a decimal (e.g. 0.84 = 84%), "
|
||
"from SAP10.2 Table 4a/4b keyed on primary_sap_main_heating_code. "
|
||
"Unknown codes fall back to 0.80 (gas-boiler typical). ADR-0008."
|
||
),
|
||
),
|
||
"seasonal_efficiency_water_heating": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Water-heating seasonal efficiency as a decimal. Code 901 ('from main') "
|
||
"inherits the main code's efficiency; unknown -> 0.78 (gas-combi). "
|
||
"ADR-0008."
|
||
),
|
||
),
|
||
"predicted_space_heating_kwh": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Crude annual delivered space-heating kWh: envelope_heat_loss_w_per_k * "
|
||
"HDH_region * 1e-3 / seasonal_efficiency_main_heating. HDH from a 22-row "
|
||
"SAP-region lookup; UK average ~53,000 K*h/yr. ADR-0008."
|
||
),
|
||
),
|
||
"predicted_hot_water_kwh": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Crude annual delivered hot-water kWh from SAP10.2 Appendix J simplified: "
|
||
"occupancy from TFA, daily volume 25*N+36 L, delta-T 43 K, +10% losses, "
|
||
"divided by water-heating efficiency. ADR-0008."
|
||
),
|
||
),
|
||
"predicted_lighting_kwh": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Crude annual lighting kWh from SAP10.2 Section L simplified: "
|
||
"9.3 * TFA reduced by 50% LED share + 40% CFL share. ADR-0008."
|
||
),
|
||
),
|
||
"predicted_pv_generation_kwh": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Annual PV generation kWh: pv_total_peak_power_kw * yield_factor "
|
||
"(SAP10.2 Table 6e region-keyed; UK avg 850 kWh/kWp/yr). "
|
||
"Subtracted from predicted_total_fuel_cost at the standard "
|
||
"electricity rate per SAP10 §13 (slice 17a)."
|
||
),
|
||
),
|
||
"predicted_total_fuel_cost_gbp": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"Annual regulated fuel cost (gbp/yr): space + DHW + lighting kWh "
|
||
"multiplied by Table 32 unit prices. Standing charges omitted "
|
||
"(approximately a constant fuel-mix offset the model can learn). "
|
||
"ADR-0008 '+ Lighting' scope."
|
||
),
|
||
),
|
||
"predicted_ecf": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"SAP10 §20.1 Energy Cost Factor: 0.42 * predicted_total_fuel_cost / "
|
||
"(TFA + 45). SAP score is a piecewise log/linear function of ECF. "
|
||
"ADR-0008."
|
||
),
|
||
),
|
||
"predicted_log10_ecf": ColumnSpec(
|
||
dtype=float, nullable=False,
|
||
description=(
|
||
"log10 of predicted_ecf. Monotone with sap_score so a tree-based "
|
||
"model can use this as a near-target feature; the SAP rating's "
|
||
"piecewise kink at ECF=3.5 is one further split. ADR-0008."
|
||
),
|
||
),
|
||
}
|
||
|
||
|
||
# Manifest of the six label (target) columns the transform emits next to the
# features. Every target is declared non-nullable, although
# `EpcMlTransform.to_row` can emit None for some of them (e.g. when the RHI
# payload is missing) — presumably such rows are dropped before training;
# TODO confirm against the consumer.
_TARGET_COLUMNS: dict[str, ColumnSpec] = {
    "sap_score": ColumnSpec(
        dtype=int, nullable=False,
        description="SAP10 energy rating, from `energy_rating_current` on the EPC.",
    ),
    "co2_emissions": ColumnSpec(
        dtype=float, nullable=False,
        description="Annual CO2 emissions in tonnes/yr, from `co2_emissions_current`.",
    ),
    "peui_raw": ColumnSpec(
        dtype=int, nullable=False,
        description=(
            "Primary energy intensity (kWh/m2/yr), "
            "from `energy_consumption_current`, untransformed."
        ),
    ),
    "peui_ucl": ColumnSpec(
        dtype=float, nullable=False,
        description=(
            "Primary energy intensity (kWh/m2/yr) with Few et al. 2023 "
            "per-band UCL correction folded into the training label (ADR-0007)."
        ),
    ),
    "space_heating_kwh": ColumnSpec(
        dtype=float, nullable=False,
        description=(
            "Annual space heating delivered kWh, "
            "from `renewable_heat_incentive.space_heating_existing_dwelling`."
        ),
    ),
    "hot_water_kwh": ColumnSpec(
        dtype=float, nullable=False,
        description=(
            "Annual hot water delivered kWh, "
            "from `renewable_heat_incentive.water_heating`."
        ),
    ),
}
|
||
|
||
|
||
class EpcMlTransform:
    """Map an `EpcPropertyData` to a fixed-width row of ML features + targets.

    `schema()` publishes the column manifest, `to_row()` builds one row as a
    plain dict and `to_rows()` stacks many rows into a DataFrame. `VERSION`
    is the authoritative identifier of the contract revision and is embedded
    in the schema as `transform_version`.
    """

    VERSION: str = "2.7.1"

    def schema(self) -> TransformSchema:
        """The cross-repo ML data contract.

        Returns the column manifest the AutoGluon repo reads to know which
        columns are features, which are targets, and their dtypes. The
        manifests are shallow-copied so callers cannot mutate the module-level
        dictionaries through the returned schema.
        """
        return TransformSchema(
            transform_version=self.VERSION,
            feature_columns=dict(_FEATURE_COLUMNS),
            target_columns=dict(_TARGET_COLUMNS),
        )

    def to_rows(self, properties: Iterable[EpcPropertyData]) -> pd.DataFrame:
        """Apply `to_row` across many properties and return a typed DataFrame.

        Columns flagged `categorical=True` in the schema are cast to
        `pd.Categorical`; everything else is left at pandas-inferred dtype.
        The DataFrame always carries every advertised column (features first,
        then targets), even when the input is empty.
        """
        schema = self.schema()
        # One ordered list of (name, spec) pairs drives both the column order
        # and the categorical cast, so the two can never drift apart.
        specs = [
            *schema.feature_columns.items(),
            *schema.target_columns.items(),
        ]
        all_columns = [name for name, _ in specs]
        rows = [self.to_row(epc) for epc in properties]
        df = pd.DataFrame(rows, columns=all_columns)
        for name, spec in specs:
            if spec.categorical:
                df[name] = df[name].astype("category")
        return df

    def to_row(self, epc: EpcPropertyData) -> dict[str, Any]:
        """Map an EpcPropertyData to a single row of features + targets.

        The row combines raw scalar fields read straight off `epc`, the
        window / building-part / heating / PV / energy-source aggregate
        dictionaries, the engineered physics features (ADR-0008) derived from
        those aggregates, and the six targets.

        Target values are passed through as found: `peui_ucl` and the two RHI
        targets are None when their inputs are missing, so callers must filter
        such rows before using them as training labels.
        """
        rhi = epc.renewable_heat_incentive
        sap_heating = epc.sap_heating
        energy_source = epc.sap_energy_source

        window_aggregates = _window_aggregates(epc.sap_windows)
        building_part_aggregates = _building_part_aggregates(epc.sap_building_parts)
        heating_aggregates = _heating_aggregates(sap_heating)
        pv_aggregates = _pv_aggregates(energy_source)
        energy_source_other = _energy_source_other_aggregates(energy_source)

        # --- Fabric (conduction) heat loss -------------------------------
        envelope_w_per_k = envelope_heat_loss_w_per_k(
            sap_building_parts=epc.sap_building_parts,
            country_code=epc.country_code,
            window_total_area_m2=float(window_aggregates.get("window_total_area_m2") or 0.0),
            window_avg_u_value=window_aggregates.get("window_avg_u_value"),
            door_count=epc.door_count,
            insulated_door_count=epc.insulated_door_count,
            insulated_door_u_value=epc.insulated_door_u_value,
            roof_description=_joined_descriptions(epc.roofs),
            wall_description=_joined_descriptions(epc.walls),
        )

        # --- Ventilation heat loss ---------------------------------------
        main_wall_con = building_part_aggregates.get("main_dwelling_wall_construction")
        # Wall-construction codes 5 and 6 are treated as timber frame here.
        is_timber_frame = isinstance(main_wall_con, int) and main_wall_con in (5, 6)
        avg_room_h = building_part_aggregates.get("avg_room_height_m")
        window_dp_pct = window_aggregates.get("window_pct_draught_proofed")
        ventilation_w_per_k = ventilation_heat_loss_w_per_k(
            total_floor_area_m2=epc.total_floor_area_m2,
            # Fall back to a 2.5 m storey height when no room height is known.
            avg_room_height_m=float(avg_room_h) if isinstance(avg_room_h, (int, float)) else 2.5,
            is_timber_frame=is_timber_frame,
            open_chimneys_count=epc.open_chimneys_count,
            window_pct_draught_proofed=(
                float(window_dp_pct) if isinstance(window_dp_pct, (int, float)) else None
            ),
        )

        # --- System efficiencies -----------------------------------------
        # Heating codes may arrive as non-int sentinels; only ints are passed
        # on to the lookup helpers.
        main_heating_code = _int_or_none(heating_aggregates.get("primary_sap_main_heating_code"))
        water_code = _int_or_none(heating_aggregates.get("water_heating_code"))
        main_category = _int_or_none(heating_aggregates.get("primary_main_heating_category"))
        main_fuel = _int_or_none(heating_aggregates.get("primary_main_fuel_type"))
        water_fuel = _int_or_none(heating_aggregates.get("water_heating_fuel"))
        space_eff = seasonal_efficiency(
            main_heating_code,
            main_heating_category=main_category,
            main_fuel_type=main_fuel,
        )
        water_eff = water_heating_efficiency(
            water_heating_code=water_code,
            main_heating_code=main_heating_code,
        )

        # --- Predicted energy demand -------------------------------------
        pred_space_kwh = predicted_space_heating_kwh(
            envelope_heat_loss_w_per_k=envelope_w_per_k,
            region_code=epc.region_code,
            seasonal_efficiency_main=space_eff,
            ventilation_heat_loss_w_per_k=ventilation_w_per_k,
        )
        main_age = building_part_aggregates.get("main_dwelling_construction_age_band")
        wwhrs_baths = sap_heating.number_baths_wwhrs
        pred_hw_kwh = predicted_hot_water_kwh(
            total_floor_area_m2=epc.total_floor_area_m2,
            seasonal_efficiency_water=water_eff,
            cylinder_size=_int_or_none(heating_aggregates.get("cylinder_size")),
            cylinder_insulation_thickness_mm=_int_or_none(
                heating_aggregates.get("cylinder_insulation_thickness_mm")
            ),
            # `_heating_aggregates` does not carry the cylinder insulation
            # type, so looking it up there always produced None. Read it from
            # the heating payload, exactly as the feature column below does.
            # NOTE(review): this changes predicted_hot_water_kwh for certs
            # that declare a cylinder insulation type — refresh golden
            # fixtures and consider bumping VERSION.
            cylinder_insulation_type=_int_or_none(sap_heating.cylinder_insulation_type),
            age_band=main_age if isinstance(main_age, str) else None,
            has_wwhrs=bool(wwhrs_baths and wwhrs_baths > 0),
            has_solar_water_heating=epc.solar_water_heating,
        )
        pred_light_kwh = predicted_lighting_kwh(
            total_floor_area_m2=epc.total_floor_area_m2,
            cfl_count=epc.cfl_fixed_lighting_bulbs_count,
            led_count=epc.led_fixed_lighting_bulbs_count,
            incandescent_count=epc.incandescent_fixed_lighting_bulbs_count,
        )
        pv_kw = pv_aggregates.get("pv_total_peak_power_kw") or 0.0
        pred_pv_kwh = predicted_pv_generation_kwh(
            pv_total_peak_power_kw=float(pv_kw),
            region_code=epc.region_code,
        )

        # --- Cost and Energy Cost Factor ---------------------------------
        pred_cost = predicted_total_fuel_cost_gbp(
            predicted_space_heating_kwh=pred_space_kwh,
            predicted_hot_water_kwh=pred_hw_kwh,
            predicted_lighting_kwh=pred_light_kwh,
            main_fuel_code=main_fuel,
            water_heating_fuel_code=water_fuel,
            predicted_pv_kwh=pred_pv_kwh,
        )
        pred_ecf_v = predicted_ecf(
            predicted_total_cost_gbp=pred_cost,
            total_floor_area_m2=epc.total_floor_area_m2,
        )
        pred_log10_ecf_v = predicted_log10_ecf(pred_ecf_v)

        return {
            # Features — geometry
            "total_floor_area_m2": epc.total_floor_area_m2,
            # Features — counts
            "door_count": epc.door_count,
            "habitable_rooms_count": epc.habitable_rooms_count,
            "heated_rooms_count": epc.heated_rooms_count,
            "wet_rooms_count": epc.wet_rooms_count,
            "extensions_count": epc.extensions_count,
            "open_chimneys_count": epc.open_chimneys_count,
            "insulated_door_count": epc.insulated_door_count,
            "cfl_fixed_lighting_bulbs_count": epc.cfl_fixed_lighting_bulbs_count,
            "led_fixed_lighting_bulbs_count": epc.led_fixed_lighting_bulbs_count,
            "incandescent_fixed_lighting_bulbs_count": epc.incandescent_fixed_lighting_bulbs_count,
            # Features — booleans
            "solar_water_heating": epc.solar_water_heating,
            "has_hot_water_cylinder": epc.has_hot_water_cylinder,
            "has_fixed_air_conditioning": epc.has_fixed_air_conditioning,
            # Features — optional integer indicators
            "percent_draughtproofed": epc.percent_draughtproofed,
            # Features — categoricals (raw strings; cast at parquet write time)
            "dwelling_type": epc.dwelling_type,
            "transaction_type": epc.transaction_type,
            "property_type": epc.property_type,
            "built_form": epc.built_form,
            "region_code": epc.region_code,
            "country_code": epc.country_code,
            # Features — window aggregates (physics + orientation)
            **window_aggregates,
            # Features — building parts aggregates + Main Dwelling carve-out
            **building_part_aggregates,
            # Features — engineered physics (ADR-0008)
            "envelope_heat_loss_w_per_k": envelope_w_per_k,
            "ventilation_heat_loss_w_per_k": ventilation_w_per_k,
            "seasonal_efficiency_main_heating": space_eff,
            "seasonal_efficiency_water_heating": water_eff,
            "predicted_space_heating_kwh": pred_space_kwh,
            "predicted_hot_water_kwh": pred_hw_kwh,
            "predicted_lighting_kwh": pred_light_kwh,
            "predicted_pv_generation_kwh": pred_pv_kwh,
            "predicted_total_fuel_cost_gbp": pred_cost,
            "predicted_ecf": pred_ecf_v,
            "predicted_log10_ecf": pred_log10_ecf_v,
            # Features — heating system (primary slot + water + secondary)
            **heating_aggregates,
            # Features — PV (capacity source + array aggregates by SAP octant)
            **pv_aggregates,
            # Features — battery, wind turbine, mains gas + smart meter flags
            **energy_source_other,
            # Features — ventilation
            "mechanical_ventilation": epc.mechanical_ventilation,
            "mechanical_vent_duct_type": epc.mechanical_vent_duct_type,
            "blocked_chimneys_count": epc.blocked_chimneys_count,
            "pressure_test": epc.pressure_test,
            # Features — dwelling-level fabric + demand scalars
            "multiple_glazed_proportion": epc.multiple_glazed_proportion,
            "number_baths": sap_heating.number_baths,
            "number_baths_wwhrs": sap_heating.number_baths_wwhrs,
            "extract_fans_count": epc.extract_fans_count,
            # Features — conservatory + flat-only block
            "conservatory_type": epc.conservatory_type,
            "has_heated_separate_conservatory": epc.has_heated_separate_conservatory,
            "flat_level": (
                _int_or_none(epc.sap_flat_details.level) if epc.sap_flat_details else None
            ),
            "flat_top_storey": (
                epc.sap_flat_details.top_storey if epc.sap_flat_details else None
            ),
            "flat_storey_count": (
                _int_or_none(epc.sap_flat_details.storey_count) if epc.sap_flat_details else None
            ),
            "flat_location": (
                _int_or_none(epc.sap_flat_details.flat_location) if epc.sap_flat_details else None
            ),
            "flat_heat_loss_corridor": (
                _int_or_none(epc.sap_flat_details.heat_loss_corridor)
                if epc.sap_flat_details
                else None
            ),
            # Features — energy supply categoricals
            "meter_type": _meter_type_int(energy_source.meter_type),
            "pv_connection": energy_source.pv_connection,
            "wind_turbines_terrain_type": _wind_terrain_int(
                energy_source.wind_turbines_terrain_type
            ),
            # Features — doors
            "draughtproofed_door_count": epc.draughtproofed_door_count,
            "insulated_door_u_value": epc.insulated_door_u_value,
            # Features — hot water extras
            "cylinder_insulation_type": _int_or_none(sap_heating.cylinder_insulation_type),
            "cylinder_thermostat": sap_heating.cylinder_thermostat,
            "secondary_heating_type": _int_or_none(sap_heating.secondary_heating_type),
            # Features — mechanical ventilation extras
            "mechanical_vent_duct_placement": epc.mechanical_vent_duct_placement,
            "mechanical_vent_duct_insulation": epc.mechanical_vent_duct_insulation,
            "mechanical_vent_duct_insulation_level": epc.mechanical_vent_duct_insulation_level,
            "mechanical_vent_measured_installation": _truthy_yn(
                epc.mechanical_vent_measured_installation
            ),
            # Features — lighting extras
            "low_energy_fixed_lighting_bulbs_count": epc.low_energy_fixed_lighting_bulbs_count,
            "fixed_lighting_outlets_count": epc.fixed_lighting_outlets_count,
            "low_energy_fixed_lighting_outlets_count": epc.low_energy_fixed_lighting_outlets_count,
            # Features — element list counts (split-fabric discriminators)
            "wall_count": len(epc.walls),
            "roof_count": len(epc.roofs),
            "floor_count": len(epc.floors),
            "main_heating_count_elements": len(epc.main_heating),
            "main_heating_controls_present": epc.main_heating_controls is not None,
            # Features — wind turbine geometry
            "wind_turbine_hub_height_m": (
                energy_source.wind_turbine_details.hub_height
                if energy_source.wind_turbine_details is not None
                else None
            ),
            "wind_turbine_rotor_diameter_m": (
                energy_source.wind_turbine_details.rotor_diameter
                if energy_source.wind_turbine_details is not None
                else None
            ),
            # Features — flat unheated corridor length
            "flat_unheated_corridor_length_m": (
                epc.sap_flat_details.unheated_corridor_length_m
                if epc.sap_flat_details is not None
                else None
            ),
            # Features — addendum + LZC
            "addendum_stone_walls": (
                epc.addendum.stone_walls if epc.addendum is not None else None
            ),
            "addendum_system_build": (
                epc.addendum.system_build if epc.addendum is not None else None
            ),
            "addendum_numbers_count": (
                len(epc.addendum.addendum_numbers)
                if epc.addendum is not None and epc.addendum.addendum_numbers is not None
                else 0
            ),
            "lzc_energy_sources_count": (
                len(epc.lzc_energy_sources) if epc.lzc_energy_sources is not None else 0
            ),
            # Targets
            "sap_score": epc.energy_rating_current,
            "co2_emissions": epc.co2_emissions_current,
            "peui_raw": epc.energy_consumption_current,
            "peui_ucl": _peui_ucl(epc),
            "space_heating_kwh": rhi.space_heating_kwh if rhi is not None else None,
            "hot_water_kwh": rhi.water_heating_kwh if rhi is not None else None,
        }
|
||
|
||
|
||
def _peui_ucl(epc: EpcPropertyData) -> Optional[float]:
|
||
"""Apply the Few et al. per-band UCL correction to PEUI for training labels.
|
||
|
||
Returns None when:
|
||
- either the raw PEUI or the SAP score is missing, or
|
||
- the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI)
|
||
so the UCL correction is undefined.
|
||
Those rows are unusable as `peui_ucl` training labels and should be dropped
|
||
upstream rather than crashing the transform.
|
||
"""
|
||
if epc.energy_consumption_current is None or epc.energy_rating_current is None:
|
||
return None
|
||
peui_raw = float(epc.energy_consumption_current)
|
||
if peui_raw <= 0:
|
||
return None
|
||
band = Epc.from_sap_score(epc.energy_rating_current)
|
||
return apply_ucl_correction(peui_raw, band)
|
||
|
||
|
||
def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]:
    """Summarise the PV side of `sap_energy_source` as flat feature columns.

    The result holds four scalar columns (`has_pv`, `pv_capacity_source`,
    `pv_array_count`, `pv_total_peak_power_kw`), one
    `pv_peak_power_kw_<octant>` column per name in `_OCTANT_NAMES`, and three
    optional columns (`pv_avg_pitch`, `pv_avg_overshading`,
    `pv_percent_roof_area`).

    `pv_capacity_source` tells the three PV states apart:
    - 'measured': `es.photovoltaic_arrays` is non-empty, so the array
      aggregates are filled in;
    - 'estimated_from_roof_area': no arrays, but a positive
      `percent_roof_area` is recorded on the supply payload;
    - 'none': neither of the above.
    """
    result: dict[str, Any] = {
        "has_pv": False,
        "pv_capacity_source": "none",
        "pv_array_count": 0,
        "pv_total_peak_power_kw": 0.0,
    }
    for octant in _OCTANT_NAMES.values():
        result[f"pv_peak_power_kw_{octant}"] = 0.0
    result["pv_avg_pitch"] = None
    result["pv_avg_overshading"] = None
    result["pv_percent_roof_area"] = None

    arrays = es.photovoltaic_arrays
    if not arrays:
        # No measured arrays: fall back to the roof-area estimate, if any.
        supply = es.photovoltaic_supply
        if supply is not None:
            roof_share = supply.none_or_no_details.percent_roof_area
            if roof_share > 0:
                result["has_pv"] = True
                result["pv_capacity_source"] = "estimated_from_roof_area"
                result["pv_percent_roof_area"] = supply.none_or_no_details.percent_roof_area
        return result

    peak_total = 0.0
    pitch_sum = 0.0
    overshading_sum = 0.0
    for array in arrays:
        peak = array.peak_power
        peak_total += peak
        # Pitch and overshading are averaged weighted by peak power.
        pitch_sum += array.pitch * peak
        overshading_sum += array.overshading * peak
        # Arrays whose orientation is not a known octant still count towards
        # the total but are not attributed to any octant column.
        if array.orientation in _OCTANT_NAMES:
            result[f"pv_peak_power_kw_{_OCTANT_NAMES[array.orientation]}"] += peak

    result["has_pv"] = True
    result["pv_capacity_source"] = "measured"
    result["pv_array_count"] = len(arrays)
    result["pv_total_peak_power_kw"] = peak_total
    if peak_total > 0:
        result["pv_avg_pitch"] = pitch_sum / peak_total
        result["pv_avg_overshading"] = overshading_sum / peak_total
    return result
|
||
|
||
|
||
def _energy_source_other_aggregates(es: SapEnergySource) -> dict[str, Any]:
|
||
"""Pull battery, wind turbine, and household energy source flags.
|
||
|
||
Battery capacity multiplies pv_battery_count by the per-unit capacity carried
|
||
on pv_batteries.pv_battery; null when no battery is present.
|
||
"""
|
||
battery_capacity_kwh: Optional[float] = None
|
||
if es.pv_battery_count > 0 and es.pv_batteries is not None:
|
||
battery_capacity_kwh = (
|
||
es.pv_battery_count * es.pv_batteries.pv_battery.battery_capacity
|
||
)
|
||
return {
|
||
"has_pv_battery": es.pv_battery_count > 0,
|
||
"pv_battery_count": es.pv_battery_count,
|
||
"pv_battery_capacity_kwh": battery_capacity_kwh,
|
||
"has_wind_turbine": es.wind_turbines_count > 0,
|
||
"wind_turbine_count": es.wind_turbines_count,
|
||
"mains_gas": es.mains_gas,
|
||
"electricity_smart_meter_present": es.electricity_smart_meter_present,
|
||
"gas_smart_meter_present": es.gas_smart_meter_present,
|
||
"is_dwelling_export_capable": es.is_dwelling_export_capable,
|
||
}
|
||
|
||
|
||
def _heating_aggregates(sap_heating: SapHeating) -> dict[str, Any]:
    """Flatten `sap_heating` into 20 heating-feature columns.

    Hybrid Top-1: the `primary_*` columns describe only the first entry of
    `main_heating_details` and are all None when that list is empty. Water
    heating, cylinder and secondary-heating columns are read directly off
    `sap_heating`.

    Fuel type, emitter type, heating control, emitter temperature, cylinder
    size and immersion type are kept only when they are ints; any other value
    (e.g. a string sentinel) becomes None. The remaining fields are passed
    through unchanged. `shower_outlet_count` is 1 when `shower_outlets` is
    present and 0 otherwise.
    """
    details = sap_heating.main_heating_details
    has_primary = bool(details)
    primary = details[0] if has_primary else None

    def primary_field(attr: str) -> Any:
        # Attribute of the primary heating system, or None when there is none.
        return getattr(primary, attr) if has_primary else None

    return {
        "main_heating_count": len(details),
        "primary_main_fuel_type": _int_or_none(primary_field("main_fuel_type")),
        "primary_heat_emitter_type": _int_or_none(primary_field("heat_emitter_type")),
        "primary_main_heating_control": _int_or_none(primary_field("main_heating_control")),
        "primary_main_heating_category": primary_field("main_heating_category"),
        "primary_has_fghrs": primary_field("has_fghrs"),
        "primary_fan_flue_present": primary_field("fan_flue_present"),
        "primary_boiler_flue_type": primary_field("boiler_flue_type"),
        "primary_central_heating_pump_age": primary_field("central_heating_pump_age"),
        "primary_sap_main_heating_code": primary_field("sap_main_heating_code"),
        "primary_emitter_temperature": _int_or_none(primary_field("emitter_temperature")),
        "primary_main_heating_fraction": primary_field("main_heating_fraction"),
        "water_heating_code": sap_heating.water_heating_code,
        "water_heating_fuel": sap_heating.water_heating_fuel,
        "cylinder_size": _int_or_none(sap_heating.cylinder_size),
        "cylinder_insulation_thickness_mm": sap_heating.cylinder_insulation_thickness_mm,
        "has_secondary_heating": sap_heating.secondary_fuel_type is not None,
        "secondary_fuel_type": sap_heating.secondary_fuel_type,
        "immersion_heating_type": _int_or_none(sap_heating.immersion_heating_type),
        "shower_outlet_count": 0 if sap_heating.shower_outlets is None else 1,
    }
|
||
|
||
|
||
_MAIN_DWELLING_FABRIC_COLUMNS = (
|
||
"main_dwelling_wall_insulation_type",
|
||
"main_dwelling_wall_insulation_thickness_mm",
|
||
"main_dwelling_wall_dry_lined",
|
||
"main_dwelling_wall_thickness_mm",
|
||
"main_dwelling_party_wall_construction",
|
||
"main_dwelling_roof_insulation_location",
|
||
"main_dwelling_roof_insulation_thickness_mm",
|
||
"main_dwelling_floor_construction",
|
||
"main_dwelling_floor_insulation",
|
||
"main_dwelling_floor_insulation_thickness_mm",
|
||
"main_dwelling_floor_heat_loss",
|
||
"main_dwelling_room_in_roof_floor_area_m2",
|
||
"main_dwelling_flat_roof_insulation_thickness_mm",
|
||
"main_dwelling_wall_thickness_measured",
|
||
)
|
||
|
||
_SECONDARY_DWELLING_FABRIC_COLUMNS = (
|
||
"extension_1_wall_construction",
|
||
"extension_1_wall_insulation_type",
|
||
"extension_1_wall_insulation_thickness_mm",
|
||
"extension_1_wall_thickness_mm",
|
||
"extension_1_roof_construction",
|
||
"extension_1_roof_insulation_thickness_mm",
|
||
"extension_1_floor_construction",
|
||
"extension_1_floor_insulation",
|
||
"extension_1_floor_insulation_thickness_mm",
|
||
"extension_1_total_floor_area_m2",
|
||
"extension_1_heat_loss_perimeter_m",
|
||
)
|
||
|
||
|
||
def _parse_thickness_mm(value: Any) -> Optional[int]:
|
||
"""Parse a SAP10 insulation-thickness string ('100mm', '400mm+', 'NI', 'ND') to int mm.
|
||
|
||
Returns 0 for 'NI' (No Insulation — semantically meaningful as 0mm). Returns None
|
||
for unparseable sentinels like 'ND' or '(assumed)'.
|
||
"""
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, int):
|
||
return value
|
||
if not isinstance(value, str):
|
||
return None
|
||
s = value.strip()
|
||
if s.upper() == "NI":
|
||
return 0
|
||
digits = ""
|
||
for c in s:
|
||
if c.isdigit():
|
||
digits += c
|
||
else:
|
||
break
|
||
return int(digits) if digits else None
|
||
|
||
|
||
def _int_or_none(value: Any) -> Optional[int]:
|
||
"""Treat int values as-is, drop string sentinels like 'NA'/'NI'/'ND'."""
|
||
return value if isinstance(value, int) else None
|
||
|
||
|
||
def _meter_type_int(value: Any) -> Optional[int]:
|
||
"""Domain mapper coerces sap_energy_source.meter_type to str(int) for site-notes
|
||
compatibility ("1", "2", ...). Parse back to int for the categorical feature."""
|
||
if isinstance(value, int):
|
||
return value
|
||
if isinstance(value, str) and value.isdigit():
|
||
return int(value)
|
||
return None
|
||
|
||
|
||
def _wind_terrain_int(value: Any) -> Optional[int]:
|
||
"""Same shape as meter_type — int coerced to str by the 21.0.x mapper."""
|
||
if isinstance(value, int):
|
||
return value
|
||
if isinstance(value, str) and value.isdigit():
|
||
return int(value)
|
||
return None
|
||
|
||
|
||
def _truthy_yn(value: Any) -> Optional[bool]:
|
||
"""Map 'Y'/'true'/True to True, 'N'/'false'/False to False, anything else to None."""
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, bool):
|
||
return value
|
||
if isinstance(value, str):
|
||
v = value.strip().lower()
|
||
if v in ("y", "true", "yes", "1"):
|
||
return True
|
||
if v in ("n", "false", "no", "0"):
|
||
return False
|
||
return None
|
||
|
||
|
||
def _joined_descriptions(elements: list[EnergyElement]) -> Optional[str]:
|
||
"""Concatenate `description` text across an `EnergyElement` list.
|
||
|
||
Used so envelope_heat_loss_w_per_k can spot worst-case markers ("no
|
||
insulation" / "limited insulation") across every roof / wall / floor entry
|
||
on the cert, since those are top-level lists not keyed by building part.
|
||
Returns None when the list is empty so callers can short-circuit.
|
||
"""
|
||
if not elements:
|
||
return None
|
||
parts = [e.description for e in elements if e.description]
|
||
if not parts:
|
||
return None
|
||
return " | ".join(parts)
|
||
|
||
|
||
def _ground_floor(part: SapBuildingPart) -> Optional[Any]:
|
||
"""Pick the ground-floor `SapFloorDimension` (floor==0) for a building part.
|
||
|
||
Falls back to the first floor dimension if no part is flagged as ground floor.
|
||
Returns None if the part has no floor dimensions at all.
|
||
"""
|
||
if not part.sap_floor_dimensions:
|
||
return None
|
||
for fd in part.sap_floor_dimensions:
|
||
if fd.floor == 0:
|
||
return fd
|
||
return part.sap_floor_dimensions[0]
|
||
|
||
|
||
def _building_part_aggregates(parts: list[SapBuildingPart]) -> dict[str, Any]:
    """Flatten `sap_building_parts` into one dict of feature columns.

    Three groups of columns are produced:

    * Cross-all aggregates — part count, summed heat-loss perimeter,
      party-wall length and floor area, plus the floor-area-weighted average
      room height — computed over every floor dimension of every part. These
      always populate (zeros when there are no parts; the average stays None
      while the total floor area is not positive).
    * Main-Dwelling columns, populated only when a part whose `identifier` is
      `BuildingPartIdentifier.MAIN` is present. Otherwise they keep their
      defaults (None, or zero for the alternative-wall count / area) — we
      don't silently fall back to the first part.
    * `extension_1_*` columns, taken from the first part in the list that is
      not the main dwelling. Any parts beyond those two are only counted, in
      `other_building_parts_count`.

    Every name in `_MAIN_DWELLING_FABRIC_COLUMNS` and
    `_SECONDARY_DWELLING_FABRIC_COLUMNS` is pre-seeded with None before any
    part is inspected.

    Args:
        parts: The building parts to aggregate; may be empty.

    Returns:
        A dict mapping column name to its aggregated value.
    """
    main = next(
        (p for p in parts if p.identifier is BuildingPartIdentifier.MAIN), None
    )
    aggregates: dict[str, Any] = {
        "building_parts_count": len(parts),
        "total_heat_loss_perimeter_m": 0.0,
        "total_party_wall_length_m": 0.0,
        "total_floor_area_from_parts_m2": 0.0,
        "avg_room_height_m": None,
        "main_dwelling_heat_loss_perimeter_m": None,
        "main_dwelling_party_wall_length_m": None,
        "main_dwelling_total_floor_area_m2": None,
        "main_dwelling_avg_room_height_m": None,
        "main_dwelling_has_room_in_roof": None,
        "main_dwelling_construction_age_band": None,
        "main_dwelling_wall_construction": None,
        "main_dwelling_roof_construction": None,
        "main_dwelling_alternative_wall_count": 0,
        "main_dwelling_alternative_wall_area_m2": 0.0,
        "extension_1_present": False,
        "other_building_parts_count": 0,
    }
    for col in _MAIN_DWELLING_FABRIC_COLUMNS:
        aggregates[col] = None
    for col in _SECONDARY_DWELLING_FABRIC_COLUMNS:
        aggregates[col] = None
    if not parts:
        return aggregates

    # Cross-all totals over every floor dimension of every part.
    total_floor_area = 0.0
    weighted_room_height = 0.0
    for p in parts:
        for fd in p.sap_floor_dimensions:
            aggregates["total_heat_loss_perimeter_m"] += fd.heat_loss_perimeter_m
            aggregates["total_party_wall_length_m"] += fd.party_wall_length_m
            total_floor_area += fd.total_floor_area_m2
            weighted_room_height += fd.room_height_m * fd.total_floor_area_m2
    aggregates["total_floor_area_from_parts_m2"] = total_floor_area
    if total_floor_area > 0:
        # Floor-area-weighted mean; left as None when there is no area to weight by.
        aggregates["avg_room_height_m"] = weighted_room_height / total_floor_area

    if main is not None:
        main_floor_area = 0.0
        main_weighted_height = 0.0
        main_hlp = 0.0
        main_pwl = 0.0
        for fd in main.sap_floor_dimensions:
            main_hlp += fd.heat_loss_perimeter_m
            main_pwl += fd.party_wall_length_m
            main_floor_area += fd.total_floor_area_m2
            main_weighted_height += fd.room_height_m * fd.total_floor_area_m2
        aggregates["main_dwelling_heat_loss_perimeter_m"] = main_hlp
        aggregates["main_dwelling_party_wall_length_m"] = main_pwl
        aggregates["main_dwelling_total_floor_area_m2"] = main_floor_area
        if main_floor_area > 0:
            aggregates["main_dwelling_avg_room_height_m"] = (
                main_weighted_height / main_floor_area
            )
        aggregates["main_dwelling_has_room_in_roof"] = main.sap_room_in_roof is not None
        aggregates["main_dwelling_construction_age_band"] = main.construction_age_band
        # Only an int wall construction is kept; any other type becomes None.
        aggregates["main_dwelling_wall_construction"] = (
            main.wall_construction
            if isinstance(main.wall_construction, int)
            else None
        )
        aggregates["main_dwelling_roof_construction"] = main.roof_construction
        # New fabric inputs: walls
        aggregates["main_dwelling_wall_insulation_type"] = _int_or_none(
            main.wall_insulation_type
        )
        aggregates["main_dwelling_wall_insulation_thickness_mm"] = _parse_thickness_mm(
            main.wall_insulation_thickness
        )
        aggregates["main_dwelling_wall_dry_lined"] = main.wall_dry_lined
        aggregates["main_dwelling_wall_thickness_mm"] = main.wall_thickness_mm
        aggregates["main_dwelling_party_wall_construction"] = _int_or_none(
            main.party_wall_construction
        )
        # New fabric inputs: roof
        aggregates["main_dwelling_roof_insulation_location"] = _int_or_none(
            main.roof_insulation_location
        )
        aggregates["main_dwelling_roof_insulation_thickness_mm"] = _parse_thickness_mm(
            main.roof_insulation_thickness
        )
        # New fabric inputs: floor — heat loss and insulation thickness come from
        # the part itself, construction / insulation from its ground-floor
        # SapFloorDimension.
        aggregates["main_dwelling_floor_heat_loss"] = main.floor_heat_loss
        aggregates["main_dwelling_floor_insulation_thickness_mm"] = _parse_thickness_mm(
            main.floor_insulation_thickness
        )
        ground_floor = _ground_floor(main)
        if ground_floor is not None:
            aggregates["main_dwelling_floor_construction"] = ground_floor.floor_construction
            aggregates["main_dwelling_floor_insulation"] = ground_floor.floor_insulation
        # Main dwelling extras: room-in-roof, alternative walls, flat-roof, measured flag.
        if main.sap_room_in_roof is not None:
            aggregates["main_dwelling_room_in_roof_floor_area_m2"] = float(
                main.sap_room_in_roof.floor_area
            )
        alt_count = 0
        alt_area = 0.0
        for alt in (main.sap_alternative_wall_1, main.sap_alternative_wall_2):
            if alt is not None:
                alt_count += 1
                alt_area += float(alt.wall_area)
        aggregates["main_dwelling_alternative_wall_count"] = alt_count
        aggregates["main_dwelling_alternative_wall_area_m2"] = alt_area
        aggregates["main_dwelling_flat_roof_insulation_thickness_mm"] = _parse_thickness_mm(
            main.flat_roof_insulation_thickness
        )
        aggregates["main_dwelling_wall_thickness_measured"] = main.wall_thickness_measured

    # Extension 1 — first non-main entry in the list.
    secondary = next(
        (p for p in parts if p.identifier is not BuildingPartIdentifier.MAIN), None
    )
    if secondary is not None:
        aggregates["extension_1_present"] = True
        aggregates["extension_1_wall_construction"] = _int_or_none(
            secondary.wall_construction
        )
        aggregates["extension_1_wall_insulation_type"] = _int_or_none(
            secondary.wall_insulation_type
        )
        aggregates["extension_1_wall_insulation_thickness_mm"] = _parse_thickness_mm(
            secondary.wall_insulation_thickness
        )
        aggregates["extension_1_wall_thickness_mm"] = secondary.wall_thickness_mm
        aggregates["extension_1_roof_construction"] = secondary.roof_construction
        aggregates["extension_1_roof_insulation_thickness_mm"] = _parse_thickness_mm(
            secondary.roof_insulation_thickness
        )
        sec_ground = _ground_floor(secondary)
        if sec_ground is not None:
            aggregates["extension_1_floor_construction"] = sec_ground.floor_construction
            aggregates["extension_1_floor_insulation"] = sec_ground.floor_insulation
        aggregates["extension_1_floor_insulation_thickness_mm"] = _parse_thickness_mm(
            secondary.floor_insulation_thickness
        )
        sec_floor_area = 0.0
        sec_hlp = 0.0
        # NOTE(review): the extension totals are written only when the part has
        # floor dimensions, so an extension without any keeps whatever value
        # was seeded above — confirm this nesting matches the intended layout.
        if secondary.sap_floor_dimensions:
            for fd in secondary.sap_floor_dimensions:
                sec_floor_area += fd.total_floor_area_m2
                sec_hlp += fd.heat_loss_perimeter_m
            aggregates["extension_1_total_floor_area_m2"] = sec_floor_area
            aggregates["extension_1_heat_loss_perimeter_m"] = sec_hlp

    # Anything beyond main + secondary just gets counted (extension chains, etc.).
    # Presence is tested with `is not None`, matching the checks above, so the
    # count never depends on how a building part evaluates in a boolean context.
    accounted_for = int(main is not None) + int(secondary is not None)
    aggregates["other_building_parts_count"] = max(0, len(parts) - accounted_for)

    return aggregates
|
||
|
||
|
||
def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]:
    """Summarise a list of windows as a flat dict of window-feature columns.

    Each window's area is `window_width * window_height`; every share and
    average below is weighted by that area.

    * With no windows the count, total area, per-orientation areas and
      glazed-type shares are 0, and every other share and average is None.
    * A window whose `orientation` is not an int key of `_OCTANT_NAMES` still
      counts towards the window count and total area, but towards no octant.
    * A window whose `glazing_type` is not an int in `_GLAZED_TYPE_CODES` is
      attributed to `window_pct_glazed_type_other`.
    * Averages cover only the windows that carry the relevant value
      (transmission details, an int `glazing_gap`, a non-None `frame_factor`)
      and stay None when no such window has any area.
    """
    area_by_octant: dict[str, float] = dict.fromkeys(_OCTANT_NAMES.values(), 0.0)
    area_by_glazing: dict[str, float] = {
        **{f"window_pct_glazed_type_{code}": 0.0 for code in _GLAZED_TYPE_CODES},
        "window_pct_glazed_type_other": 0.0,
    }
    result: dict[str, Any] = {
        "window_count": len(windows),
        "window_total_area_m2": 0.0,
        **{f"window_area_orientation_{octant}": 0.0 for octant in area_by_octant},
        "window_pct_draught_proofed": None,
        "window_avg_u_value": None,
        "window_avg_solar_transmittance": None,
        **area_by_glazing,
        "window_pct_pvc_frame": None,
        "window_pct_living": None,
        "window_pct_external": None,
        "window_pct_permanent_shutters": None,
        "window_avg_glazing_gap_mm": None,
        "window_avg_frame_factor": None,
        "window_pct_permanent_shutters_insulated": None,
    }
    if not windows:
        return result

    # Area accumulators: `*_sum` are plain areas, `*_weighted` are value * area.
    total = 0.0
    draught_sum = 0.0
    pvc_sum = 0.0
    living_sum = 0.0
    external_sum = 0.0
    shutter_sum = 0.0
    shutter_insulated_sum = 0.0
    transmission_sum = 0.0
    u_value_weighted = 0.0
    solar_weighted = 0.0
    gap_sum = 0.0
    gap_weighted = 0.0
    frame_factor_sum = 0.0
    frame_factor_weighted = 0.0

    for win in windows:
        pane = win.window_width * win.window_height
        total += pane

        if win.draught_proofed is True or win.draught_proofed == "true":
            draught_sum += pane
        if win.frame_material == "PVC":
            pvc_sum += pane
        if win.window_type == 1:  # living room
            living_sum += pane
        if win.window_location == 0:  # external (not conservatory)
            external_sum += pane
        if win.permanent_shutters_present is True or win.permanent_shutters_present == "Y":
            shutter_sum += pane
        if win.permanent_shutters_insulated == "Y":
            shutter_insulated_sum += pane

        if isinstance(win.glazing_gap, int):
            gap_sum += pane
            gap_weighted += float(win.glazing_gap) * pane
        if win.frame_factor is not None:
            frame_factor_sum += pane
            frame_factor_weighted += float(win.frame_factor) * pane

        if isinstance(win.orientation, int) and win.orientation in _OCTANT_NAMES:
            area_by_octant[_OCTANT_NAMES[win.orientation]] += pane

        if isinstance(win.glazing_type, int) and win.glazing_type in _GLAZED_TYPE_CODES:
            area_by_glazing[f"window_pct_glazed_type_{win.glazing_type}"] += pane
        else:
            area_by_glazing["window_pct_glazed_type_other"] += pane

        if win.window_transmission_details is not None:
            transmission_sum += pane
            u_value_weighted += win.window_transmission_details.u_value * pane
            solar_weighted += (
                win.window_transmission_details.solar_transmittance * pane
            )

    result["window_total_area_m2"] = total
    for octant, covered in area_by_octant.items():
        result[f"window_area_orientation_{octant}"] = covered

    if total > 0:
        result.update(
            {
                # NOTE(review): this share is scaled by 100 while the other
                # `window_pct_*` values are plain 0-1 fractions — presumably
                # deliberate; confirm before normalising.
                "window_pct_draught_proofed": draught_sum / total * 100.0,
                "window_pct_pvc_frame": pvc_sum / total,
                "window_pct_living": living_sum / total,
                "window_pct_external": external_sum / total,
                "window_pct_permanent_shutters": shutter_sum / total,
                "window_pct_permanent_shutters_insulated": (
                    shutter_insulated_sum / total
                ),
            }
        )
        # Turn the accumulated per-glazing-type areas into shares of the total.
        result.update(
            {column: covered / total for column, covered in area_by_glazing.items()}
        )

    if transmission_sum > 0:
        result["window_avg_u_value"] = u_value_weighted / transmission_sum
        result["window_avg_solar_transmittance"] = solar_weighted / transmission_sum
    if gap_sum > 0:
        result["window_avg_glazing_gap_mm"] = gap_weighted / gap_sum
    if frame_factor_sum > 0:
        result["window_avg_frame_factor"] = frame_factor_weighted / frame_factor_sum
    return result
|