"""EpcMlTransform — maps EpcPropertyData to ML-ready feature/target columns. The single ML-data contract between this repo and the AutoGluon training repo. Versioned semver-style: MAJOR on removing/renaming columns, MINOR on adding. At v0.1.0 the schema contract is fixed and the five directly-extractable targets are populated by `to_row()`. The UCL-corrected PEUI target and all feature columns are added in subsequent slices. See docs/adr/0007-kwh-as-ml-target.md for the target set and rationale. """ from typing import Any, Iterable, Optional import pandas as pd from datatypes.epc.domain.epc import Epc from datatypes.epc.domain.epc_property_data import ( BuildingPartIdentifier, EnergyElement, EpcPropertyData, SapBuildingPart, SapEnergySource, SapHeating, SapWindow, ) from domain.sap10_ml.demand import ( predicted_hot_water_kwh, predicted_lighting_kwh, predicted_space_heating_kwh, ) from domain.sap10_ml.ecf import ( predicted_ecf, predicted_log10_ecf, predicted_pv_generation_kwh, predicted_total_fuel_cost_gbp, ) from domain.sap10_ml.envelope import envelope_heat_loss_w_per_k from domain.sap10_ml.ventilation import ventilation_heat_loss_w_per_k from domain.sap10_ml.sap_efficiencies import seasonal_efficiency, water_heating_efficiency from domain.sap10_ml.schema import ColumnSpec, TransformSchema from domain.sap10_ml.ucl import apply_ucl_correction # SAP10 orientation codes: 1=N, 2=NE, 3=E, 4=SE, 5=S, 6=SW, 7=W, 8=NW. # Anything else (0, "NR", etc.) is treated as unrecorded — it contributes to # `window_count` and `window_total_area_m2` but to no octant. _OCTANT_NAMES: dict[int, str] = { 1: "N", 2: "NE", 3: "E", 4: "SE", 5: "S", 6: "SW", 7: "W", 8: "NW", } # SAP10 glazed_type enumeration (codes 1-15 per the gov api /api/codes export at # datatypes/epc/domain/epc_codes.csv, schema RdSAP-21.0.x). Anything outside this set # (the documentation "ND" sentinel, future codes, or unexpected strings) falls into # the `_other` bucket so share columns always sum to 1.0 of total window area. _GLAZED_TYPE_CODES: tuple[int, ...] = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) _FEATURE_COLUMNS: dict[str, ColumnSpec] = { # Geometry "total_floor_area_m2": ColumnSpec( dtype=float, nullable=False, description="Total floor area in square metres, from `total_floor_area`.", ), # Counts — directly populated by all SAP10 EPCs "door_count": ColumnSpec( dtype=int, nullable=False, description="Number of external doors." ), "habitable_rooms_count": ColumnSpec( dtype=int, nullable=False, description="Number of habitable rooms." ), "heated_rooms_count": ColumnSpec( dtype=int, nullable=False, description="Number of heated rooms." ), "wet_rooms_count": ColumnSpec( dtype=int, nullable=False, description="Number of wet rooms (bathrooms / WCs)." ), "extensions_count": ColumnSpec( dtype=int, nullable=False, description="Number of extensions beyond the main dwelling.", ), "open_chimneys_count": ColumnSpec( dtype=int, nullable=False, description="Number of open chimneys." ), "insulated_door_count": ColumnSpec( dtype=int, nullable=False, description="Number of external doors classed as insulated.", ), "cfl_fixed_lighting_bulbs_count": ColumnSpec( dtype=int, nullable=False, description="Number of CFL bulbs in fixed lighting outlets.", ), "led_fixed_lighting_bulbs_count": ColumnSpec( dtype=int, nullable=False, description="Number of LED bulbs in fixed lighting outlets.", ), "incandescent_fixed_lighting_bulbs_count": ColumnSpec( dtype=int, nullable=False, description="Number of incandescent bulbs in fixed lighting outlets.", ), # Booleans — directly populated by all SAP10 EPCs "solar_water_heating": ColumnSpec( dtype=bool, nullable=False, description="Solar water heating present." ), "has_hot_water_cylinder": ColumnSpec( dtype=bool, nullable=False, description="Hot water cylinder present." ), "has_fixed_air_conditioning": ColumnSpec( dtype=bool, nullable=False, description="Fixed air conditioning present." ), # Optional integer indicators — may be absent on older or partial certificates "percent_draughtproofed": ColumnSpec( dtype=int, nullable=True, description="Percentage of windows / doors with draught proofing.", ), # Categoricals — emitted as raw strings; downstream casts to pd.Categorical "dwelling_type": ColumnSpec( dtype=str, nullable=False, categorical=True, description="Free-form SAP dwelling-type description, e.g. 'Mid-terrace house'.", ), "transaction_type": ColumnSpec( dtype=str, nullable=False, categorical=True, description="SAP transaction type code, stringified int.", ), "property_type": ColumnSpec( dtype=str, nullable=True, categorical=True, description="SAP property type code, stringified int.", ), "built_form": ColumnSpec( dtype=str, nullable=True, categorical=True, description="SAP built-form code, stringified int.", ), "region_code": ColumnSpec( dtype=str, nullable=True, categorical=True, description="SAP region code (stringified int) — coarse climate / fuel-rate proxy.", ), "country_code": ColumnSpec( dtype=str, nullable=True, categorical=True, description="ISO-style country code, e.g. 'ENG', 'WAL', 'EAW'.", ), # Window aggregates — physics + orientation distribution "window_count": ColumnSpec( dtype=int, nullable=False, description="Number of windows." ), "window_total_area_m2": ColumnSpec( dtype=float, nullable=False, description="Total window area in square metres, summed across all windows.", ), **{ f"window_area_orientation_{name}": ColumnSpec( dtype=float, nullable=False, description=f"Total window area in m² facing {name} (SAP orientation code).", ) for name in _OCTANT_NAMES.values() }, "window_pct_draught_proofed": ColumnSpec( dtype=float, nullable=True, description="Area-weighted percentage of windows with draught proofing (0-100).", ), "window_avg_u_value": ColumnSpec( dtype=float, nullable=True, description="Area-weighted mean window U-value (W/m²K); null when no transmission details.", ), "window_avg_solar_transmittance": ColumnSpec( dtype=float, nullable=True, description="Area-weighted mean window solar transmittance; null when no transmission details.", ), # Window glazed_type categorical share columns (sum to 1.0 over total area when any windows present) **{ f"window_pct_glazed_type_{code}": ColumnSpec( dtype=float, nullable=False, description=f"Area share of windows with glazed_type {code} (0.0-1.0).", ) for code in _GLAZED_TYPE_CODES }, "window_pct_glazed_type_other": ColumnSpec( dtype=float, nullable=False, description="Area share of windows with glazed_type outside the SAP10 1-15 enum.", ), "window_pct_pvc_frame": ColumnSpec( dtype=float, nullable=True, description="Area share of windows with PVC frame; null when no windows.", ), # Building parts — cross-all-parts physical aggregates "building_parts_count": ColumnSpec( dtype=int, nullable=False, description="Number of sap_building_parts." ), "total_heat_loss_perimeter_m": ColumnSpec( dtype=float, nullable=False, description="Total heat-loss perimeter (m), summed across all floor dimensions.", ), "total_party_wall_length_m": ColumnSpec( dtype=float, nullable=False, description="Total party-wall length (m), summed across all floor dimensions.", ), "total_floor_area_from_parts_m2": ColumnSpec( dtype=float, nullable=False, description="Total floor area (m²) summed across sap_building_parts (sanity vs total_floor_area_m2).", ), "avg_room_height_m": ColumnSpec( dtype=float, nullable=True, description="Floor-area-weighted mean room height (m) across all floor dimensions.", ), # Building parts — Main Dwelling carve-out (none of these are populated if the # property has no part identified as 'Main Dwelling') "main_dwelling_heat_loss_perimeter_m": ColumnSpec( dtype=float, nullable=True, description="Heat-loss perimeter (m) for the Main Dwelling only.", ), "main_dwelling_party_wall_length_m": ColumnSpec( dtype=float, nullable=True, description="Party-wall length (m) for the Main Dwelling only.", ), "main_dwelling_total_floor_area_m2": ColumnSpec( dtype=float, nullable=True, description="Total floor area (m²) for the Main Dwelling only.", ), "main_dwelling_avg_room_height_m": ColumnSpec( dtype=float, nullable=True, description="Floor-area-weighted mean room height (m) for the Main Dwelling.", ), "main_dwelling_has_room_in_roof": ColumnSpec( dtype=bool, nullable=True, description="True if the Main Dwelling carries a sap_room_in_roof block.", ), "main_dwelling_construction_age_band": ColumnSpec( dtype=str, nullable=True, categorical=True, description="Main Dwelling construction age band (A-M, '0', or 'NR').", ), "main_dwelling_wall_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling wall construction SAP10 code.", ), "main_dwelling_roof_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling roof construction SAP10 code.", ), # Main Dwelling fabric inputs — wall, roof, floor (model retrofit simulation surface). "main_dwelling_wall_insulation_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling wall insulation type SAP10 code.", ), "main_dwelling_wall_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Main Dwelling wall insulation thickness in mm. 'NI' (no insulation) maps to 0.", ), "main_dwelling_wall_dry_lined": ColumnSpec( dtype=bool, nullable=True, description="Main Dwelling wall_dry_lined flag.", ), "main_dwelling_wall_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Main Dwelling external wall thickness in mm.", ), "main_dwelling_party_wall_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling party wall construction SAP10 code (str sentinels NA/NI -> None).", ), "main_dwelling_roof_insulation_location": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling roof insulation location SAP10 code (str sentinels -> None).", ), "main_dwelling_roof_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Main Dwelling roof insulation thickness in mm. 'NI' -> 0; non-numeric sentinels -> None.", ), "main_dwelling_floor_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling ground-floor construction SAP10 code (from sap_floor_dimensions[floor==0]).", ), "main_dwelling_floor_insulation": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling ground-floor insulation SAP10 code (from sap_floor_dimensions[floor==0]).", ), "main_dwelling_floor_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Main Dwelling floor insulation thickness in mm. 'NI' -> 0; non-numeric sentinels -> None.", ), "main_dwelling_floor_heat_loss": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Main Dwelling floor heat-loss SAP10 code.", ), # Heating — count of main heating systems (usually 1) "main_heating_count": ColumnSpec( dtype=int, nullable=False, description="Number of main heating systems declared on sap_heating.main_heating_details.", ), # Heating — primary (Top-1) slot from main_heating_details[0] "primary_main_fuel_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating main_fuel SAP10 code (per epc_codes.csv main_fuel enum).", ), "primary_heat_emitter_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating heat_emitter_type SAP10 code.", ), "primary_main_heating_control": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating main_heating_control SAP10 code.", ), "primary_main_heating_category": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating main_heating_category SAP10 code.", ), "primary_has_fghrs": ColumnSpec( dtype=bool, nullable=True, description="Primary heating has flue gas heat recovery system.", ), "primary_fan_flue_present": ColumnSpec( dtype=bool, nullable=True, description="Primary heating boiler has a fan flue.", ), "primary_boiler_flue_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating boiler flue type SAP10 code.", ), "primary_central_heating_pump_age": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating central-heating pump age band (SAP10 enum).", ), # Water heating — on sap_heating directly "water_heating_code": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Water heating SAP10 code.", ), "water_heating_fuel": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Water heating fuel SAP10 code (per epc_codes.csv water_heating_fuel enum).", ), "cylinder_size": ColumnSpec( dtype=int, nullable=True, description="Hot water cylinder size SAP10 code (1=small, 2=normal, 3=large).", ), "cylinder_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Hot water cylinder insulation thickness (mm).", ), # Secondary heating — present when secondary_fuel_type is set "has_secondary_heating": ColumnSpec( dtype=bool, nullable=False, description="True if sap_heating.secondary_fuel_type is populated.", ), "secondary_fuel_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Secondary heating fuel SAP10 code (shares main_fuel enum).", ), # PV — has-pv + measured-vs-estimated capacity + array aggregates "has_pv": ColumnSpec( dtype=bool, nullable=False, description="True if the property has any photovoltaic system (measured or estimated).", ), "pv_capacity_source": ColumnSpec( dtype=str, nullable=False, categorical=True, description=( "How PV capacity is known: 'measured' (per-array peak_power available), " "'estimated_from_roof_area' (only percent_roof_area), or 'none'." ), ), "pv_array_count": ColumnSpec( dtype=int, nullable=False, description="Number of measured PV arrays (0 unless capacity_source is 'measured').", ), "pv_total_peak_power_kw": ColumnSpec( dtype=float, nullable=False, description="Sum of peak_power (kW) across measured PV arrays.", ), **{ f"pv_peak_power_kw_{name}": ColumnSpec( dtype=float, nullable=False, description=( f"Sum of peak_power (kW) for measured PV arrays facing {name} " "(SAP orientation code)." ), ) for name in _OCTANT_NAMES.values() }, "pv_avg_pitch": ColumnSpec( dtype=float, nullable=True, description="Peak-power-weighted mean array pitch (SAP code); null when no measured arrays.", ), "pv_avg_overshading": ColumnSpec( dtype=float, nullable=True, description="Peak-power-weighted mean overshading (SAP code); null when no measured arrays.", ), "pv_percent_roof_area": ColumnSpec( dtype=int, nullable=True, description="Percent of roof covered by PV — populated only when capacity_source = 'estimated_from_roof_area'.", ), # PV battery, wind turbine, energy source flags "has_pv_battery": ColumnSpec( dtype=bool, nullable=False, description="True if the property has at least one PV battery.", ), "pv_battery_count": ColumnSpec( dtype=int, nullable=False, description="Number of PV batteries." ), "pv_battery_capacity_kwh": ColumnSpec( dtype=float, nullable=True, description=( "Total PV battery capacity (kWh) — pv_battery_count × per-unit capacity " "from sap_energy_source.pv_batteries. Null when count=0." ), ), "has_wind_turbine": ColumnSpec( dtype=bool, nullable=False, description="True if the property has at least one wind turbine.", ), "wind_turbine_count": ColumnSpec( dtype=int, nullable=False, description="Number of wind turbines." ), "mains_gas": ColumnSpec( dtype=bool, nullable=False, description="Property is connected to mains gas (strong fuel-deduction signal).", ), "electricity_smart_meter_present": ColumnSpec( dtype=bool, nullable=False, description="Electricity smart meter installed.", ), "gas_smart_meter_present": ColumnSpec( dtype=bool, nullable=False, description="Gas smart meter installed." ), "is_dwelling_export_capable": ColumnSpec( dtype=bool, nullable=False, description="Dwelling has an export-capable connection (eligible for SEG).", ), # Ventilation — flat fields direct off EpcPropertyData "mechanical_ventilation": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Mechanical ventilation SAP10 code (0=natural, 1-6 per epc_codes.csv enum).", ), "mechanical_vent_duct_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Mechanical ventilation duct type SAP10 code.", ), "blocked_chimneys_count": ColumnSpec( dtype=int, nullable=True, description="Number of blocked / capped-off chimneys.", ), "pressure_test": ColumnSpec( dtype=int, nullable=True, description="Air-tightness pressure-test SAP10 code.", ), # Dwelling-level fabric + demand inputs. "multiple_glazed_proportion": ColumnSpec( dtype=int, nullable=True, description="Percent of glazed area that is multiple-glazed.", ), "number_baths": ColumnSpec( dtype=int, nullable=True, description="Number of baths declared on sap_heating (hot-water demand proxy).", ), "number_baths_wwhrs": ColumnSpec( dtype=int, nullable=True, description="Number of baths served by a WWHRS unit.", ), "extract_fans_count": ColumnSpec( dtype=int, nullable=True, description="Number of extract fans (ventilation/heat-loss proxy).", ), # Heating — heating-system identity + flow temp + multi-system fraction. "primary_sap_main_heating_code": ColumnSpec( dtype=int, nullable=True, categorical=True, description="SAP10 main heating type code (canonical heating-system enum).", ), "primary_emitter_temperature": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Primary heating emitter temperature class (0=standard, 1=low-temp).", ), "primary_main_heating_fraction": ColumnSpec( dtype=float, nullable=True, description="Fraction of space heating delivered by the primary main heating system.", ), # Hot water — immersion type + presence of shower outlet block. "immersion_heating_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Electric immersion heater type SAP10 code.", ), "shower_outlet_count": ColumnSpec( dtype=int, nullable=False, description="1 if any shower_outlet block is declared on sap_heating, else 0.", ), # Windows — per-window-type share aggregates. "window_pct_living": ColumnSpec( dtype=float, nullable=True, description="Area share of windows with window_type == 1 (living room).", ), "window_pct_external": ColumnSpec( dtype=float, nullable=True, description="Area share of windows with window_location == 0 (external).", ), "window_pct_permanent_shutters": ColumnSpec( dtype=float, nullable=True, description="Area share of windows with permanent_shutters_present truthy.", ), # Dwelling — conservatory + flat-only block. "conservatory_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Conservatory SAP10 code (1=none, 2=heated, 3=unheated, ...).", ), "has_heated_separate_conservatory": ColumnSpec( dtype=bool, nullable=True, description="Whether the dwelling has a heated separate conservatory.", ), "flat_level": ColumnSpec( dtype=int, nullable=True, description="Flat-only: floor number on which the flat sits.", ), "flat_top_storey": ColumnSpec( dtype=str, nullable=True, categorical=True, description="Flat-only: Y/N flag indicating whether this is the top storey.", ), "flat_storey_count": ColumnSpec( dtype=int, nullable=True, description="Flat-only: storey count of the building containing the flat.", ), "flat_location": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Flat-only: location SAP10 code (corner/middle/...).", ), "flat_heat_loss_corridor": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Flat-only: heat-loss-corridor SAP10 code.", ), # Energy supply categoricals. "meter_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Electricity meter type SAP10 code (1=Standard, 2=Off-peak, ...).", ), "pv_connection": ColumnSpec( dtype=int, nullable=True, categorical=True, description="PV connection topology SAP10 code.", ), "wind_turbines_terrain_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Wind-turbine terrain type SAP10 code.", ), # Doors. "draughtproofed_door_count": ColumnSpec( dtype=int, nullable=True, description="Number of draught-proofed doors.", ), "insulated_door_u_value": ColumnSpec( dtype=float, nullable=True, description="U-value of insulated doors (W/m^2K).", ), # Hot water extras. "cylinder_insulation_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Cylinder insulation type SAP10 code (string sentinels -> None).", ), "cylinder_thermostat": ColumnSpec( dtype=str, nullable=True, categorical=True, description="Cylinder-thermostat flag (Y/N/missing).", ), "secondary_heating_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Secondary heating type SAP10 code (distinct from secondary_fuel_type).", ), # Mechanical ventilation extras. "mechanical_vent_duct_placement": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Mechanical-vent duct placement SAP10 code.", ), "mechanical_vent_duct_insulation": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Mechanical-vent duct insulation SAP10 code.", ), "mechanical_vent_duct_insulation_level": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Mechanical-vent duct insulation level SAP10 code.", ), "mechanical_vent_measured_installation": ColumnSpec( dtype=bool, nullable=True, description="Whether mechanical ventilation was measured at installation.", ), # Lighting extras. "low_energy_fixed_lighting_bulbs_count": ColumnSpec( dtype=int, nullable=True, description="Number of low-energy fixed-lighting bulbs (separate from CFL/LED).", ), "fixed_lighting_outlets_count": ColumnSpec( dtype=int, nullable=True, description="Total number of fixed-lighting outlets.", ), "low_energy_fixed_lighting_outlets_count": ColumnSpec( dtype=int, nullable=True, description="Number of low-energy fixed-lighting outlets.", ), # Window extras (per-window scalars area-weighted across windows). "window_avg_glazing_gap_mm": ColumnSpec( dtype=float, nullable=True, description="Area-weighted average glazing gap in mm (non-numeric sentinels excluded).", ), "window_avg_frame_factor": ColumnSpec( dtype=float, nullable=True, description="Area-weighted average frame factor across windows.", ), "window_pct_permanent_shutters_insulated": ColumnSpec( dtype=float, nullable=True, description="Area share of windows with permanent_shutters_insulated == 'Y'.", ), # Main-dwelling extras: room-in-roof + alternative walls + flat-roof + measured flag. "main_dwelling_room_in_roof_floor_area_m2": ColumnSpec( dtype=float, nullable=True, description="Floor area of main dwelling room-in-roof block (when present).", ), "main_dwelling_alternative_wall_count": ColumnSpec( dtype=int, nullable=False, description="Number of sap_alternative_wall_* blocks on the main dwelling (0-2).", ), "main_dwelling_alternative_wall_area_m2": ColumnSpec( dtype=float, nullable=False, description="Sum of sap_alternative_wall_*.wall_area for the main dwelling.", ), "main_dwelling_flat_roof_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Main dwelling flat-roof insulation thickness in mm (rare).", ), "main_dwelling_wall_thickness_measured": ColumnSpec( dtype=bool, nullable=True, description="Main dwelling wall_thickness_measured flag.", ), # Element list counts (split-fabric discriminator). "wall_count": ColumnSpec( dtype=int, nullable=False, description="Number of entries in the top-level walls EnergyElement list.", ), "roof_count": ColumnSpec( dtype=int, nullable=False, description="Number of entries in the top-level roofs EnergyElement list.", ), "floor_count": ColumnSpec( dtype=int, nullable=False, description="Number of entries in the top-level floors EnergyElement list.", ), "main_heating_count_elements": ColumnSpec( dtype=int, nullable=False, description="Number of entries in the top-level main_heating EnergyElement list.", ), "main_heating_controls_present": ColumnSpec( dtype=bool, nullable=False, description="Whether the cert carries a main_heating_controls EnergyElement.", ), # Wind turbine geometry. "wind_turbine_hub_height_m": ColumnSpec( dtype=float, nullable=True, description="Hub height of the (first) wind turbine, metres.", ), "wind_turbine_rotor_diameter_m": ColumnSpec( dtype=float, nullable=True, description="Rotor diameter of the (first) wind turbine, metres.", ), # Flat extras. "flat_unheated_corridor_length_m": ColumnSpec( dtype=int, nullable=True, description="Flat-only: length (m) of any unheated corridor adjacent to the dwelling.", ), # Addendum (~43% present). "addendum_stone_walls": ColumnSpec( dtype=bool, nullable=True, description="Addendum: stone-wall construction flagged by assessor.", ), "addendum_system_build": ColumnSpec( dtype=bool, nullable=True, description="Addendum: system-build construction flagged by assessor.", ), "addendum_numbers_count": ColumnSpec( dtype=int, nullable=False, description="Number of addendum codes flagged.", ), # Low-carbon energy sources. "lzc_energy_sources_count": ColumnSpec( dtype=int, nullable=False, description="Number of LZC energy-source codes declared (0 if none).", ), # Extension 1 (first non-main building part; ~36% of certs). "extension_1_present": ColumnSpec( dtype=bool, nullable=False, description="True if there is a building part beyond the Main Dwelling.", ), "extension_1_wall_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Extension 1 wall construction SAP10 code.", ), "extension_1_wall_insulation_type": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Extension 1 wall insulation type SAP10 code.", ), "extension_1_wall_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Extension 1 wall insulation thickness in mm.", ), "extension_1_wall_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Extension 1 external wall thickness in mm.", ), "extension_1_roof_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Extension 1 roof construction SAP10 code.", ), "extension_1_roof_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Extension 1 roof insulation thickness in mm.", ), "extension_1_floor_construction": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Extension 1 ground-floor construction SAP10 code.", ), "extension_1_floor_insulation": ColumnSpec( dtype=int, nullable=True, categorical=True, description="Extension 1 ground-floor insulation SAP10 code.", ), "extension_1_floor_insulation_thickness_mm": ColumnSpec( dtype=int, nullable=True, description="Extension 1 floor insulation thickness in mm.", ), "extension_1_total_floor_area_m2": ColumnSpec( dtype=float, nullable=True, description="Extension 1 total floor area (sum of its sap_floor_dimensions).", ), "extension_1_heat_loss_perimeter_m": ColumnSpec( dtype=float, nullable=True, description="Extension 1 heat-loss perimeter (sum of its sap_floor_dimensions).", ), "other_building_parts_count": ColumnSpec( dtype=int, nullable=False, description="Number of building parts beyond Main Dwelling and the secondary part.", ), "envelope_heat_loss_w_per_k": ColumnSpec( dtype=float, nullable=False, description=( "Sum of U*A over walls / roof / floor / party walls / windows / doors " "plus thermal-bridging factor y times total exposed area, summed across " "every sap_building_part. U-values cascade-default per ADR-0008 so the " "feature is never null. Approximates the SAP10.2 worksheet's envelope " "conduction loss in W/K." ), ), "ventilation_heat_loss_w_per_k": ColumnSpec( dtype=float, nullable=False, description=( "SAP10.2 §C ventilation heat-loss in W/K from structural infiltration " "(0.35 ACH masonry / 0.25 ACH timber) plus open chimneys (40 m³/h each) " "minus draught-proofing reduction (0.05 max × window DP share), all " "multiplied by dwelling volume × 0.33. Captures the infiltration share " "of total heat loss that envelope_heat_loss_w_per_k misses. ADR-0008." ), ), "seasonal_efficiency_main_heating": ColumnSpec( dtype=float, nullable=False, description=( "Space-heating seasonal efficiency as a decimal (e.g. 0.84 = 84%), " "from SAP10.2 Table 4a/4b keyed on primary_sap_main_heating_code. " "Unknown codes fall back to 0.80 (gas-boiler typical). ADR-0008." ), ), "seasonal_efficiency_water_heating": ColumnSpec( dtype=float, nullable=False, description=( "Water-heating seasonal efficiency as a decimal. Code 901 ('from main') " "inherits the main code's efficiency; unknown -> 0.78 (gas-combi). " "ADR-0008." ), ), "predicted_space_heating_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Crude annual delivered space-heating kWh: envelope_heat_loss_w_per_k * " "HDH_region * 1e-3 / seasonal_efficiency_main_heating. HDH from a 22-row " "SAP-region lookup; UK average ~53,000 K*h/yr. ADR-0008." ), ), "predicted_hot_water_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Crude annual delivered hot-water kWh from SAP10.2 Appendix J simplified: " "occupancy from TFA, daily volume 25*N+36 L, delta-T 43 K, +10% losses, " "divided by water-heating efficiency. ADR-0008." ), ), "predicted_lighting_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Crude annual lighting kWh from SAP10.2 Section L simplified: " "9.3 * TFA reduced by 50% LED share + 40% CFL share. ADR-0008." ), ), "predicted_pv_generation_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Annual PV generation kWh: pv_total_peak_power_kw * yield_factor " "(SAP10.2 Table 6e region-keyed; UK avg 850 kWh/kWp/yr). " "Subtracted from predicted_total_fuel_cost at the standard " "electricity rate per SAP10 §13 (slice 17a)." ), ), "predicted_total_fuel_cost_gbp": ColumnSpec( dtype=float, nullable=False, description=( "Annual regulated fuel cost (gbp/yr): space + DHW + lighting kWh " "multiplied by Table 32 unit prices. Standing charges omitted " "(approximately a constant fuel-mix offset the model can learn). " "ADR-0008 '+ Lighting' scope." ), ), "predicted_ecf": ColumnSpec( dtype=float, nullable=False, description=( "SAP10 §20.1 Energy Cost Factor: 0.42 * predicted_total_fuel_cost / " "(TFA + 45). SAP score is a piecewise log/linear function of ECF. " "ADR-0008." ), ), "predicted_log10_ecf": ColumnSpec( dtype=float, nullable=False, description=( "log10 of predicted_ecf. Monotone with sap_score so a tree-based " "model can use this as a near-target feature; the SAP rating's " "piecewise kink at ECF=3.5 is one further split. ADR-0008." ), ), } _TARGET_COLUMNS: dict[str, ColumnSpec] = { "sap_score": ColumnSpec( dtype=int, nullable=False, description="SAP10 energy rating, from `energy_rating_current` on the EPC.", ), "co2_emissions": ColumnSpec( dtype=float, nullable=False, description="Annual CO2 emissions in tonnes/yr, from `co2_emissions_current`.", ), "peui_raw": ColumnSpec( dtype=int, nullable=False, description=( "Primary energy intensity (kWh/m2/yr), from `energy_consumption_current`, " "untransformed." ), ), "peui_ucl": ColumnSpec( dtype=float, nullable=False, description=( "Primary energy intensity (kWh/m2/yr) with Few et al. 2023 per-band UCL " "correction folded into the training label (ADR-0007)." ), ), "space_heating_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Annual space heating delivered kWh, from " "`renewable_heat_incentive.space_heating_existing_dwelling`." ), ), "hot_water_kwh": ColumnSpec( dtype=float, nullable=False, description=( "Annual hot water delivered kWh, from `renewable_heat_incentive.water_heating`." ), ), } class EpcMlTransform: """Maps an EpcPropertyData to a fixed-width row of ML features + targets. Version 0.1.0 — schema contract only; feature columns added in subsequent slices. """ VERSION: str = "2.7.1" def schema(self) -> TransformSchema: """The cross-repo ML data contract. Returns the column manifest the AutoGluon repo reads to know which columns are features, which are targets, and their dtypes. """ return TransformSchema( transform_version=self.VERSION, feature_columns=dict(_FEATURE_COLUMNS), target_columns=dict(_TARGET_COLUMNS), ) def to_rows(self, properties: Iterable[EpcPropertyData]) -> pd.DataFrame: """Apply `to_row` across many properties and return a typed DataFrame. Columns flagged `categorical=True` in the schema are cast to `pd.Categorical`; everything else is left at pandas-inferred dtype. The DataFrame always carries every advertised column, even when the input is empty. """ schema = self.schema() all_columns = list(schema.feature_columns.keys()) + list( schema.target_columns.keys() ) rows = [self.to_row(epc) for epc in properties] df = pd.DataFrame(rows, columns=all_columns) for name, spec in schema.feature_columns.items(): if spec.categorical: df[name] = df[name].astype("category") for name, spec in schema.target_columns.items(): if spec.categorical: df[name] = df[name].astype("category") return df def to_row(self, epc: EpcPropertyData) -> dict[str, Any]: """Map an EpcPropertyData to a single row of features + targets. v0.1.0 populates the six targets. Feature columns land in later slices. """ rhi = epc.renewable_heat_incentive window_aggregates = _window_aggregates(epc.sap_windows) building_part_aggregates = _building_part_aggregates(epc.sap_building_parts) heating_aggregates = _heating_aggregates(epc.sap_heating) pv_aggregates = _pv_aggregates(epc.sap_energy_source) energy_source_other = _energy_source_other_aggregates(epc.sap_energy_source) envelope_w_per_k = envelope_heat_loss_w_per_k( sap_building_parts=epc.sap_building_parts, country_code=epc.country_code, window_total_area_m2=float(window_aggregates.get("window_total_area_m2") or 0.0), window_avg_u_value=window_aggregates.get("window_avg_u_value"), door_count=epc.door_count, insulated_door_count=epc.insulated_door_count, insulated_door_u_value=epc.insulated_door_u_value, roof_description=_joined_descriptions(epc.roofs), wall_description=_joined_descriptions(epc.walls), ) main_wall_con = building_part_aggregates.get("main_dwelling_wall_construction") is_timber_frame = isinstance(main_wall_con, int) and main_wall_con in (5, 6) avg_room_h = building_part_aggregates.get("avg_room_height_m") window_dp_pct = window_aggregates.get("window_pct_draught_proofed") ventilation_w_per_k = ventilation_heat_loss_w_per_k( total_floor_area_m2=epc.total_floor_area_m2, avg_room_height_m=float(avg_room_h) if isinstance(avg_room_h, (int, float)) else 2.5, is_timber_frame=is_timber_frame, open_chimneys_count=epc.open_chimneys_count, window_pct_draught_proofed=float(window_dp_pct) if isinstance(window_dp_pct, (int, float)) else None, ) main_heating_code = heating_aggregates.get("primary_sap_main_heating_code") water_code = heating_aggregates.get("water_heating_code") main_category = heating_aggregates.get("primary_main_heating_category") main_fuel = heating_aggregates.get("primary_main_fuel_type") space_eff = seasonal_efficiency( main_heating_code if isinstance(main_heating_code, int) else None, main_heating_category=main_category if isinstance(main_category, int) else None, main_fuel_type=main_fuel if isinstance(main_fuel, int) else None, ) water_eff = water_heating_efficiency( water_heating_code=water_code if isinstance(water_code, int) else None, main_heating_code=main_heating_code if isinstance(main_heating_code, int) else None, ) pred_space_kwh = predicted_space_heating_kwh( envelope_heat_loss_w_per_k=envelope_w_per_k, region_code=epc.region_code, seasonal_efficiency_main=space_eff, ventilation_heat_loss_w_per_k=ventilation_w_per_k, ) cylinder_size_val = heating_aggregates.get("cylinder_size") cylinder_ins_thk = heating_aggregates.get("cylinder_insulation_thickness_mm") cylinder_ins_type = heating_aggregates.get("cylinder_insulation_type") main_age = building_part_aggregates.get("main_dwelling_construction_age_band") pred_hw_kwh = predicted_hot_water_kwh( total_floor_area_m2=epc.total_floor_area_m2, seasonal_efficiency_water=water_eff, cylinder_size=cylinder_size_val if isinstance(cylinder_size_val, int) else None, cylinder_insulation_thickness_mm=cylinder_ins_thk if isinstance(cylinder_ins_thk, int) else None, cylinder_insulation_type=cylinder_ins_type if isinstance(cylinder_ins_type, int) else None, age_band=main_age if isinstance(main_age, str) else None, has_wwhrs=bool(epc.sap_heating.number_baths_wwhrs and epc.sap_heating.number_baths_wwhrs > 0), has_solar_water_heating=epc.solar_water_heating, ) pred_light_kwh = predicted_lighting_kwh( total_floor_area_m2=epc.total_floor_area_m2, cfl_count=epc.cfl_fixed_lighting_bulbs_count, led_count=epc.led_fixed_lighting_bulbs_count, incandescent_count=epc.incandescent_fixed_lighting_bulbs_count, ) main_fuel_code = heating_aggregates.get("primary_main_fuel_type") water_fuel_code = heating_aggregates.get("water_heating_fuel") pv_kw = pv_aggregates.get("pv_total_peak_power_kw") or 0.0 pred_pv_kwh = predicted_pv_generation_kwh( pv_total_peak_power_kw=float(pv_kw), region_code=epc.region_code, ) pred_cost = predicted_total_fuel_cost_gbp( predicted_space_heating_kwh=pred_space_kwh, predicted_hot_water_kwh=pred_hw_kwh, predicted_lighting_kwh=pred_light_kwh, main_fuel_code=main_fuel_code if isinstance(main_fuel_code, int) else None, water_heating_fuel_code=water_fuel_code if isinstance(water_fuel_code, int) else None, predicted_pv_kwh=pred_pv_kwh, ) pred_ecf_v = predicted_ecf( predicted_total_cost_gbp=pred_cost, total_floor_area_m2=epc.total_floor_area_m2, ) pred_log10_ecf_v = predicted_log10_ecf(pred_ecf_v) return { # Features — geometry "total_floor_area_m2": epc.total_floor_area_m2, # Features — counts "door_count": epc.door_count, "habitable_rooms_count": epc.habitable_rooms_count, "heated_rooms_count": epc.heated_rooms_count, "wet_rooms_count": epc.wet_rooms_count, "extensions_count": epc.extensions_count, "open_chimneys_count": epc.open_chimneys_count, "insulated_door_count": epc.insulated_door_count, "cfl_fixed_lighting_bulbs_count": epc.cfl_fixed_lighting_bulbs_count, "led_fixed_lighting_bulbs_count": epc.led_fixed_lighting_bulbs_count, "incandescent_fixed_lighting_bulbs_count": epc.incandescent_fixed_lighting_bulbs_count, # Features — booleans "solar_water_heating": epc.solar_water_heating, "has_hot_water_cylinder": epc.has_hot_water_cylinder, "has_fixed_air_conditioning": epc.has_fixed_air_conditioning, # Features — optional integer indicators "percent_draughtproofed": epc.percent_draughtproofed, # Features — categoricals (raw strings; cast at parquet write time) "dwelling_type": epc.dwelling_type, "transaction_type": epc.transaction_type, "property_type": epc.property_type, "built_form": epc.built_form, "region_code": epc.region_code, "country_code": epc.country_code, # Features — window aggregates (physics + orientation) **window_aggregates, # Features — building parts aggregates + Main Dwelling carve-out **building_part_aggregates, # Features — engineered physics (ADR-0008) "envelope_heat_loss_w_per_k": envelope_w_per_k, "ventilation_heat_loss_w_per_k": ventilation_w_per_k, "seasonal_efficiency_main_heating": space_eff, "seasonal_efficiency_water_heating": water_eff, "predicted_space_heating_kwh": pred_space_kwh, "predicted_hot_water_kwh": pred_hw_kwh, "predicted_lighting_kwh": pred_light_kwh, "predicted_pv_generation_kwh": pred_pv_kwh, "predicted_total_fuel_cost_gbp": pred_cost, "predicted_ecf": pred_ecf_v, "predicted_log10_ecf": pred_log10_ecf_v, # Features — heating system (primary slot + water + secondary) **heating_aggregates, # Features — PV (capacity source + array aggregates by SAP octant) **pv_aggregates, # Features — battery, wind turbine, mains gas + smart meter flags **energy_source_other, # Features — ventilation "mechanical_ventilation": epc.mechanical_ventilation, "mechanical_vent_duct_type": epc.mechanical_vent_duct_type, "blocked_chimneys_count": epc.blocked_chimneys_count, "pressure_test": epc.pressure_test, # Features — dwelling-level fabric + demand scalars "multiple_glazed_proportion": epc.multiple_glazed_proportion, "number_baths": epc.sap_heating.number_baths, "number_baths_wwhrs": epc.sap_heating.number_baths_wwhrs, "extract_fans_count": epc.extract_fans_count, # Features — conservatory + flat-only block "conservatory_type": epc.conservatory_type, "has_heated_separate_conservatory": epc.has_heated_separate_conservatory, "flat_level": ( _int_or_none(epc.sap_flat_details.level) if epc.sap_flat_details else None ), "flat_top_storey": ( epc.sap_flat_details.top_storey if epc.sap_flat_details else None ), "flat_storey_count": ( _int_or_none(epc.sap_flat_details.storey_count) if epc.sap_flat_details else None ), "flat_location": ( _int_or_none(epc.sap_flat_details.flat_location) if epc.sap_flat_details else None ), "flat_heat_loss_corridor": ( _int_or_none(epc.sap_flat_details.heat_loss_corridor) if epc.sap_flat_details else None ), # Features — energy supply categoricals "meter_type": _meter_type_int(epc.sap_energy_source.meter_type), "pv_connection": epc.sap_energy_source.pv_connection, "wind_turbines_terrain_type": _wind_terrain_int(epc.sap_energy_source.wind_turbines_terrain_type), # Features — doors "draughtproofed_door_count": epc.draughtproofed_door_count, "insulated_door_u_value": epc.insulated_door_u_value, # Features — hot water extras "cylinder_insulation_type": _int_or_none(epc.sap_heating.cylinder_insulation_type), "cylinder_thermostat": epc.sap_heating.cylinder_thermostat, "secondary_heating_type": _int_or_none(epc.sap_heating.secondary_heating_type), # Features — mechanical ventilation extras "mechanical_vent_duct_placement": epc.mechanical_vent_duct_placement, "mechanical_vent_duct_insulation": epc.mechanical_vent_duct_insulation, "mechanical_vent_duct_insulation_level": epc.mechanical_vent_duct_insulation_level, "mechanical_vent_measured_installation": _truthy_yn(epc.mechanical_vent_measured_installation), # Features — lighting extras "low_energy_fixed_lighting_bulbs_count": epc.low_energy_fixed_lighting_bulbs_count, "fixed_lighting_outlets_count": epc.fixed_lighting_outlets_count, "low_energy_fixed_lighting_outlets_count": epc.low_energy_fixed_lighting_outlets_count, # Features — element list counts (split-fabric discriminators) "wall_count": len(epc.walls), "roof_count": len(epc.roofs), "floor_count": len(epc.floors), "main_heating_count_elements": len(epc.main_heating), "main_heating_controls_present": epc.main_heating_controls is not None, # Features — wind turbine geometry "wind_turbine_hub_height_m": ( epc.sap_energy_source.wind_turbine_details.hub_height if epc.sap_energy_source.wind_turbine_details is not None else None ), "wind_turbine_rotor_diameter_m": ( epc.sap_energy_source.wind_turbine_details.rotor_diameter if epc.sap_energy_source.wind_turbine_details is not None else None ), # Features — flat unheated corridor length "flat_unheated_corridor_length_m": ( epc.sap_flat_details.unheated_corridor_length_m if epc.sap_flat_details is not None else None ), # Features — addendum + LZC "addendum_stone_walls": ( epc.addendum.stone_walls if epc.addendum is not None else None ), "addendum_system_build": ( epc.addendum.system_build if epc.addendum is not None else None ), "addendum_numbers_count": ( len(epc.addendum.addendum_numbers) if epc.addendum is not None and epc.addendum.addendum_numbers is not None else 0 ), "lzc_energy_sources_count": ( len(epc.lzc_energy_sources) if epc.lzc_energy_sources is not None else 0 ), # Targets "sap_score": epc.energy_rating_current, "co2_emissions": epc.co2_emissions_current, "peui_raw": epc.energy_consumption_current, "peui_ucl": _peui_ucl(epc), "space_heating_kwh": rhi.space_heating_kwh if rhi is not None else None, "hot_water_kwh": rhi.water_heating_kwh if rhi is not None else None, } def _peui_ucl(epc: EpcPropertyData) -> Optional[float]: """Apply the Few et al. per-band UCL correction to PEUI for training labels. Returns None when: - either the raw PEUI or the SAP score is missing, or - the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI) so the UCL correction is undefined. Those rows are unusable as `peui_ucl` training labels and should be dropped upstream rather than crashing the transform. """ if epc.energy_consumption_current is None or epc.energy_rating_current is None: return None peui_raw = float(epc.energy_consumption_current) if peui_raw <= 0: return None band = Epc.from_sap_score(epc.energy_rating_current) return apply_ucl_correction(peui_raw, band) def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]: """Aggregate the PV side of sap_energy_source into 15 columns. `pv_capacity_source` discriminates the three PV states: - 'measured': es.photovoltaic_arrays is non-empty — array aggregates populate - 'estimated_from_roof_area': only percent_roof_area > 0 is known - 'none': no PV (either no payload, or percent_roof_area == 0) """ octant_power: dict[str, float] = {name: 0.0 for name in _OCTANT_NAMES.values()} aggregates: dict[str, Any] = { "has_pv": False, "pv_capacity_source": "none", "pv_array_count": 0, "pv_total_peak_power_kw": 0.0, **{f"pv_peak_power_kw_{name}": 0.0 for name in _OCTANT_NAMES.values()}, "pv_avg_pitch": None, "pv_avg_overshading": None, "pv_percent_roof_area": None, } arrays = es.photovoltaic_arrays if arrays: total_power = 0.0 weighted_pitch = 0.0 weighted_overshading = 0.0 for a in arrays: total_power += a.peak_power weighted_pitch += a.pitch * a.peak_power weighted_overshading += a.overshading * a.peak_power if a.orientation in _OCTANT_NAMES: octant_power[_OCTANT_NAMES[a.orientation]] += a.peak_power aggregates["has_pv"] = True aggregates["pv_capacity_source"] = "measured" aggregates["pv_array_count"] = len(arrays) aggregates["pv_total_peak_power_kw"] = total_power for name, power in octant_power.items(): aggregates[f"pv_peak_power_kw_{name}"] = power if total_power > 0: aggregates["pv_avg_pitch"] = weighted_pitch / total_power aggregates["pv_avg_overshading"] = weighted_overshading / total_power return aggregates supply = es.photovoltaic_supply if supply is not None and supply.none_or_no_details.percent_roof_area > 0: aggregates["has_pv"] = True aggregates["pv_capacity_source"] = "estimated_from_roof_area" aggregates["pv_percent_roof_area"] = supply.none_or_no_details.percent_roof_area return aggregates def _energy_source_other_aggregates(es: SapEnergySource) -> dict[str, Any]: """Pull battery, wind turbine, and household energy source flags. Battery capacity multiplies pv_battery_count by the per-unit capacity carried on pv_batteries.pv_battery; null when no battery is present. """ battery_capacity_kwh: Optional[float] = None if es.pv_battery_count > 0 and es.pv_batteries is not None: battery_capacity_kwh = ( es.pv_battery_count * es.pv_batteries.pv_battery.battery_capacity ) return { "has_pv_battery": es.pv_battery_count > 0, "pv_battery_count": es.pv_battery_count, "pv_battery_capacity_kwh": battery_capacity_kwh, "has_wind_turbine": es.wind_turbines_count > 0, "wind_turbine_count": es.wind_turbines_count, "mains_gas": es.mains_gas, "electricity_smart_meter_present": es.electricity_smart_meter_present, "gas_smart_meter_present": es.gas_smart_meter_present, "is_dwelling_export_capable": es.is_dwelling_export_capable, } def _heating_aggregates(sap_heating: SapHeating) -> dict[str, Any]: """Aggregate sap_heating into 15 heating-feature columns. Hybrid Top-1: the primary heating slot comes from `main_heating_details[0]`; water heating fields read directly off `sap_heating`; secondary heating is inferred from `secondary_fuel_type`. Fields are Union[int, str] in the domain object — Union-int values pass through as int categoricals; str values (from site notes) coerce to None. """ shower_outlets = sap_heating.shower_outlets shower_outlet_count = 1 if shower_outlets is not None else 0 aggregates: dict[str, Any] = { "main_heating_count": len(sap_heating.main_heating_details), "primary_main_fuel_type": None, "primary_heat_emitter_type": None, "primary_main_heating_control": None, "primary_main_heating_category": None, "primary_has_fghrs": None, "primary_fan_flue_present": None, "primary_boiler_flue_type": None, "primary_central_heating_pump_age": None, "primary_sap_main_heating_code": None, "primary_emitter_temperature": None, "primary_main_heating_fraction": None, "water_heating_code": sap_heating.water_heating_code, "water_heating_fuel": sap_heating.water_heating_fuel, "cylinder_size": ( sap_heating.cylinder_size if isinstance(sap_heating.cylinder_size, int) else None ), "cylinder_insulation_thickness_mm": sap_heating.cylinder_insulation_thickness_mm, "has_secondary_heating": sap_heating.secondary_fuel_type is not None, "secondary_fuel_type": sap_heating.secondary_fuel_type, "immersion_heating_type": _int_or_none(sap_heating.immersion_heating_type), "shower_outlet_count": shower_outlet_count, } if sap_heating.main_heating_details: primary = sap_heating.main_heating_details[0] aggregates["primary_main_fuel_type"] = ( primary.main_fuel_type if isinstance(primary.main_fuel_type, int) else None ) aggregates["primary_heat_emitter_type"] = ( primary.heat_emitter_type if isinstance(primary.heat_emitter_type, int) else None ) aggregates["primary_main_heating_control"] = ( primary.main_heating_control if isinstance(primary.main_heating_control, int) else None ) aggregates["primary_main_heating_category"] = primary.main_heating_category aggregates["primary_has_fghrs"] = primary.has_fghrs aggregates["primary_fan_flue_present"] = primary.fan_flue_present aggregates["primary_boiler_flue_type"] = primary.boiler_flue_type aggregates["primary_central_heating_pump_age"] = ( primary.central_heating_pump_age ) aggregates["primary_sap_main_heating_code"] = primary.sap_main_heating_code aggregates["primary_emitter_temperature"] = _int_or_none(primary.emitter_temperature) aggregates["primary_main_heating_fraction"] = primary.main_heating_fraction return aggregates _MAIN_DWELLING_FABRIC_COLUMNS = ( "main_dwelling_wall_insulation_type", "main_dwelling_wall_insulation_thickness_mm", "main_dwelling_wall_dry_lined", "main_dwelling_wall_thickness_mm", "main_dwelling_party_wall_construction", "main_dwelling_roof_insulation_location", "main_dwelling_roof_insulation_thickness_mm", "main_dwelling_floor_construction", "main_dwelling_floor_insulation", "main_dwelling_floor_insulation_thickness_mm", "main_dwelling_floor_heat_loss", "main_dwelling_room_in_roof_floor_area_m2", "main_dwelling_flat_roof_insulation_thickness_mm", "main_dwelling_wall_thickness_measured", ) _SECONDARY_DWELLING_FABRIC_COLUMNS = ( "extension_1_wall_construction", "extension_1_wall_insulation_type", "extension_1_wall_insulation_thickness_mm", "extension_1_wall_thickness_mm", "extension_1_roof_construction", "extension_1_roof_insulation_thickness_mm", "extension_1_floor_construction", "extension_1_floor_insulation", "extension_1_floor_insulation_thickness_mm", "extension_1_total_floor_area_m2", "extension_1_heat_loss_perimeter_m", ) def _parse_thickness_mm(value: Any) -> Optional[int]: """Parse a SAP10 insulation-thickness string ('100mm', '400mm+', 'NI', 'ND') to int mm. Returns 0 for 'NI' (No Insulation — semantically meaningful as 0mm). Returns None for unparseable sentinels like 'ND' or '(assumed)'. """ if value is None: return None if isinstance(value, int): return value if not isinstance(value, str): return None s = value.strip() if s.upper() == "NI": return 0 digits = "" for c in s: if c.isdigit(): digits += c else: break return int(digits) if digits else None def _int_or_none(value: Any) -> Optional[int]: """Treat int values as-is, drop string sentinels like 'NA'/'NI'/'ND'.""" return value if isinstance(value, int) else None def _meter_type_int(value: Any) -> Optional[int]: """Domain mapper coerces sap_energy_source.meter_type to str(int) for site-notes compatibility ("1", "2", ...). Parse back to int for the categorical feature.""" if isinstance(value, int): return value if isinstance(value, str) and value.isdigit(): return int(value) return None def _wind_terrain_int(value: Any) -> Optional[int]: """Same shape as meter_type — int coerced to str by the 21.0.x mapper.""" if isinstance(value, int): return value if isinstance(value, str) and value.isdigit(): return int(value) return None def _truthy_yn(value: Any) -> Optional[bool]: """Map 'Y'/'true'/True to True, 'N'/'false'/False to False, anything else to None.""" if value is None: return None if isinstance(value, bool): return value if isinstance(value, str): v = value.strip().lower() if v in ("y", "true", "yes", "1"): return True if v in ("n", "false", "no", "0"): return False return None def _joined_descriptions(elements: list[EnergyElement]) -> Optional[str]: """Concatenate `description` text across an `EnergyElement` list. Used so envelope_heat_loss_w_per_k can spot worst-case markers ("no insulation" / "limited insulation") across every roof / wall / floor entry on the cert, since those are top-level lists not keyed by building part. Returns None when the list is empty so callers can short-circuit. """ if not elements: return None parts = [e.description for e in elements if e.description] if not parts: return None return " | ".join(parts) def _ground_floor(part: SapBuildingPart) -> Optional[Any]: """Pick the ground-floor `SapFloorDimension` (floor==0) for a building part. Falls back to the first floor dimension if no part is flagged as ground floor. Returns None if the part has no floor dimensions at all. """ if not part.sap_floor_dimensions: return None for fd in part.sap_floor_dimensions: if fd.floor == 0: return fd return part.sap_floor_dimensions[0] def _building_part_aggregates(parts: list[SapBuildingPart]) -> dict[str, Any]: """Aggregate sap_building_parts into 24 columns: 5 cross-all + 19 Main-Dwelling. Cross-all aggregates always populate (zeros when no parts). Main-Dwelling columns populate only when a part with `identifier == "Main Dwelling"` is present — otherwise None (we don't silently fall back to the first part). """ main = next( (p for p in parts if p.identifier is BuildingPartIdentifier.MAIN), None ) aggregates: dict[str, Any] = { "building_parts_count": len(parts), "total_heat_loss_perimeter_m": 0.0, "total_party_wall_length_m": 0.0, "total_floor_area_from_parts_m2": 0.0, "avg_room_height_m": None, "main_dwelling_heat_loss_perimeter_m": None, "main_dwelling_party_wall_length_m": None, "main_dwelling_total_floor_area_m2": None, "main_dwelling_avg_room_height_m": None, "main_dwelling_has_room_in_roof": None, "main_dwelling_construction_age_band": None, "main_dwelling_wall_construction": None, "main_dwelling_roof_construction": None, "main_dwelling_alternative_wall_count": 0, "main_dwelling_alternative_wall_area_m2": 0.0, "extension_1_present": False, "other_building_parts_count": 0, } for col in _MAIN_DWELLING_FABRIC_COLUMNS: aggregates[col] = None for col in _SECONDARY_DWELLING_FABRIC_COLUMNS: aggregates[col] = None if not parts: return aggregates total_floor_area = 0.0 weighted_room_height = 0.0 for p in parts: for fd in p.sap_floor_dimensions: aggregates["total_heat_loss_perimeter_m"] += fd.heat_loss_perimeter_m aggregates["total_party_wall_length_m"] += fd.party_wall_length_m total_floor_area += fd.total_floor_area_m2 weighted_room_height += fd.room_height_m * fd.total_floor_area_m2 aggregates["total_floor_area_from_parts_m2"] = total_floor_area if total_floor_area > 0: aggregates["avg_room_height_m"] = weighted_room_height / total_floor_area if main is not None: main_floor_area = 0.0 main_weighted_height = 0.0 main_hlp = 0.0 main_pwl = 0.0 for fd in main.sap_floor_dimensions: main_hlp += fd.heat_loss_perimeter_m main_pwl += fd.party_wall_length_m main_floor_area += fd.total_floor_area_m2 main_weighted_height += fd.room_height_m * fd.total_floor_area_m2 aggregates["main_dwelling_heat_loss_perimeter_m"] = main_hlp aggregates["main_dwelling_party_wall_length_m"] = main_pwl aggregates["main_dwelling_total_floor_area_m2"] = main_floor_area if main_floor_area > 0: aggregates["main_dwelling_avg_room_height_m"] = ( main_weighted_height / main_floor_area ) aggregates["main_dwelling_has_room_in_roof"] = main.sap_room_in_roof is not None aggregates["main_dwelling_construction_age_band"] = main.construction_age_band aggregates["main_dwelling_wall_construction"] = ( main.wall_construction if isinstance(main.wall_construction, int) else None ) aggregates["main_dwelling_roof_construction"] = main.roof_construction # New fabric inputs: walls aggregates["main_dwelling_wall_insulation_type"] = _int_or_none(main.wall_insulation_type) aggregates["main_dwelling_wall_insulation_thickness_mm"] = _parse_thickness_mm( main.wall_insulation_thickness ) aggregates["main_dwelling_wall_dry_lined"] = main.wall_dry_lined aggregates["main_dwelling_wall_thickness_mm"] = main.wall_thickness_mm aggregates["main_dwelling_party_wall_construction"] = _int_or_none( main.party_wall_construction ) # New fabric inputs: roof aggregates["main_dwelling_roof_insulation_location"] = _int_or_none( main.roof_insulation_location ) aggregates["main_dwelling_roof_insulation_thickness_mm"] = _parse_thickness_mm( main.roof_insulation_thickness ) # New fabric inputs: floor — from ground-floor SapFloorDimension aggregates["main_dwelling_floor_heat_loss"] = main.floor_heat_loss aggregates["main_dwelling_floor_insulation_thickness_mm"] = _parse_thickness_mm( main.floor_insulation_thickness ) ground_floor = _ground_floor(main) if ground_floor is not None: aggregates["main_dwelling_floor_construction"] = ground_floor.floor_construction aggregates["main_dwelling_floor_insulation"] = ground_floor.floor_insulation # Main dwelling extras: room-in-roof, alternative walls, flat-roof, measured flag. if main.sap_room_in_roof is not None: aggregates["main_dwelling_room_in_roof_floor_area_m2"] = float( main.sap_room_in_roof.floor_area ) alt_count = 0 alt_area = 0.0 for alt in (main.sap_alternative_wall_1, main.sap_alternative_wall_2): if alt is not None: alt_count += 1 alt_area += float(alt.wall_area) aggregates["main_dwelling_alternative_wall_count"] = alt_count aggregates["main_dwelling_alternative_wall_area_m2"] = alt_area aggregates["main_dwelling_flat_roof_insulation_thickness_mm"] = _parse_thickness_mm( main.flat_roof_insulation_thickness ) aggregates["main_dwelling_wall_thickness_measured"] = main.wall_thickness_measured # Extension 1 — first non-main entry in the list. secondary = next( (p for p in parts if p.identifier is not BuildingPartIdentifier.MAIN), None ) if secondary is not None: aggregates["extension_1_present"] = True aggregates["extension_1_wall_construction"] = _int_or_none( secondary.wall_construction ) aggregates["extension_1_wall_insulation_type"] = _int_or_none( secondary.wall_insulation_type ) aggregates["extension_1_wall_insulation_thickness_mm"] = _parse_thickness_mm( secondary.wall_insulation_thickness ) aggregates["extension_1_wall_thickness_mm"] = secondary.wall_thickness_mm aggregates["extension_1_roof_construction"] = secondary.roof_construction aggregates["extension_1_roof_insulation_thickness_mm"] = _parse_thickness_mm( secondary.roof_insulation_thickness ) sec_ground = _ground_floor(secondary) if sec_ground is not None: aggregates["extension_1_floor_construction"] = sec_ground.floor_construction aggregates["extension_1_floor_insulation"] = sec_ground.floor_insulation aggregates["extension_1_floor_insulation_thickness_mm"] = _parse_thickness_mm( secondary.floor_insulation_thickness ) sec_floor_area = 0.0 sec_hlp = 0.0 if secondary.sap_floor_dimensions: for fd in secondary.sap_floor_dimensions: sec_floor_area += fd.total_floor_area_m2 sec_hlp += fd.heat_loss_perimeter_m aggregates["extension_1_total_floor_area_m2"] = sec_floor_area aggregates["extension_1_heat_loss_perimeter_m"] = sec_hlp # Anything beyond main + secondary just gets counted (extension chains, etc.). aggregates["other_building_parts_count"] = max(0, len(parts) - (1 if main else 0) - (1 if secondary else 0)) return aggregates def _window_aggregates(windows: list[SapWindow]) -> dict[str, Any]: """Aggregate a list of windows into the 30 window-feature columns. With no windows: counts/areas/shares are 0; nullable averages and the pvc_frame share are None. Windows whose `orientation` isn't an integer in 1-8 contribute to count and total area but to no octant. Windows whose `glazing_type` isn't in the SAP10 1-15 enum fall into the `_other` share. """ octant_areas: dict[str, float] = {name: 0.0 for name in _OCTANT_NAMES.values()} glazed_type_areas: dict[str, float] = { f"window_pct_glazed_type_{code}": 0.0 for code in _GLAZED_TYPE_CODES } glazed_type_areas["window_pct_glazed_type_other"] = 0.0 aggregates: dict[str, Any] = { "window_count": len(windows), "window_total_area_m2": 0.0, **{f"window_area_orientation_{name}": 0.0 for name in _OCTANT_NAMES.values()}, "window_pct_draught_proofed": None, "window_avg_u_value": None, "window_avg_solar_transmittance": None, **glazed_type_areas, "window_pct_pvc_frame": None, "window_pct_living": None, "window_pct_external": None, "window_pct_permanent_shutters": None, "window_avg_glazing_gap_mm": None, "window_avg_frame_factor": None, "window_pct_permanent_shutters_insulated": None, } if not windows: return aggregates total_area = 0.0 draught_proofed_area = 0.0 pvc_frame_area = 0.0 living_area = 0.0 external_area = 0.0 shutters_area = 0.0 shutters_insulated_area = 0.0 transmission_area = 0.0 weighted_u_value = 0.0 weighted_solar_transmittance = 0.0 glazing_gap_area = 0.0 weighted_glazing_gap = 0.0 frame_factor_area = 0.0 weighted_frame_factor = 0.0 for w in windows: area = w.window_width * w.window_height total_area += area if w.draught_proofed is True or w.draught_proofed == "true": draught_proofed_area += area if w.frame_material == "PVC": pvc_frame_area += area if w.window_type == 1: # living room living_area += area if w.window_location == 0: # external (not conservatory) external_area += area if w.permanent_shutters_present is True or w.permanent_shutters_present == "Y": shutters_area += area if w.permanent_shutters_insulated == "Y": shutters_insulated_area += area if isinstance(w.glazing_gap, int): glazing_gap_area += area weighted_glazing_gap += float(w.glazing_gap) * area if w.frame_factor is not None: frame_factor_area += area weighted_frame_factor += float(w.frame_factor) * area if isinstance(w.orientation, int) and w.orientation in _OCTANT_NAMES: octant_areas[_OCTANT_NAMES[w.orientation]] += area if isinstance(w.glazing_type, int) and w.glazing_type in _GLAZED_TYPE_CODES: glazed_type_areas[f"window_pct_glazed_type_{w.glazing_type}"] += area else: glazed_type_areas["window_pct_glazed_type_other"] += area if w.window_transmission_details is not None: transmission_area += area weighted_u_value += w.window_transmission_details.u_value * area weighted_solar_transmittance += ( w.window_transmission_details.solar_transmittance * area ) aggregates["window_total_area_m2"] = total_area for name, area in octant_areas.items(): aggregates[f"window_area_orientation_{name}"] = area if total_area > 0: aggregates["window_pct_draught_proofed"] = ( draught_proofed_area / total_area * 100.0 ) aggregates["window_pct_pvc_frame"] = pvc_frame_area / total_area aggregates["window_pct_living"] = living_area / total_area aggregates["window_pct_external"] = external_area / total_area aggregates["window_pct_permanent_shutters"] = shutters_area / total_area aggregates["window_pct_permanent_shutters_insulated"] = ( shutters_insulated_area / total_area ) for column, area in glazed_type_areas.items(): aggregates[column] = area / total_area if transmission_area > 0: aggregates["window_avg_u_value"] = weighted_u_value / transmission_area aggregates["window_avg_solar_transmittance"] = ( weighted_solar_transmittance / transmission_area ) if glazing_gap_area > 0: aggregates["window_avg_glazing_gap_mm"] = weighted_glazing_gap / glazing_gap_area if frame_factor_area > 0: aggregates["window_avg_frame_factor"] = weighted_frame_factor / frame_factor_area return aggregates