# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 437 lines, 18 KiB, Python
import re
|
|
from numpy import nan
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
from backend.onboarders.mappings.property_type import parity_map as property_map
|
|
from backend.onboarders.mappings.age_band import parity_map as age_band_map
|
|
from backend.onboarders.mappings.built_form import parity_map as built_form_map
|
|
from backend.onboarders.epc_descriptions import EpcWallDescriptions, EpcConstructionAgeBand, EpcEfficiency, \
|
|
WALL_DESCRIPTION_EFFICIENCIES, EpcRoofDescriptions, resolve_roof_efficiency, EpcFloorDescriptions
|
|
from backend.onboarders.mappings.as_built_wall_classifiers import AS_BUILT_WALL_CLASSIFIERS
|
|
from backend.onboarders.mappings.as_built_roof_classifiers import AS_BUILT_ROOF_CLASSIFIERS
|
|
from backend.onboarders.mappings.as_built_floor_classifiers import unknown_floor_as_built, unknown_floor_retrofitted, \
|
|
solid_floor_as_built, suspended_floor_as_built
|
|
|
|
# Register tqdm's pandas integration; the `progress_apply` calls later in this
# script depend on it.
tqdm.pandas()
|
|
|
|
|
|
def check_nulls(data, original_column, mapped_column):
    """Verify that a mapped column is null only where its source column was null.

    Args:
        data: DataFrame containing both columns.
        original_column: Name of the column the mapping was applied to.
        mapped_column: Name of the column produced by the mapping.

    Returns:
        True when the check passes (including when there are no nulls at all).

    Raises:
        AssertionError: If at least one row has a null mapped value while its
            original value is not null, i.e. the mapping is missing an entry.
    """
    # Rows for which the mapping produced no value.
    unmapped = data[pd.isnull(data[mapped_column])]

    # A null mapped value is acceptable only if the original value was null too.
    # `.all()` on an empty selection is True, so "no nulls" passes naturally.
    if not pd.isnull(unmapped[original_column]).all():
        # Raised explicitly rather than via `assert` so the check still runs
        # when Python is started with -O.
        raise AssertionError(
            f"Some values in {mapped_column} were not mapped, but original values were not null"
        )

    return True
|
|
|
|
|
|
# Load the "Sustainability" sheet of the landlord extract used as sample input.
# NOTE(review): this is an absolute path on one developer's machine - confirm
# whether it should be made configurable.
data = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/"
    "Nov 2025 Consulting Project/"
    "2025_11_11 - Peabody - Data Extracts for Domna.xlsx",
    sheet_name="Sustainability",
)

# The parity fields are mapped to standard EPC references. This lets us:
#   1) estimate EPCs more accurately,
#   2) patch incorrect EPCs with ease,
#   3) indicate already installed measures.

# ------------ construction_age_band ------------
data["construction_age_band"] = data["Construction Years"].map(age_band_map)
# An unmapped age band is tolerated only where "Construction Years" was null.
check_nulls(
    data,
    original_column="Construction Years",
    mapped_column="construction_age_band",
)

# ------------ property_type ------------
data["property_type"] = data["Type"].map(property_map)
assert not data["property_type"].isnull().any(), "Some property types were not mapped"

# ------------ built_form ------------
data["built_form"] = data["Attachment"].map(built_form_map)
assert not data["built_form"].isnull().any(), "Some built forms were not mapped"
|
|
|
|
# ------------ Wall Construction ------------

# Lookup keyed on the unique ("Wall Construction", "Wall Insulation")
# combinations. Combinations that carry explicit insulation information map
# straight to an EPC wall description; the rest map to None and are classified
# afterwards.
wall_mapping = {
    # Cavity walls
    ("Cavity", "FilledCavity"): EpcWallDescriptions.cavity_filled_cavity,
    ("Cavity", "Internal"): EpcWallDescriptions.cavity_internal_insulation,
    ("Cavity", "External"): EpcWallDescriptions.cavity_external_insulation,
    ("Cavity", "FilledCavityPlusInternal"): EpcWallDescriptions.cavity_filled_plus_internal,
    ("Cavity", "FilledCavityPlusExternal"): EpcWallDescriptions.cavity_filled_plus_external,
    # System built walls
    ("System", "External"): EpcWallDescriptions.system_external_insulation,
    ("System", "Internal"): EpcWallDescriptions.system_internal_insulation,
    # Timber frame walls
    ("Timber Frame", "Internal"): EpcWallDescriptions.timber_frame_internal_insulation,
    ("Timber Frame", "External"): EpcWallDescriptions.timber_frame_external_insulation,
    # Solid brick walls
    ("Solid Brick", "External"): EpcWallDescriptions.solid_brick_external_insulation,
    ("Solid Brick", "Internal"): EpcWallDescriptions.solid_brick_internal_insulation,
    # Granite walls
    ("Granite", "External"): EpcWallDescriptions.granite_whinstone_external_insulation,
    ("Granite", "Internal"): EpcWallDescriptions.granite_whinstone_internal_insulation,
    # Sandstone walls
    ("Sandstone", "Internal"): EpcWallDescriptions.sandstone_limestone_internal_insulation,
    ("Sandstone", "External"): EpcWallDescriptions.sandstone_limestone_external_insulation,
    # As-built / unknown insulation: left as None, to be classified later.
    **{
        (construction, insulation): None
        for construction in (
            "Cavity",
            "System",
            "Timber Frame",
            "Solid Brick",
            "Granite",
            "Sandstone",
        )
        for insulation in ("AsBuilt", "Unknown")
    },
    # Cob walls only appear as-built.
    ("Cob", "AsBuilt"): None,
}
|
|
|
|
# Description to use, per wall construction, when a wall still needs
# classifying but the property has no construction age band.
WALL_UNKNOWN_AGE_FALLBACK = {
    "Cavity": EpcWallDescriptions.cavity_as_built_unknown,
    "Solid Brick": EpcWallDescriptions.solid_brick_as_built_unknown,
    "Timber Frame": EpcWallDescriptions.timber_frame_as_built_unknown,
    "System": EpcWallDescriptions.system_as_built_unknown,
    "Granite": EpcWallDescriptions.granite_as_built_unknown,
    "Sandstone": EpcWallDescriptions.sandstone_as_built_unknown,
    "Cob": EpcWallDescriptions.cob_as_built_unknown,
}

# First pass: look every (construction, insulation) pair up in wall_mapping.
_wall_keys = data[["Wall Construction", "Wall Insulation"]].apply(tuple, axis=1)
data["landlord_wall_description"] = _wall_keys.map(wall_mapping)
|
|
|
|
|
|
def fill_as_built(row):
    """Resolve the wall description for one row of ``data``.

    Args:
        row: A row carrying ``landlord_wall_description``,
            ``construction_age_band`` and ``"Wall Construction"``.

    Returns:
        The existing description when the row already has one. Otherwise the
        entry of ``WALL_UNKNOWN_AGE_FALLBACK`` for the wall type when the age
        band is missing, or the result of the wall type's classifier from
        ``AS_BUILT_WALL_CLASSIFIERS`` applied to the age band. ``None`` when no
        fallback or classifier exists for the wall type.
    """
    current = row.landlord_wall_description

    # Already resolved via direct mapping. `pd.isnull` is used instead of
    # `is not None` because Series.map yields NaN (not None) for combinations
    # missing from the lookup; those must be classified too rather than being
    # handed back as NaN. This matches the check in fill_roof_as_built.
    if not pd.isnull(current):
        return current

    wall_type = row["Wall Construction"]

    # Missing construction age -> conservative fallback
    if pd.isnull(row.construction_age_band):
        return WALL_UNKNOWN_AGE_FALLBACK.get(wall_type)

    classifier = AS_BUILT_WALL_CLASSIFIERS.get(wall_type)
    if classifier is None:
        # No classifier for this wall type; the caller's null check reports it.
        return None

    return classifier(row.construction_age_band)
|
|
|
|
|
|
def resolve_wall_efficiency(
    description: EpcWallDescriptions,
    age_band: EpcConstructionAgeBand | None,
) -> EpcEfficiency:
    """Return the efficiency rating for a wall description.

    Args:
        description: The resolved wall description.
        age_band: Construction age band of the property, if known.

    Returns:
        ``EpcEfficiency.NA`` when the description text contains
        "unknown insulation", when ``WALL_DESCRIPTION_EFFICIENCIES`` has no
        rule for the description, or when the rule needs an age band that is
        missing. Otherwise the fixed efficiency stored for the description, or
        the result of calling the stored rule with ``age_band``.
    """
    # Unknown / holding descriptions have no meaningful efficiency.
    description_text = description.value.lower()
    if "unknown insulation" in description_text:
        return EpcEfficiency.NA

    rule = WALL_DESCRIPTION_EFFICIENCIES.get(description)

    # A rule may be a fixed efficiency ...
    if isinstance(rule, EpcEfficiency):
        return rule

    # ... or absent, or a callable that cannot be evaluated without an age band.
    if rule is None or age_band is None or pd.isnull(age_band):
        return EpcEfficiency.NA

    return rule(age_band)
|
|
|
|
|
|
# Second pass: classify every wall the direct lookup left unresolved.
data["landlord_wall_description"] = data.progress_apply(fill_as_built, axis=1)

assert not data["landlord_wall_description"].isnull().any(), (
    "Some wall descriptions could not be resolved"
)


def _wall_efficiency_for_row(row):
    """Resolve the wall efficiency from a row's description and age band."""
    return resolve_wall_efficiency(
        row.landlord_wall_description,
        row.construction_age_band,
    )


data["landlord_wall_efficiency"] = data.progress_apply(_wall_efficiency_for_row, axis=1)

# Sanity check: every row must have received an efficiency.
assert not data["landlord_wall_efficiency"].isnull().any()
|
|
|
|
# ------------ Roof Construction ------------

# The two pitched roof types that carry a loft insulation depth.
_PITCHED_LOFT_ROOFS = ("PitchedNormalLoftAccess", "PitchedNormalNoLoftAccess")

# Loft insulation depth label -> EPC roof description (same for both loft types).
_LOFT_DEPTH_DESCRIPTIONS = {
    "mm25": EpcRoofDescriptions.loft_25mm_insulation,
    "mm50": EpcRoofDescriptions.loft_50mm_insulation,
    "mm75": EpcRoofDescriptions.loft_75mm_insulation,
    "mm100": EpcRoofDescriptions.loft_100mm_insulation,
    "mm150": EpcRoofDescriptions.loft_150mm_insulation,
    "mm200": EpcRoofDescriptions.loft_200mm_insulation,
    "mm250": EpcRoofDescriptions.loft_250mm_insulation,
    "mm270": EpcRoofDescriptions.loft_270mm_insulation,
    "mm300": EpcRoofDescriptions.loft_300mm_insulation,
    "mm350": EpcRoofDescriptions.loft_350mm_insulation,
    "mm400": EpcRoofDescriptions.loft_400mm_plus_insulation,
}

# Lookup keyed on ("Roof Construction", "Roof Insulation"). A value of None
# means the combination is classified afterwards.
roof_mapping = {
    # Dwelling above
    # NOTE(review): SameDwellingAbove also maps to another_dwelling_above -
    # confirm this is intended.
    ("AnotherDwellingAbove", "Another Dwelling Above"): EpcRoofDescriptions.another_dwelling_above,
    ("SameDwellingAbove", "Same Dwelling Above"): EpcRoofDescriptions.another_dwelling_above,

    # Pitched (with or without loft access) and a recorded loft thickness
    **{
        (roof, depth): description
        for roof in _PITCHED_LOFT_ROOFS
        for depth, description in _LOFT_DEPTH_DESCRIPTIONS.items()
    },

    # Pitched with missing, as-built or unknown insulation all go to
    # EpcRoofDescriptions.pitched_insulated_assumed
    **{
        (roof, insulation): EpcRoofDescriptions.pitched_insulated_assumed
        for roof in _PITCHED_LOFT_ROOFS
        for insulation in (nan, "AsBuilt", "Unknown")
    },

    # Flat
    #   12mm                          = very poor, limited insulation description
    #   25, 50mm                      = poor, limited insulation description
    #   75, 100, 125mm                = average (Flat, insulated)
    #   150, 175, 200, 225, 250mm     = good (Flat, insulated)
    #   270mm+                        = very good (Flat, insulated)
    ("Flat", "NoInsulation"): EpcRoofDescriptions.flat_no_insulation,
    # Flat - limited insulation
    # NOTE(review): this key is spelled '12mm' whereas the sloping-ceiling key
    # is 'mm12' - confirm against the values present in the data.
    **{
        ("Flat", depth): EpcRoofDescriptions.flat_limited_insulation
        for depth in ("12mm", "mm25", "mm50")
    },
    # Flat - insulated
    **{
        ("Flat", depth): EpcRoofDescriptions.flat_insulated
        for depth in ("mm75", "mm100", "mm150", "mm200", "mm250", "mm300", "mm350", "mm400")
    },
    # Flat - as built or unknown: to be classified
    **{("Flat", insulation): None for insulation in ("AsBuilt", nan, "Unknown")},

    # Thatched
    **{
        ("PitchedThatched", depth): EpcRoofDescriptions.thatched_with_additional_insulation
        for depth in ("mm50", "mm150", "mm300")
    },
    # Efficiency is classified based on age.
    ("PitchedThatched", "Unknown"): EpcRoofDescriptions.thatched,

    # Sloping ceiling
    # Limited (12 very poor, 25-50 poor)
    **{
        ("PitchedWithSlopingCeiling", depth): EpcRoofDescriptions.sloping_pitched_limited_insulation
        for depth in ("mm12", "mm25", "mm50")
    },
    # Insulated 75mm+ (75 - 125 average, 150 - 250 good, 270+ very good)
    **{
        ("PitchedWithSlopingCeiling", depth): EpcRoofDescriptions.sloping_pitched_insulated
        for depth in (
            "mm75",
            "mm100",
            "mm150",
            "mm200",
            "mm250",
            "mm270",
            "mm300",
            "mm350",
            "mm400",
        )
    },
    # As built / unknown: to be classified
    **{
        ("PitchedWithSlopingCeiling", insulation): None
        for insulation in ("AsBuilt", nan, "Unknown")
    },
}
|
|
|
|
# Description to use, per roof construction, when a roof still needs
# classifying but the property has no construction age band.
ROOF_UNKNOWN_AGE_FALLBACK = {
    "Flat": EpcRoofDescriptions.flat_as_built_unknown,
    "PitchedWithSlopingCeiling": EpcRoofDescriptions.sloping_pitched_as_built_unknown,
    "PitchedThatched": EpcRoofDescriptions.thatched_as_built_unknown,
    # Both normal pitched variants share the loft fallback.
    **dict.fromkeys(
        ("PitchedNormalLoftAccess", "PitchedNormalNoLoftAccess"),
        EpcRoofDescriptions.loft_as_built_unknown,
    ),
}
|
|
|
|
|
|
def fill_roof_as_built(row):
    """Resolve the roof description for one row of ``data``.

    Args:
        row: A row carrying ``landlord_roof_description``,
            ``construction_age_band`` and ``"Roof Construction"``.

    Returns:
        The existing description when it is not null. Otherwise the entry of
        ``ROOF_UNKNOWN_AGE_FALLBACK`` for the roof type when the age band is
        null, or the output of the roof type's classifier from
        ``AS_BUILT_ROOF_CLASSIFIERS`` applied to the age band.

    Raises:
        NotImplementedError: If the roof type has no classifier, or the
            classifier returns None.
    """
    existing = row.landlord_roof_description
    if pd.notnull(existing):
        # Already resolved
        return existing

    roof_type = row["Roof Construction"]

    # The classifier lookup happens before the age check, so an unsupported
    # roof type is reported even when the age band is missing.
    classify = AS_BUILT_ROOF_CLASSIFIERS.get(roof_type)
    if classify is None:
        raise NotImplementedError(f"No roof classifier for roof type '{roof_type}'")

    age_band = row.construction_age_band
    if pd.isnull(age_band):
        return ROOF_UNKNOWN_AGE_FALLBACK.get(roof_type)

    description = classify(age_band)
    if description is not None:
        return description

    raise NotImplementedError(
        f"Roof classification returned None for roof type '{roof_type}'"
    )
|
|
|
|
|
|
# First pass: look every (construction, insulation) pair up in roof_mapping.
_roof_keys = data[["Roof Construction", "Roof Insulation"]].progress_apply(tuple, axis=1)
data["landlord_roof_description"] = _roof_keys.map(roof_mapping)

# Second pass: classify the roofs the direct lookup left unresolved.
data["landlord_roof_description"] = data.progress_apply(fill_roof_as_built, axis=1)

# Sanity check
assert not data["landlord_roof_description"].isnull().any(), (
    "Some roof descriptions could not be resolved"
)
|
|
|
|
|
|
def extract_insulation_thickness(value: str | None) -> int | None:
|
|
"""
|
|
Extract insulation thickness in mm from a string like 'mm150'.
|
|
Returns None if not present or not parseable.
|
|
"""
|
|
if value is None or pd.isnull(value):
|
|
return None
|
|
|
|
match = re.search(r"(\d+)", str(value))
|
|
if not match:
|
|
return None
|
|
|
|
return int(match.group(1))
|
|
|
|
|
|
# Numeric insulation depth parsed from the "Roof Insulation" label.
data["roof_insulation_thickness_mm"] = data["Roof Insulation"].apply(
    extract_insulation_thickness
)


def _roof_efficiency_for_row(row):
    """Resolve the roof efficiency from a row's description, age band and depth."""
    return resolve_roof_efficiency(
        description=row.landlord_roof_description,
        age_band=row.construction_age_band,
        insulation_thickness=row.roof_insulation_thickness_mm,
    )


data["landlord_roof_efficiency"] = data.progress_apply(_roof_efficiency_for_row, axis=1)

# Every row must have received an efficiency.
assert not data["landlord_roof_efficiency"].isnull().any()

# Flag sloping ceiling
data["has_sloping_ceiling"] = data["Roof Construction"].apply(
    lambda roof_type: roof_type == "PitchedWithSlopingCeiling"
)
|
|
|
|
# ------------ Floor Construction ------------

# Lookup keyed on ("Floor Construction", "Floor Insulation"). Only retrofitted
# solid/suspended floors resolve directly; a value of None means the
# combination is resolved afterwards by fill_floor_as_built.
floor_mapping = {
    # Retrofitted floors of a known type
    ("Solid", "RetroFitted"): EpcFloorDescriptions.solid_insulated,
    ("SuspendedTimber", "RetroFitted"): EpcFloorDescriptions.suspended_insulated,
    ("SuspendedNotTimber", "RetroFitted"): EpcFloorDescriptions.suspended_insulated,

    # Known floor type, as-built / unknown / missing insulation: classified later
    # (solid_floor_as_built or suspended_floor_as_built).
    **{
        (construction, insulation): None
        for construction in ("Solid", "SuspendedTimber", "SuspendedNotTimber")
        for insulation in ("AsBuilt", "Unknown", nan)
    },

    # Unknown floor type - mapped on age
    # (unknown_floor_as_built / unknown_floor_retrofitted).
    ("Unknown", "Unknown"): None,
    ("Unknown", "RetroFitted"): None,
    ("Unknown", "AsBuilt"): None,
    # No actual information!
    (nan, nan): None,
}

# First pass: look every (construction, insulation) pair up in floor_mapping.
_floor_keys = data[["Floor Construction", "Floor Insulation"]].progress_apply(tuple, axis=1)
data["landlord_floor_description"] = _floor_keys.map(floor_mapping)
|
|
|
|
|
|
def fill_floor_as_built(row):
    """Resolve the floor description for one row of ``data``.

    Args:
        row: A row carrying ``landlord_floor_description``,
            ``construction_age_band``, ``"Floor Construction"`` and
            ``"Floor Insulation"``.

    Returns:
        The existing description when the row already has one. Otherwise
        ``EpcFloorDescriptions.unknown`` when the age band is missing or the
        floor type is not recognised, or the result of the age-based
        classifier for the floor type.
    """
    # 1. Already resolved. `pd.isnull` is used instead of `is not None`
    #    because Series.map yields NaN (not None) for combinations missing
    #    from the lookup, e.g. ("Unknown", nan); those must continue to the
    #    steps below rather than being handed back as NaN. This matches the
    #    check in fill_roof_as_built.
    if not pd.isnull(row.landlord_floor_description):
        return row.landlord_floor_description

    age_band = row.construction_age_band
    floor_type = row["Floor Construction"]
    insulation = row["Floor Insulation"]

    # 2. Missing age band -> conservative fallback
    if pd.isnull(age_band):
        return EpcFloorDescriptions.unknown

    # 3. Known floor types
    if floor_type == "Solid":
        return solid_floor_as_built(age_band)

    if floor_type in {"SuspendedTimber", "SuspendedNotTimber"}:
        return suspended_floor_as_built(age_band)

    # 4. Unknown floor type
    if floor_type == "Unknown":
        if insulation == "RetroFitted":
            return unknown_floor_retrofitted(age_band)
        return unknown_floor_as_built(age_band)

    # 5. Truly missing / garbage input
    return EpcFloorDescriptions.unknown
|
|
|
|
|
|
# Second pass: resolve the floors the direct lookup left unresolved.
data["landlord_floor_description"] = data.progress_apply(fill_floor_as_built, axis=1)

# All values should be remapped now
assert not data["landlord_floor_description"].isnull().any(), (
    "Some floor descriptions could not be resolved"
)
|
|
|
|
# TODO: Convert everything to values
|
|
|
|
# Variables we want to map
|
|
# 'Org Ref', 'Address 1', 'Address 2', 'Address 3', 'Postcode',
|
|
# 'Floor Construction', 'Floor Insulation', 'Glazing', 'Heating',
|
|
# 'Boiler Efficiency', 'Main Fuel', 'Controls Adequacy', 'UPRN',
|
|
# 'Total Floor Area (m2)'
|