# Model/backend/onboarders/parity.py
# 2026-02-02 20:01:47 +00:00
#
# 437 lines
# 18 KiB
# Python

import re
from numpy import nan
from tqdm import tqdm
import pandas as pd
from backend.onboarders.mappings.property_type import parity_map as property_map
from backend.onboarders.mappings.age_band import parity_map as age_band_map
from backend.onboarders.mappings.built_form import parity_map as built_form_map
from backend.onboarders.epc_descriptions import EpcWallDescriptions, EpcConstructionAgeBand, EpcEfficiency, \
WALL_DESCRIPTION_EFFICIENCIES, EpcRoofDescriptions, resolve_roof_efficiency, EpcFloorDescriptions
from backend.onboarders.mappings.as_built_wall_classifiers import AS_BUILT_WALL_CLASSIFIERS
from backend.onboarders.mappings.as_built_roof_classifiers import AS_BUILT_ROOF_CLASSIFIERS
from backend.onboarders.mappings.as_built_floor_classifiers import unknown_floor_as_built, unknown_floor_retrofitted, \
solid_floor_as_built, suspended_floor_as_built
tqdm.pandas()
def check_nulls(data: pd.DataFrame, original_column: str, mapped_column: str) -> bool:
    """Verify that a mapped column is only null where its source column was null.

    A null in ``mapped_column`` is acceptable only when the corresponding
    ``original_column`` value was itself null; anything else means the
    mapping table is missing an entry.

    Args:
        data: Frame holding both columns.
        original_column: Name of the source (pre-mapping) column.
        mapped_column: Name of the column produced by the mapping.

    Returns:
        True when the check passes (whether or not any nulls are present).

    Raises:
        AssertionError: If a non-null source value ended up null after mapping.
            Raised explicitly so the check is not stripped under ``python -O``.
    """
    missing_after_mapping = data[mapped_column].isnull()
    had_source_value = data[original_column].notnull()
    unmapped_sources = data.loc[missing_after_mapping & had_source_value, original_column]

    if not unmapped_sources.empty:
        # Show a handful of offending source values to make the gap easy to fix.
        examples = unmapped_sources.drop_duplicates().head(10).tolist()
        raise AssertionError(
            f"Some values in {mapped_column} were not mapped, but original values were not null: {examples}"
        )
    return True
# Sample input data: the "Sustainability" sheet of the Peabody extract workbook.
# NOTE(review): the path below is absolute and specific to one machine.
_SAMPLE_WORKBOOK = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
    "- Data Extracts for Domna.xlsx"
)
data = pd.read_excel(_SAMPLE_WORKBOOK, sheet_name="Sustainability")
# Map the Parity fields onto standard EPC references. This lets us:
#   1) estimate EPCs more accurately
#   2) patch incorrect EPCs with ease
#   3) indicate measures that are already installed

# ------------ construction_age_band ------------
# Nulls are tolerated here only where "Construction Years" was itself null.
data["construction_age_band"] = data["Construction Years"].map(age_band_map)
check_nulls(data, "Construction Years", "construction_age_band")

# ------------ property_type ------------
# Every row must map; no nulls are accepted.
data["property_type"] = data["Type"].map(property_map)
assert data["property_type"].notnull().all(), "Some property types were not mapped"

# ------------ built_form ------------
# Every row must map; no nulls are accepted.
data["built_form"] = data["Attachment"].map(built_form_map)
assert data["built_form"].notnull().all(), "Some built forms were not mapped"
# ------------ Wall Construction ------------
# Lookup keyed by ("Wall Construction", "Wall Insulation") pairs.
# A value of None marks a pair that is resolved later by fill_as_built.
wall_mapping = {
    # Cavity
    ("Cavity", "FilledCavity"): EpcWallDescriptions.cavity_filled_cavity,
    ("Cavity", "FilledCavityPlusInternal"): EpcWallDescriptions.cavity_filled_plus_internal,
    ("Cavity", "FilledCavityPlusExternal"): EpcWallDescriptions.cavity_filled_plus_external,
    ("Cavity", "Internal"): EpcWallDescriptions.cavity_internal_insulation,
    ("Cavity", "External"): EpcWallDescriptions.cavity_external_insulation,
    ("Cavity", "AsBuilt"): None,
    ("Cavity", "Unknown"): None,
    # System built
    ("System", "Internal"): EpcWallDescriptions.system_internal_insulation,
    ("System", "External"): EpcWallDescriptions.system_external_insulation,
    ("System", "AsBuilt"): None,
    ("System", "Unknown"): None,
    # Timber frame
    ("Timber Frame", "Internal"): EpcWallDescriptions.timber_frame_internal_insulation,
    ("Timber Frame", "External"): EpcWallDescriptions.timber_frame_external_insulation,
    ("Timber Frame", "AsBuilt"): None,
    ("Timber Frame", "Unknown"): None,
    # Solid brick
    ("Solid Brick", "Internal"): EpcWallDescriptions.solid_brick_internal_insulation,
    ("Solid Brick", "External"): EpcWallDescriptions.solid_brick_external_insulation,
    ("Solid Brick", "AsBuilt"): None,
    ("Solid Brick", "Unknown"): None,
    # Granite
    ("Granite", "Internal"): EpcWallDescriptions.granite_whinstone_internal_insulation,
    ("Granite", "External"): EpcWallDescriptions.granite_whinstone_external_insulation,
    ("Granite", "AsBuilt"): None,
    ("Granite", "Unknown"): None,
    # Sandstone
    ("Sandstone", "Internal"): EpcWallDescriptions.sandstone_limestone_internal_insulation,
    ("Sandstone", "External"): EpcWallDescriptions.sandstone_limestone_external_insulation,
    ("Sandstone", "AsBuilt"): None,
    ("Sandstone", "Unknown"): None,
    # Cob
    ("Cob", "AsBuilt"): None,
}
# Description used by fill_as_built when a row has no construction age band,
# keyed by the raw "Wall Construction" value.
WALL_UNKNOWN_AGE_FALLBACK = {
    "Cavity": EpcWallDescriptions.cavity_as_built_unknown,
    "System": EpcWallDescriptions.system_as_built_unknown,
    "Timber Frame": EpcWallDescriptions.timber_frame_as_built_unknown,
    "Solid Brick": EpcWallDescriptions.solid_brick_as_built_unknown,
    "Granite": EpcWallDescriptions.granite_as_built_unknown,
    "Sandstone": EpcWallDescriptions.sandstone_as_built_unknown,
    "Cob": EpcWallDescriptions.cob_as_built_unknown,
}

# First pass: direct lookup of each (construction, insulation) pair.
_wall_lookup_keys = data[["Wall Construction", "Wall Insulation"]].apply(tuple, axis=1)
data["landlord_wall_description"] = _wall_lookup_keys.map(wall_mapping)
def fill_as_built(row):
    """Resolve the wall description for one row after the direct lookup.

    Rows whose ``landlord_wall_description`` is anything other than ``None``
    are returned unchanged. Otherwise the description comes from
    ``WALL_UNKNOWN_AGE_FALLBACK`` when the age band is missing, or from the
    per-wall-type classifier in ``AS_BUILT_WALL_CLASSIFIERS``.

    Returns None when no fallback / classifier exists for the wall type.
    """
    existing = row.landlord_wall_description
    # NOTE(review): only None counts as unresolved here. A NaN (e.g. from a
    # pair absent from the lookup) is passed through as-is, whereas
    # fill_roof_as_built uses pd.isnull - confirm this difference is intended.
    if existing is not None:
        return existing

    construction = row["Wall Construction"]

    # No age band to classify on: use the per-construction holding description.
    if pd.isnull(row.construction_age_band):
        return WALL_UNKNOWN_AGE_FALLBACK.get(construction)

    classify = AS_BUILT_WALL_CLASSIFIERS.get(construction)
    return None if classify is None else classify(row.construction_age_band)
def resolve_wall_efficiency(
    description: EpcWallDescriptions,
    age_band: EpcConstructionAgeBand | None,
) -> EpcEfficiency:
    """Look up the efficiency rating for a wall description.

    Returns ``EpcEfficiency.NA`` when the description text contains
    "unknown insulation", when ``WALL_DESCRIPTION_EFFICIENCIES`` has no entry
    for it, or when the entry is an age-dependent rule and no age band is
    available. A fixed ``EpcEfficiency`` entry is returned directly; any other
    entry is called with ``age_band``.
    """
    is_holding_description = "unknown insulation" in description.value.lower()
    if is_holding_description:
        return EpcEfficiency.NA

    efficiency_rule = WALL_DESCRIPTION_EFFICIENCIES.get(description)
    if efficiency_rule is None:
        return EpcEfficiency.NA
    if isinstance(efficiency_rule, EpcEfficiency):
        return efficiency_rule

    # Remaining rules are callables that need an age band to evaluate.
    age_band_missing = age_band is None or pd.isnull(age_band)
    return EpcEfficiency.NA if age_band_missing else efficiency_rule(age_band)
# Second pass: classify the rows the direct lookup left unresolved.
data["landlord_wall_description"] = data.progress_apply(fill_as_built, axis=1)
assert not data["landlord_wall_description"].isnull().any(), (
    "Some wall descriptions could not be resolved"
)


def _wall_efficiency_for_row(row):
    """Row-wise adapter around resolve_wall_efficiency."""
    return resolve_wall_efficiency(
        row.landlord_wall_description,
        row.construction_age_band,
    )


data["landlord_wall_efficiency"] = data.progress_apply(_wall_efficiency_for_row, axis=1)
# Sanity check: every row must have received an efficiency value.
assert not data["landlord_wall_efficiency"].isnull().any()
# ------------ Roof Construction ------------
# Lookup keyed by ("Roof Construction", "Roof Insulation") pairs.
# A value of None marks a pair that fill_roof_as_built classifies later.
# Pairs absent from this table come back as NaN from Series.map and are also
# treated as unresolved by fill_roof_as_built (it tests with pd.isnull), so a
# missing key silently becomes an "as built" classification - keep the
# thickness keys complete.
roof_mapping = {
    # Dwelling above
    ('AnotherDwellingAbove', 'Another Dwelling Above'): EpcRoofDescriptions.another_dwelling_above,
    # NOTE(review): same-dwelling-above also maps to another_dwelling_above - confirm intended.
    ('SameDwellingAbove', 'Same Dwelling Above'): EpcRoofDescriptions.another_dwelling_above,
    # Pitched, normal loft access, with a loft thickness
    ('PitchedNormalLoftAccess', 'mm25'): EpcRoofDescriptions.loft_25mm_insulation,
    ('PitchedNormalLoftAccess', 'mm50'): EpcRoofDescriptions.loft_50mm_insulation,
    ('PitchedNormalLoftAccess', 'mm75'): EpcRoofDescriptions.loft_75mm_insulation,
    ('PitchedNormalLoftAccess', 'mm100'): EpcRoofDescriptions.loft_100mm_insulation,
    ('PitchedNormalLoftAccess', 'mm150'): EpcRoofDescriptions.loft_150mm_insulation,
    ('PitchedNormalLoftAccess', 'mm200'): EpcRoofDescriptions.loft_200mm_insulation,
    ('PitchedNormalLoftAccess', 'mm250'): EpcRoofDescriptions.loft_250mm_insulation,
    ('PitchedNormalLoftAccess', 'mm270'): EpcRoofDescriptions.loft_270mm_insulation,
    ('PitchedNormalLoftAccess', 'mm300'): EpcRoofDescriptions.loft_300mm_insulation,
    ('PitchedNormalLoftAccess', 'mm350'): EpcRoofDescriptions.loft_350mm_insulation,
    ('PitchedNormalLoftAccess', 'mm400'): EpcRoofDescriptions.loft_400mm_plus_insulation,
    # Pitched, no loft access, with a loft thickness
    ('PitchedNormalNoLoftAccess', 'mm25'): EpcRoofDescriptions.loft_25mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm50'): EpcRoofDescriptions.loft_50mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm75'): EpcRoofDescriptions.loft_75mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm100'): EpcRoofDescriptions.loft_100mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm150'): EpcRoofDescriptions.loft_150mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm200'): EpcRoofDescriptions.loft_200mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm250'): EpcRoofDescriptions.loft_250mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm270'): EpcRoofDescriptions.loft_270mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm300'): EpcRoofDescriptions.loft_300mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm350'): EpcRoofDescriptions.loft_350mm_insulation,
    ('PitchedNormalNoLoftAccess', 'mm400'): EpcRoofDescriptions.loft_400mm_plus_insulation,
    # All pitched options with as-built or unknown insulation go to
    # EpcRoofDescriptions.pitched_insulated_assumed
    # With access
    ('PitchedNormalLoftAccess', nan): EpcRoofDescriptions.pitched_insulated_assumed,
    ('PitchedNormalLoftAccess', 'AsBuilt'): EpcRoofDescriptions.pitched_insulated_assumed,
    ('PitchedNormalLoftAccess', 'Unknown'): EpcRoofDescriptions.pitched_insulated_assumed,
    # No access
    ('PitchedNormalNoLoftAccess', nan): EpcRoofDescriptions.pitched_insulated_assumed,
    ('PitchedNormalNoLoftAccess', 'AsBuilt'): EpcRoofDescriptions.pitched_insulated_assumed,
    ('PitchedNormalNoLoftAccess', 'Unknown'): EpcRoofDescriptions.pitched_insulated_assumed,
    # Flat. Thickness bands for flat roofs:
    #   12mm = very poor & has limited insulation description
    #   25, 50 = poor & has limited insulation description
    #   75, 100, 125mm = average (Flat, insulated)
    #   150, 175, 200, 225, 250mm = good (Flat, insulated)
    #   270mm+ = very good (Flat, insulated)
    ('Flat', 'NoInsulation'): EpcRoofDescriptions.flat_no_insulation,
    # Flat - limited insulation
    # 'mm12' follows the 'mmNN' spelling used by every other thickness key;
    # the original '12mm' spelling is kept in case the extract contains it.
    ('Flat', 'mm12'): EpcRoofDescriptions.flat_limited_insulation,
    ('Flat', '12mm'): EpcRoofDescriptions.flat_limited_insulation,
    ('Flat', 'mm25'): EpcRoofDescriptions.flat_limited_insulation,
    ('Flat', 'mm50'): EpcRoofDescriptions.flat_limited_insulation,
    # Flat insulated
    ('Flat', 'mm75'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm100'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm150'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm200'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm250'): EpcRoofDescriptions.flat_insulated,
    # 'mm270' was missing although the pitched tables and the band notes include it.
    ('Flat', 'mm270'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm300'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm350'): EpcRoofDescriptions.flat_insulated,
    ('Flat', 'mm400'): EpcRoofDescriptions.flat_insulated,
    # Flat - as built or unknown
    ('Flat', 'AsBuilt'): None,  # To be classified
    ('Flat', nan): None,  # To be classified
    ('Flat', 'Unknown'): None,  # To be classified
    # Thatched
    ('PitchedThatched', 'mm50'): EpcRoofDescriptions.thatched_with_additional_insulation,
    ('PitchedThatched', 'mm150'): EpcRoofDescriptions.thatched_with_additional_insulation,
    ('PitchedThatched', 'mm300'): EpcRoofDescriptions.thatched_with_additional_insulation,
    ('PitchedThatched', 'Unknown'): EpcRoofDescriptions.thatched,  # efficiency classified based on age
    # Sloping:
    # Limited (12 very poor, 25-50 poor)
    ('PitchedWithSlopingCeiling', 'mm12'): EpcRoofDescriptions.sloping_pitched_limited_insulation,
    ('PitchedWithSlopingCeiling', 'mm25'): EpcRoofDescriptions.sloping_pitched_limited_insulation,
    ('PitchedWithSlopingCeiling', 'mm50'): EpcRoofDescriptions.sloping_pitched_limited_insulation,
    # Insulated 75mm+ (75 - 125 average, 150 - 250 good, 270+ very good)
    ('PitchedWithSlopingCeiling', 'mm75'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm100'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm150'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm200'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm250'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm270'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm300'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm350'): EpcRoofDescriptions.sloping_pitched_insulated,
    ('PitchedWithSlopingCeiling', 'mm400'): EpcRoofDescriptions.sloping_pitched_insulated,
    # As built/unknown
    ('PitchedWithSlopingCeiling', 'AsBuilt'): None,  # To be classified
    ('PitchedWithSlopingCeiling', nan): None,  # To be classified
    ('PitchedWithSlopingCeiling', 'Unknown'): None,  # To be classified
}
# Description used by fill_roof_as_built when a row has no construction age
# band, keyed by the raw "Roof Construction" value.
ROOF_UNKNOWN_AGE_FALLBACK = {
    # Both normal-pitched variants share the loft holding description.
    "PitchedNormalLoftAccess": EpcRoofDescriptions.loft_as_built_unknown,
    "PitchedNormalNoLoftAccess": EpcRoofDescriptions.loft_as_built_unknown,
    "PitchedWithSlopingCeiling": EpcRoofDescriptions.sloping_pitched_as_built_unknown,
    "PitchedThatched": EpcRoofDescriptions.thatched_as_built_unknown,
    "Flat": EpcRoofDescriptions.flat_as_built_unknown,
}
def fill_roof_as_built(row):
    """Resolve the roof description for one row after the direct lookup.

    A non-null ``landlord_roof_description`` is returned unchanged. Otherwise
    the roof type must have an entry in ``AS_BUILT_ROOF_CLASSIFIERS``; rows
    without an age band then take the ``ROOF_UNKNOWN_AGE_FALLBACK`` value and
    the rest are classified from the age band.

    Raises:
        NotImplementedError: If the roof type has no classifier, or the
            classifier returns None.
    """
    existing = row.landlord_roof_description
    if pd.notnull(existing):
        return existing

    roof_type = row["Roof Construction"]

    # The classifier lookup comes first, so a roof type without a classifier
    # raises even when the age band is missing and a fallback entry exists.
    classifier = AS_BUILT_ROOF_CLASSIFIERS.get(roof_type)
    if classifier is None:
        raise NotImplementedError(f"No roof classifier for roof type '{roof_type}'")

    if pd.isnull(row.construction_age_band):
        return ROOF_UNKNOWN_AGE_FALLBACK.get(roof_type)

    classified = classifier(row.construction_age_band)
    if classified is not None:
        return classified
    raise NotImplementedError(
        f"Roof classification returned None for roof type '{roof_type}'"
    )
# First pass: direct lookup of each (construction, insulation) pair.
_roof_lookup_keys = data[["Roof Construction", "Roof Insulation"]].progress_apply(tuple, axis=1)
data["landlord_roof_description"] = _roof_lookup_keys.map(roof_mapping)

# Second pass: classify whatever the lookup left unresolved.
data["landlord_roof_description"] = data.progress_apply(fill_roof_as_built, axis=1)

# Sanity check: no row may be left without a roof description.
assert not data["landlord_roof_description"].isnull().any(), (
    "Some roof descriptions could not be resolved"
)
def extract_insulation_thickness(value: str | None) -> int | None:
"""
Extract insulation thickness in mm from a string like 'mm150'.
Returns None if not present or not parseable.
"""
if value is None or pd.isnull(value):
return None
match = re.search(r"(\d+)", str(value))
if not match:
return None
return int(match.group(1))
# Thickness parsed from the raw insulation label; labels without digits
# (and missing labels) yield no value.
data["roof_insulation_thickness_mm"] = data["Roof Insulation"].apply(extract_insulation_thickness)


def _roof_efficiency_for_row(row):
    """Row-wise adapter around resolve_roof_efficiency."""
    return resolve_roof_efficiency(
        description=row.landlord_roof_description,
        age_band=row.construction_age_band,
        insulation_thickness=row.roof_insulation_thickness_mm,
    )


data["landlord_roof_efficiency"] = data.progress_apply(_roof_efficiency_for_row, axis=1)
assert not data["landlord_roof_efficiency"].isnull().any()


def _is_sloping_ceiling(roof_type):
    """True when the raw roof construction is the sloping-ceiling variant."""
    return roof_type == "PitchedWithSlopingCeiling"


# Flag sloping ceiling
data["has_sloping_ceiling"] = data["Roof Construction"].apply(_is_sloping_ceiling)
# ------------ Floor Construction ------------
# Lookup keyed by ("Floor Construction", "Floor Insulation") pairs.
# Only retrofitted solid / suspended floors map directly; every None entry is
# resolved afterwards by fill_floor_as_built.
floor_mapping = {
    # Solid floor
    ("Solid", "RetroFitted"): EpcFloorDescriptions.solid_insulated,
    ("Solid", "AsBuilt"): None,
    ("Solid", "Unknown"): None,
    ("Solid", nan): None,
    # Suspended timber floor
    ("SuspendedTimber", "RetroFitted"): EpcFloorDescriptions.suspended_insulated,
    ("SuspendedTimber", "AsBuilt"): None,
    ("SuspendedTimber", "Unknown"): None,
    ("SuspendedTimber", nan): None,
    # Suspended, not timber
    ("SuspendedNotTimber", "RetroFitted"): EpcFloorDescriptions.suspended_insulated,
    ("SuspendedNotTimber", "AsBuilt"): None,
    ("SuspendedNotTimber", "Unknown"): None,
    ("SuspendedNotTimber", nan): None,
    # Unknown floor type - resolved from the age band
    ("Unknown", "RetroFitted"): None,
    ("Unknown", "AsBuilt"): None,
    ("Unknown", "Unknown"): None,
    # No actual information at all
    (nan, nan): None,
}

# First pass: direct lookup of each (construction, insulation) pair.
_floor_lookup_keys = data[["Floor Construction", "Floor Insulation"]].progress_apply(tuple, axis=1)
data["landlord_floor_description"] = _floor_lookup_keys.map(floor_mapping)
def fill_floor_as_built(row):
    """Resolve the floor description for one row after the direct lookup.

    Rows whose ``landlord_floor_description`` is anything other than ``None``
    are returned unchanged. Rows without an age band, or with a floor type
    not handled below, get ``EpcFloorDescriptions.unknown``. Otherwise the
    matching as-built / retrofitted classifier is called with the age band.
    """
    resolved = row.landlord_floor_description
    # NOTE(review): only None counts as unresolved here. A NaN (e.g. from a
    # pair absent from the lookup) is passed through as-is, whereas
    # fill_roof_as_built uses pd.isnull - confirm this difference is intended.
    if resolved is not None:
        return resolved

    age_band = row.construction_age_band
    floor_type = row["Floor Construction"]
    insulation = row["Floor Insulation"]

    # Without an age band nothing can be classified.
    if pd.isnull(age_band):
        return EpcFloorDescriptions.unknown

    if floor_type == "Solid":
        return solid_floor_as_built(age_band)
    if floor_type in {"SuspendedTimber", "SuspendedNotTimber"}:
        return suspended_floor_as_built(age_band)

    # Anything other than an explicit "Unknown" type is missing / garbage input.
    if floor_type != "Unknown":
        return EpcFloorDescriptions.unknown

    if insulation == "RetroFitted":
        classify = unknown_floor_retrofitted
    else:
        classify = unknown_floor_as_built
    return classify(age_band)
# Second pass: classify whatever the lookup left unresolved.
data["landlord_floor_description"] = data.progress_apply(fill_floor_as_built, axis=1)

# Every row should carry a floor description by now.
assert not data["landlord_floor_description"].isnull().any(), (
    "Some floor descriptions could not be resolved"
)

# TODO: Convert everything to values
# Variables we want to map:
#   'Org Ref', 'Address 1', 'Address 2', 'Address 3', 'Postcode',
#   'Floor Construction', 'Floor Insulation', 'Glazing', 'Heating',
#   'Boiler Efficiency', 'Main Fuel', 'Controls Adequacy', 'UPRN',
#   'Total Floor Area (m2)'