From 1e9aa98ffc127e9bbef798ec0164e9d2568f04b9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 28 Nov 2025 17:02:32 +0000 Subject: [PATCH 1/2] debugging fuel types and a zero floor area property --- backend/Property.py | 10 +++++++++- etl/epc/Record.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/backend/Property.py b/backend/Property.py index 6148b40a..cbc762e6 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1212,9 +1212,17 @@ class Property: else: self.heating_energy_source = ['Wood Logs'] - if len(self.heating_energy_source) == 0 or len(self.heating_energy_source) > 1: + if len(self.heating_energy_source) == 0 or ( + len(self.heating_energy_source) > 1 and "Varied (Community Scheme)" not in self.heating_energy_source + ): + # We might have something like heating energy source equal to ['Natural Gas', 'Varied (Community Scheme)'] + # so we treat this as community heating raise Exception("Investigate me") + if len(self.heating_energy_source) > 1: + # We treat this as a community scheme + self.heating_energy_source = ["Varied (Community Scheme)"] + self.heating_energy_source = self.heating_energy_source[0] if self.heating_energy_source == "Varied (Community Scheme)": diff --git a/etl/epc/Record.py b/etl/epc/Record.py index c1c3ff67..e1853361 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -596,6 +596,9 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") + if self.prepared_epc["total-floor-area"] is None: + return + self.prepared_epc["total-floor-area"] = float( self.prepared_epc["total-floor-area"] ) From 1f267e7d47322dfae0d14d6e94e9e68911454c37 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 28 Nov 2025 18:15:56 +0000 Subject: [PATCH 2/2] debugging for peabody - edge case properties and added placeholder averages cleaning --- backend/Property.py | 40 ++++++++++++++++++------ backend/engine/engine.py | 67 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 93 insertions(+), 14 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index cbc762e6..ae79f250 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1192,6 +1192,14 @@ class Property: 'oil range cooker': 'Oil' } + fuel_map = { + None: "Natural Gas (Community Scheme)", + "mains gas": "Natural Gas (Community Scheme)", + "biomass": "Smokeless Fuel", + "electricity": "Electricity", + "biogas": "Smokeless Fuel", + } + self.heating_energy_source = list({ fuel for key, fuel in heating_fuel_mapping.items() if self.main_heating.get(key, False) }) @@ -1212,13 +1220,31 @@ class Property: else: self.heating_energy_source = ['Wood Logs'] - if len(self.heating_energy_source) == 0 or ( - len(self.heating_energy_source) > 1 and "Varied (Community Scheme)" not in self.heating_energy_source - ): + if len(self.heating_energy_source) > 1 and "Varied (Community Scheme)" not in self.heating_energy_source: # We might have something like heating energy source equal to ['Natural Gas', 'Varied (Community Scheme)'] # so we treat this as community heating raise Exception("Investigate me") + if len(self.heating_energy_source) == 0: + heating_flags = { + v for k, v in self.main_heating.items() if k not in ["original_description", "clean_description"] + } + hotwater_flags = { + v for k, v in self.hotwater.items() if k not in ["original_description", "clean_description"] + } + + # If all flags are zero, we have a no data example + if (heating_flags == {False} or hotwater_flags == {None}) and ( + hotwater_flags == {False} or hotwater_flags == {None}): + # We have nodata so we try and rely on main fuel + if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown + mapped_fuel = fuel_map[self.main_fuel["fuel_type"]] + self.heating_energy_source = mapped_fuel + self.hot_water_energy_source = mapped_fuel + return + else: + raise NotImplementedError(f"Unhandled fuel {self.main_fuel['fuel_type']}") + if len(self.heating_energy_source) > 1: # We treat this as a community scheme self.heating_energy_source = ["Varied (Community Scheme)"] @@ -1226,13 +1252,7 @@ class Property: self.heating_energy_source = self.heating_energy_source[0] if self.heating_energy_source == "Varied (Community Scheme)": - fuel_map = { - None: "Natural Gas (Community Scheme)", - "mains gas": "Natural Gas (Community Scheme)", - "biomass": "Smokeless Fuel", - "electricity": "Electricity", - "biogas": "Smokeless Fuel", - } + if self.main_fuel["fuel_type"] in fuel_map: # We assume when None as it's unknown self.heating_energy_source = fuel_map[self.main_fuel["fuel_type"]] else: diff --git a/backend/engine/engine.py b/backend/engine/engine.py index e9cb86ea..217be3c3 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -405,6 +405,65 @@ def check_duplicate_uprns(plan_input): return True +def averages_cleaning(prepared_epc: EPCRecord, cleaning_data: pd.DataFrame): + """ + Placeholder cleaning function to handle edge cases where we have missing data for + number of habitable rooms, number of heated rooms and floor height. We take the median + This need was born out of the Peabody project + :param prepared_epc: + :param cleaning_data: + :return: + """ + + if not pd.isnull(prepared_epc.prepared_epc["number_habitable_rooms"]) and not pd.isnull( + prepared_epc.prepared_epc["number_heated_rooms"]) and not pd.isnull(prepared_epc.prepared_epc["floor_height"]): + # Nothing to do + return prepared_epc + + # Clean with cleaning_data + clean_with = cleaning_data[ + (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"]) & + (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"]) + ] + if prepared_epc.prepared_epc["local_authority"] in clean_with["local_authority"].values: + clean_with = clean_with[ + clean_with["local_authority"] == prepared_epc.prepared_epc["local_authority"] + ] + + floor_area_clean_with = clean_with[ + (clean_with["total_floor_area"] <= prepared_epc.prepared_epc["total_floor_area"] * 1.1) & + (clean_with["total_floor_area"] >= prepared_epc.prepared_epc["total_floor_area"] * 0.9) + ] + + if not floor_area_clean_with.empty: + clean_with = floor_area_clean_with + + clean_n_habitable_rooms = int(round(clean_with["number_habitable_rooms"].median())) + clean_n_heated_rooms = int(round(clean_with["number_heated_rooms"].median())) + if clean_n_heated_rooms > clean_n_habitable_rooms: + clean_n_heated_rooms = clean_n_habitable_rooms + + clean_floor_height = clean_with["floor_height"].median() + + # We now fill + if not pd.isnull(clean_n_habitable_rooms) and pd.isnull( + prepared_epc.prepared_epc["number_habitable_rooms"]): + prepared_epc.prepared_epc["number_habitable_rooms"] = clean_n_habitable_rooms + prepared_epc.number_habitable_rooms = clean_n_habitable_rooms + + if not pd.isnull(clean_n_heated_rooms) and pd.isnull( + prepared_epc.prepared_epc["number_heated_rooms"]): + prepared_epc.prepared_epc["number_heated_rooms"] = clean_n_heated_rooms + prepared_epc.number_heated_rooms = clean_n_heated_rooms + + if not pd.isnull(clean_floor_height) and pd.isnull( + prepared_epc.prepared_epc["floor_height"]): + prepared_epc.prepared_epc["floor_height"] = clean_floor_height + prepared_epc.floor_height = clean_floor_height + + return prepared_epc + + async def model_engine(body: PlanTriggerRequest): logger.info("Model Engine triggered with body: %s", json.loads(body.model_dump_json())) @@ -669,6 +728,10 @@ async def model_engine(body: PlanTriggerRequest): cleaning_data=cleaning_data, ) + # TODO: This is a temp function to handle a specific edge case with Peabody. We should + # factor this into EPCRecord as part of the cleaning however we need some more testing + prepared_epc = averages_cleaning(prepared_epc, cleaning_data) + # If we have an ECO project, we parse the cavity/solar reasons eco_packages[property_id] = parse_eco_packages(config, prepared_epc) @@ -756,10 +819,6 @@ async def model_engine(body: PlanTriggerRequest): input_properties = OpenUprnClient.set_spatial_data(input_properties, bucket_name=get_settings().DATA_BUCKET) [p.set_features(cleaned=cleaned, kwh_client=kwh_client, kwh_predictions=kwh_preds) for p in input_properties] - # TODO: If a property is semi-detached, we might get roof surfaces for the main building + the neighbour - # TODO: If we can't get high image quality, should we use the solar API? Maybe just for semi-detached units with - # extensions, since it doesn't seem to do a great job - logger.info("Performing solar analysis") ofgem_consumption_averages = read_dataframe_from_s3_parquet(