From 3bf641f49d2f28453c7f517538d11004e7c4bbaf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 11 Mar 2026 20:52:20 +0000 Subject: [PATCH] created _apply_averages_cleaning --- backend/Property.py | 24 +++---- backend/engine/engine.py | 131 --------------------------------------- etl/epc/Record.py | 63 +++++++++++++++++++ 3 files changed, 75 insertions(+), 143 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index b2be7210..1b73429a 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -487,7 +487,7 @@ class Property: # CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes # per year, we multiply by 1000 to get kg/m² "co2-emiss-curr-per-floor-area": round( - 1000 * (rec_impact["carbon"] / self.data["total-floor-area"]) + 1000 * (rec_impact["carbon"] / self.epc_record.get("total_floor_area")) ), "co2-emissions-current": rec_impact["carbon"], "current-energy-rating": sap_to_epc(rec_impact["sap"]), @@ -594,21 +594,21 @@ class Property: if not cleaned: raise ValueError("Cleaner does not contain cleaned data") - if not self.data: + if not self.epc_record: raise ValueError("Property does not contain data") for description, attribute in cleaned.items(): cleaner_cls = all_cleaner_map[description] - if self.data[description] in self.DATA_ANOMALY_MATCHES: + if self.epc_record.get(description) in self.DATA_ANOMALY_MATCHES: if description == "lighting-description": cleaner_cls = cleaner_cls("", averages=None) else: cleaner_cls = cleaner_cls("") fill_dict = { - "original_description": self.data[description], - "clean_description": self.data[description], + "original_description": self.epc_record.get(description), + "clean_description": self.epc_record.get(description), **cleaner_cls.process() } setattr(self, self.ATTRIBUTE_MAP[description], fill_dict) @@ -617,7 +617,7 @@ class Property: attributes = [ x for x in cleaned[description] - if x["original_description"] == self.data[description] + if 
x["original_description"] == self.epc_record.get(description) ] if len(attributes) > 1: @@ -628,11 +628,11 @@ class Property: if len(attributes) == 0: # We attempt to perform the clean on the fly if description == "lighting-description": - cleaner_cls = cleaner_cls(self.data[description], averages=None) + cleaner_cls = cleaner_cls(self.epc_record.get(description), averages=None) else: - cleaner_cls = cleaner_cls(self.data[description]) + cleaner_cls = cleaner_cls(self.epc_record.get(description)) processed = { - "original_description": self.data[description], + "original_description": self.epc_record.get(description), "clean_description": cleaner_cls.description.replace( "(assumed)", "" ) @@ -672,7 +672,7 @@ class Property: # Today's costs todays_lighting_cost = kwh_client.convert_cost_to_today( original_cost=float(self.data["lighting-cost-current"]), - lodgement_date=pd.Timestamp(self.epc_record.prepared_epc["lodgement_date"]).tz_localize(None) + lodgement_date=pd.Timestamp(self.epc_record.get("lodgement_date")).tz_localize(None) ) # If we have the kwh figures, we don't need to predict them @@ -1299,7 +1299,7 @@ class Property: valid for 10 years. 
:return: boolean indicating whether the EPC is expired """ - lodgement_date = self.data["lodgement-date"] + lodgement_date = self.epc_record.get("lodgement-date") return (datetime.now() - pd.to_datetime(lodgement_date)) > timedelta(days=3650) @property @@ -1308,4 +1308,4 @@ class Property: This property indicates that the EPC is estimated, based on the presence of the "estimated" flag in the data :return: boolean indicating whether the EPC is estimated """ - return self.data.get("estimated", False) + return self.epc_record.get("estimated") diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 45a3f5e6..339a4236 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -413,94 +413,6 @@ def check_duplicate_property_ids(input_properties): return True -def averages_cleaning(prepared_epc: EPCRecord, cleaning_data: pd.DataFrame): - """ - Placeholder cleaning function to handle edge cases where we have missing data for - number of habitable rooms, number of heated rooms and floor height. 
We take the median - This need was born out of the Peabody project - :param prepared_epc: - :param cleaning_data: - :return: - """ - - variables_to_clean = [ - "number_habitable_rooms", - "number_heated_rooms", - "floor_height", - ] - - if not any([pd.isnull(prepared_epc.prepared_epc[k]) for k in variables_to_clean]): - # Nothing to do - return prepared_epc - - # Clean with cleaning_data - clean_with = cleaning_data[ - (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"]) & - (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"]) - ] - if prepared_epc.prepared_epc["local_authority"] in clean_with["local_authority"].values: - clean_with = clean_with[ - clean_with["local_authority"] == prepared_epc.prepared_epc["local_authority"] - ] - - floor_area_clean_with = clean_with[ - (clean_with["total_floor_area"] <= prepared_epc.prepared_epc["total_floor_area"] * 1.1) & - (clean_with["total_floor_area"] >= prepared_epc.prepared_epc["total_floor_area"] * 0.9) - ] - - if not floor_area_clean_with.empty: - clean_with = floor_area_clean_with - - clean_n_habitable_rooms = int(round(clean_with["number_habitable_rooms"].median())) - clean_n_heated_rooms = int(round(clean_with["number_heated_rooms"].median())) - if clean_n_heated_rooms > clean_n_habitable_rooms: - clean_n_heated_rooms = clean_n_habitable_rooms - - clean_floor_height = clean_with["floor_height"].median() - - # We now fill - if not pd.isnull(clean_n_habitable_rooms) and pd.isnull( - prepared_epc.prepared_epc["number_habitable_rooms"]): - prepared_epc.prepared_epc["number_habitable_rooms"] = clean_n_habitable_rooms - prepared_epc.number_habitable_rooms = clean_n_habitable_rooms - - if not pd.isnull(clean_n_heated_rooms) and pd.isnull( - prepared_epc.prepared_epc["number_heated_rooms"]): - prepared_epc.prepared_epc["number_heated_rooms"] = clean_n_heated_rooms - prepared_epc.number_heated_rooms = clean_n_heated_rooms - - if not pd.isnull(clean_floor_height) and 
pd.isnull( - prepared_epc.prepared_epc["floor_height"]): - prepared_epc.prepared_epc["floor_height"] = clean_floor_height - prepared_epc.floor_height = clean_floor_height - - # if pd.isnull(prepared_epc.lighting_cost_current): - # # This is a basic assumption as an average - # prepared_epc.prepared_epc["lighting_cost_current"] = assumptions.AVERAGE_LIGHTING_COST - # prepared_epc.lighting_cost_current = assumptions.AVERAGE_LIGHTING_COST - - # if pd.isnull(prepared_epc.heating_cost_current): - # # This is a basic assumption as an average - # appliance_cost = AnnualBillSavings.estimate_appliances_energy_use( - # total_floor_area=prepared_epc.total_floor_area - # ) * AnnualBillSavings.ELECTRICITY_PRICE_CAP - # heating_cleaned_value = assumptions.AVERAGE_HEATING_AND_APPLIANCE_COST - appliance_cost - # prepared_epc.prepared_epc["heating_cost_current"] = heating_cleaned_value - # prepared_epc.heating_cost_current = heating_cleaned_value - # - # if pd.isnull(prepared_epc.hot_water_cost_current): - # # This is a basic assumption as an average - # prepared_epc.prepared_epc["hot_water_cost_current"] = assumptions.AVERAGE_HOT_WATER_COST - # prepared_epc.hot_water_cost_current = assumptions.AVERAGE_HOT_WATER_COST - # - # if pd.isnull(prepared_epc.energy_consumption_potential): - # # Set to current - # prepared_epc.prepared_epc["energy_consumption_potential"] = prepared_epc.energy_consumption_current - # prepared_epc.energy_consumption_potential = prepared_epc.energy_consumption_current - - return prepared_epc - - def extract_address_data(config, body): """ Simple helper to grab address data from the config @@ -828,10 +740,6 @@ async def model_engine(body: PlanTriggerRequest): epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data, address_metadata=addr ) - # TODO: This is a temp function to handle a specific edge case with Peabody. 
We should - # factor this into EPCRecord as part of the cleaning however we need some more testing - prepared_epc = averages_cleaning(prepared_epc, cleaning_data) - input_properties.append( Property( id=property_id, @@ -906,45 +814,6 @@ async def model_engine(body: PlanTriggerRequest): # 2) Missing EPC # 3) Materially different information from landlord vs EPC # make the landlord remapping dictionary - addr = next((a for a in addresses if a.uprn == p.uprn), None) - if addr is None: - raise ValueError("Could not find address for property with UPRN: %s", p.uprn) - - landlord_remapping = { - "total_floor_area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap - "property_type": addr.landlord_property_type, - "built_form": addr.landlord_built_form, - - # Components - "walls_description": addr.landlord_wall_construction, - "roof_description": addr.landlord_roof_construction, - "floor_description": addr.landlord_floor_construction, - "windows_description": addr.landlord_windows_type, - "main_fuel": addr.landlord_fuel_type, - "mainheat_description": addr.landlord_heating_system, - "mainheatcont_description": addr.landlord_heating_controls, - "hotwater_description": addr.landlord_hot_water_system, - - # Efficiency - "walls_energy_eff": addr.landlord_wall_efficiency, - "roof_energy_eff": addr.landlord_roof_efficiency, - "windows_energy_eff": addr.landlord_windows_efficiency, - "mainheat_energy_eff": addr.landlord_heating_efficiency, - "mainheatc_energy_eff": addr.landlord_heating_controls_efficiency, - "hot_water_energy_eff": addr.landlord_hot_water_efficiency, - - "multi_glaze_proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this! 
def _apply_averages_cleaning(self) -> None:
    """
    Fill missing room counts and floor height with medians from cleaning_data.

    The comparison set is narrowed in up to three steps: rows with the same
    property type, then (only if any exist) rows for the same local authority,
    then (only if any exist) rows whose total floor area is within +/-10% of
    this record's. Medians of that set fill "number-habitable-rooms",
    "number-heated-rooms" and "floor-height" where the record has no value.

    Values already present on the record are never overwritten, and a field is
    left untouched when the comparison set yields no median for it (no
    matching rows, or only null values in that column).

    :raises ValueError: if the prepared EPC or the cleaning data is missing
    :raises KeyError: if the prepared EPC has no "property-type" or
        "local-authority" entry
    :return: None, the prepared EPC is updated in place
    """

    if self._prepared_epc is None:
        raise ValueError("Prepared EPC missing")

    if self.cleaning_data is None:
        raise ValueError("Cleaning data required for averages cleaning")

    record = self._prepared_epc

    # Record key -> cleaning_data column.
    # NOTE(review): the record is read with hyphenated keys while the
    # cleaning_data columns use underscores - confirm _prepared_epc is keyed
    # this way at the point this method runs.
    column_for = {
        "number-habitable-rooms": "number_habitable_rooms",
        "number-heated-rooms": "number_heated_rooms",
        "floor-height": "floor_height",
    }

    missing = [key for key in column_for if pd.isnull(record.get(key))]
    if not missing:
        return

    cleaning_data: pd.DataFrame = self.cleaning_data

    clean_with = cleaning_data[
        cleaning_data["property_type"] == record["property-type"]
    ]

    # Only narrow by local authority when that leaves something to average
    local_authority = record["local-authority"]
    if local_authority in clean_with["local_authority"].values:
        clean_with = clean_with[clean_with["local_authority"] == local_authority]

    # A missing or non-numeric floor area simply skips the +/-10% band
    try:
        floor_area = float(record.get("total-floor-area"))
    except (TypeError, ValueError):
        floor_area = float("nan")

    if not pd.isnull(floor_area):
        areas = clean_with["total_floor_area"].astype(float)
        subset = clean_with[
            (areas <= floor_area * 1.1) & (areas >= floor_area * 0.9)
        ]
        if not subset.empty:
            clean_with = subset

    # A median is NaN when there are no rows or the column is entirely null.
    # int(round(NaN)) raises ValueError, so such fields are skipped rather
    # than converted.
    medians = {}
    for key, column in column_for.items():
        median = clean_with[column].median()
        if pd.isnull(median):
            continue
        medians[key] = float(median) if key == "floor-height" else int(round(median))

    # heated rooms should never exceed habitable: cap against the habitable
    # count the record will end up with (its own value if present, otherwise
    # the median about to be filled in)
    habitable = record.get("number-habitable-rooms")
    if pd.isnull(habitable):
        habitable = medians.get("number-habitable-rooms")
    try:
        habitable = int(float(habitable))
    except (TypeError, ValueError):
        habitable = None

    heated = medians.get("number-heated-rooms")
    if heated is not None and habitable is not None and heated > habitable:
        medians["number-heated-rooms"] = habitable

    for key in missing:
        if key in medians:
            record[key] = medians[key]