From 3bf641f49d2f28453c7f517538d11004e7c4bbaf Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 11 Mar 2026 20:52:20 +0000
Subject: [PATCH] created _apply_averages_cleaning

---
 backend/Property.py      |  24 +++----
 backend/engine/engine.py | 131 ---------------------------------------
 etl/epc/Record.py        |  63 +++++++++++++++++++
 3 files changed, 75 insertions(+), 143 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index b2be7210..1b73429a 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -487,7 +487,7 @@ class Property:
                     # CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes
                     # per year, we multiply by 1000 to get kg/m²
                     "co2-emiss-curr-per-floor-area": round(
-                        1000 * (rec_impact["carbon"] / self.data["total-floor-area"])
+                        1000 * (rec_impact["carbon"] / self.epc_record.get("total_floor_area"))
                     ),
                     "co2-emissions-current": rec_impact["carbon"],
                     "current-energy-rating": sap_to_epc(rec_impact["sap"]),
@@ -594,21 +594,21 @@ class Property:
         if not cleaned:
             raise ValueError("Cleaner does not contain cleaned data")
 
-        if not self.data:
+        if not self.epc_record:
             raise ValueError("Property does not contain data")
 
         for description, attribute in cleaned.items():
 
             cleaner_cls = all_cleaner_map[description]
 
-            if self.data[description] in self.DATA_ANOMALY_MATCHES:
+            if self.epc_record.get(description) in self.DATA_ANOMALY_MATCHES:
                 if description == "lighting-description":
                     cleaner_cls = cleaner_cls("", averages=None)
                 else:
                     cleaner_cls = cleaner_cls("")
                 fill_dict = {
-                    "original_description": self.data[description],
-                    "clean_description": self.data[description],
+                    "original_description": self.epc_record.get(description),
+                    "clean_description": self.epc_record.get(description),
                     **cleaner_cls.process()
                 }
                 setattr(self, self.ATTRIBUTE_MAP[description], fill_dict)
@@ -617,7 +617,7 @@ class Property:
             attributes = [
                 x
                 for x in cleaned[description]
-                if x["original_description"] == self.data[description]
+                if x["original_description"] == self.epc_record.get(description)
             ]
 
             if len(attributes) > 1:
@@ -628,11 +628,11 @@ class Property:
             if len(attributes) == 0:
                 # We attempt to perform the clean on the fly
                 if description == "lighting-description":
-                    cleaner_cls = cleaner_cls(self.data[description], averages=None)
+                    cleaner_cls = cleaner_cls(self.epc_record.get(description), averages=None)
                 else:
-                    cleaner_cls = cleaner_cls(self.data[description])
+                    cleaner_cls = cleaner_cls(self.epc_record.get(description))
                 processed = {
-                    "original_description": self.data[description],
+                    "original_description": self.epc_record.get(description),
                     "clean_description": cleaner_cls.description.replace(
                         "(assumed)", ""
                     )
@@ -672,7 +672,7 @@ class Property:
         # Today's costs
         todays_lighting_cost = kwh_client.convert_cost_to_today(
             original_cost=float(self.data["lighting-cost-current"]),
-            lodgement_date=pd.Timestamp(self.epc_record.prepared_epc["lodgement_date"]).tz_localize(None)
+            lodgement_date=pd.Timestamp(self.epc_record.get("lodgement_date")).tz_localize(None)
         )
 
         # If we have the kwh figures, we don't need to predict them
@@ -1299,7 +1299,7 @@ class Property:
         valid for 10 years.
         :return: boolean indicating whether the EPC is expired
         """
-        lodgement_date = self.data["lodgement-date"]
+        lodgement_date = self.epc_record.get("lodgement-date")
         return (datetime.now() - pd.to_datetime(lodgement_date)) > timedelta(days=3650)
 
     @property
@@ -1308,4 +1308,4 @@ class Property:
         This property indicates that the EPC is estimated, based on the presence of the "estimated" flag in the data
         :return: boolean indicating whether the EPC is estimated
         """
-        return self.data.get("estimated", False)
+        return bool(self.epc_record.get("estimated"))  # None -> False, matching the old .get(..., False)
diff --git a/backend/engine/engine.py b/backend/engine/engine.py
index 45a3f5e6..339a4236 100644
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@@ -413,94 +413,6 @@ def check_duplicate_property_ids(input_properties):
     return True
 
 
-def averages_cleaning(prepared_epc: EPCRecord, cleaning_data: pd.DataFrame):
-    """
-    Placeholder cleaning function to handle edge cases where we have missing data for
-    number of habitable rooms, number of heated rooms and floor height. We take the median
-    This need was born out of the Peabody project
-    :param prepared_epc:
-    :param cleaning_data:
-    :return:
-    """
-
-    variables_to_clean = [
-        "number_habitable_rooms",
-        "number_heated_rooms",
-        "floor_height",
-    ]
-
-    if not any([pd.isnull(prepared_epc.prepared_epc[k]) for k in variables_to_clean]):
-        # Nothing to do
-        return prepared_epc
-
-    # Clean with cleaning_data
-    clean_with = cleaning_data[
-        (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"]) &
-        (cleaning_data["property_type"] == prepared_epc.prepared_epc["property_type"])
-        ]
-    if prepared_epc.prepared_epc["local_authority"] in clean_with["local_authority"].values:
-        clean_with = clean_with[
-            clean_with["local_authority"] == prepared_epc.prepared_epc["local_authority"]
-            ]
-
-    floor_area_clean_with = clean_with[
-        (clean_with["total_floor_area"] <= prepared_epc.prepared_epc["total_floor_area"] * 1.1) &
-        (clean_with["total_floor_area"] >= prepared_epc.prepared_epc["total_floor_area"] * 0.9)
-        ]
-
-    if not floor_area_clean_with.empty:
-        clean_with = floor_area_clean_with
-
-    clean_n_habitable_rooms = int(round(clean_with["number_habitable_rooms"].median()))
-    clean_n_heated_rooms = int(round(clean_with["number_heated_rooms"].median()))
-    if clean_n_heated_rooms > clean_n_habitable_rooms:
-        clean_n_heated_rooms = clean_n_habitable_rooms
-
-    clean_floor_height = clean_with["floor_height"].median()
-
-    # We now fill
-    if not pd.isnull(clean_n_habitable_rooms) and pd.isnull(
-        prepared_epc.prepared_epc["number_habitable_rooms"]):
-        prepared_epc.prepared_epc["number_habitable_rooms"] = clean_n_habitable_rooms
-        prepared_epc.number_habitable_rooms = clean_n_habitable_rooms
-
-    if not pd.isnull(clean_n_heated_rooms) and pd.isnull(
-        prepared_epc.prepared_epc["number_heated_rooms"]):
-        prepared_epc.prepared_epc["number_heated_rooms"] = clean_n_heated_rooms
-        prepared_epc.number_heated_rooms = clean_n_heated_rooms
-
-    if not pd.isnull(clean_floor_height) and pd.isnull(
-        prepared_epc.prepared_epc["floor_height"]):
-        prepared_epc.prepared_epc["floor_height"] = clean_floor_height
-        prepared_epc.floor_height = clean_floor_height
-
-    # if pd.isnull(prepared_epc.lighting_cost_current):
-    #     # This is a basic assumption as an average
-    #     prepared_epc.prepared_epc["lighting_cost_current"] = assumptions.AVERAGE_LIGHTING_COST
-    #     prepared_epc.lighting_cost_current = assumptions.AVERAGE_LIGHTING_COST
-
-    # if pd.isnull(prepared_epc.heating_cost_current):
-    #     # This is a basic assumption as an average
-    #     appliance_cost = AnnualBillSavings.estimate_appliances_energy_use(
-    #         total_floor_area=prepared_epc.total_floor_area
-    #     ) * AnnualBillSavings.ELECTRICITY_PRICE_CAP
-    #     heating_cleaned_value = assumptions.AVERAGE_HEATING_AND_APPLIANCE_COST - appliance_cost
-    #     prepared_epc.prepared_epc["heating_cost_current"] = heating_cleaned_value
-    #     prepared_epc.heating_cost_current = heating_cleaned_value
-    #
-    # if pd.isnull(prepared_epc.hot_water_cost_current):
-    #     # This is a basic assumption as an average
-    #     prepared_epc.prepared_epc["hot_water_cost_current"] = assumptions.AVERAGE_HOT_WATER_COST
-    #     prepared_epc.hot_water_cost_current = assumptions.AVERAGE_HOT_WATER_COST
-    #
-    # if pd.isnull(prepared_epc.energy_consumption_potential):
-    #     # Set to current
-    #     prepared_epc.prepared_epc["energy_consumption_potential"] = prepared_epc.energy_consumption_current
-    #     prepared_epc.energy_consumption_potential = prepared_epc.energy_consumption_current
-
-    return prepared_epc
-
-
 def extract_address_data(config, body):
     """
     Simple helper to grab address data from the config
@@ -828,10 +740,6 @@ async def model_engine(body: PlanTriggerRequest):
                 epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data, address_metadata=addr
             )
 
-            # TODO: This is a temp function to handle a specific edge case with Peabody. We should
-            #       factor this into EPCRecord as part of the cleaning however we need some more testing
-            prepared_epc = averages_cleaning(prepared_epc, cleaning_data)
-
             input_properties.append(
                 Property(
                     id=property_id,
@@ -906,45 +814,6 @@ async def model_engine(body: PlanTriggerRequest):
             # 2) Missing EPC
             # 3) Materially different information from landlord vs EPC
             # make the landlord remapping dictionary
-            addr = next((a for a in addresses if a.uprn == p.uprn), None)
-            if addr is None:
-                raise ValueError("Could not find address for property with UPRN: %s", p.uprn)
-
-            landlord_remapping = {
-                "total_floor_area": addr.landlord_total_floor_area_m2,  # 1m tolerance on floor area to perform remap
-                "property_type": addr.landlord_property_type,
-                "built_form": addr.landlord_built_form,
-
-                # Components
-                "walls_description": addr.landlord_wall_construction,
-                "roof_description": addr.landlord_roof_construction,
-                "floor_description": addr.landlord_floor_construction,
-                "windows_description": addr.landlord_windows_type,
-                "main_fuel": addr.landlord_fuel_type,
-                "mainheat_description": addr.landlord_heating_system,
-                "mainheatcont_description": addr.landlord_heating_controls,
-                "hotwater_description": addr.landlord_hot_water_system,
-
-                # Efficiency
-                "walls_energy_eff": addr.landlord_wall_efficiency,
-                "roof_energy_eff": addr.landlord_roof_efficiency,
-                "windows_energy_eff": addr.landlord_windows_efficiency,
-                "mainheat_energy_eff": addr.landlord_heating_efficiency,
-                "mainheatc_energy_eff": addr.landlord_heating_controls_efficiency,
-                "hot_water_energy_eff": addr.landlord_hot_water_efficiency,
-
-                "multi_glaze_proportion": addr.landlord_multi_glaze_proportion * 100,  # TODO: Fix this!
-                "construction_age_band": addr.landlord_construction_age_band,
-            }
-            # Find differences between EPC and landlord data
-            differences = {}
-            for k, v in landlord_remapping.items():
-                if k == "total_floor_area":
-                    if abs(p.epc_record.prepared_epc.get(k) - v) > 1:  # 1m tolerance
-                        differences[k] = v
-                else:
-                    if v != p.epc_record.get(k) and (not pd.isnull(v)) and (not pd.isnull(p.epc_record.get(k))):
-                        differences[k] = v
 
             needs_rebaselining = p.epc_is_expired | p.epc_is_estimated | (len(differences) > 0)
 
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index eb462850..1ed0fc41 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -303,6 +303,12 @@ class EPCRecord:
     glazed_type: Optional[str] = None
     multi_glaze_proportion: Optional[float] = None
 
+    # ------------------------------------------------------------------
+    # CLEANING FLAG
+    # ------------------------------------------------------------------
+    # Indicates whether the EPC record has been predicted (estimated). Defaults to False.
+    estimated: Optional[bool] = False
+
     # ------------------------------------------------------------------
     # MODEL FLAGS
     # ------------------------------------------------------------------
@@ -379,6 +385,63 @@ class EPCRecord:
 
         return
 
+    def _apply_averages_cleaning(self) -> None:
+        """
+        Fill missing room counts and floor height with medians from cleaning_data.
+
+        Comparable rows share the record's property type, then (when available)
+        its local authority and a floor area within +/-10%. Only null fields are
+        filled; a median that is itself null (no usable rows) leaves the field
+        untouched instead of raising on int(NaN).
+
+        :raises ValueError: if the prepared EPC or the cleaning data is missing
+        """
+        if self._prepared_epc is None:
+            raise ValueError("Prepared EPC missing")
+        if self.cleaning_data is None:
+            raise ValueError("Cleaning data required for averages cleaning")
+
+        # prepared-EPC key -> cleaning_data column
+        columns = {
+            "number-habitable-rooms": "number_habitable_rooms",
+            "number-heated-rooms": "number_heated_rooms",
+            "floor-height": "floor_height",
+        }
+        if not any(pd.isnull(self._prepared_epc.get(k)) for k in columns):
+            return
+
+        data: pd.DataFrame = self.cleaning_data
+        clean_with = data[data["property_type"] == self._prepared_epc["property-type"]]
+
+        # Narrow to the same local authority only if that leaves rows to use
+        in_authority = clean_with["local_authority"] == self._prepared_epc["local-authority"]
+        if in_authority.any():
+            clean_with = clean_with[in_authority]
+
+        # Prefer rows within +/-10% of the floor area, when known and matched
+        floor_area = self._prepared_epc.get("total-floor-area")
+        if pd.notnull(floor_area):
+            areas = clean_with["total_floor_area"].astype(float)
+            subset = clean_with[(areas <= floor_area * 1.1) & (areas >= floor_area * 0.9)]
+            if not subset.empty:
+                clean_with = subset
+
+        # Drop null medians first: int(round(NaN)) raises ValueError
+        medians = {k: clean_with[c].median() for k, c in columns.items()}
+        medians = {k: float(m) for k, m in medians.items() if pd.notnull(m)}
+        for key in ("number-habitable-rooms", "number-heated-rooms"):
+            if key in medians:
+                medians[key] = int(round(medians[key]))
+
+        # heated rooms should never exceed habitable
+        habitable = medians.get("number-habitable-rooms")
+        if habitable is not None and medians.get("number-heated-rooms", 0) > habitable:
+            medians["number-heated-rooms"] = habitable
+
+        for key, value in medians.items():
+            if pd.isnull(self._prepared_epc.get(key)):
+                self._prepared_epc[key] = value
+
     def _apply_cleaning_rules(self) -> None:
         """
         Apply simple field-level cleaning rules defined in CLEANING_RULES.