Fixed bug in EPC record cleaning

2026-07-27 23:35:01 +00:00 · 2026-03-18 18:16:57 +00:00 · 2026-03-18 18:16:57 +00:00 · f45260706e
commit f45260706e
parent 5e8847d028
8 changed files with 163 additions and 110 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -6,6 +6,7 @@
      <sourceFolder url="file://$MODULE_DIR$/model_data" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
+      <excludeFolder url="file://$MODULE_DIR$/infrastructure/terraform/.terraform" />
    </content>
    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
--- a/asset_list/app.py
+++ b/asset_list/app.py
@ -73,25 +73,24 @@ def app():
    Property UPRN
    """

-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed"
-    # data_filename = "For Modelling - Final - reviewed.xlsx"
-    data_filename = "Missed Properties - with address.xlsx"
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL"
+    data_filename = "Domna System Review - Livewest.xlsx"
    sheet_name = "Sheet1"
    postcode_column = "Postcode"
-    address1_column = "address1"
-    address1_method = None
-    fulladdress_column = "address1"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    fulladdress_column = "Address"
    address_cols_to_concat = []
    missing_postcodes_method = None
    landlord_year_built = None
-    landlord_os_uprn = "UPRN"
-    landlord_property_type = "Type"
-    landlord_built_form = None
+    landlord_os_uprn = "gov UPRN"
+    landlord_property_type = "AssetType"
+    landlord_built_form = "AssetType"
    landlord_wall_construction = None
    landlord_roof_construction = None
    landlord_heating_system = None
    landlord_existing_pv = None
-    landlord_property_id = "Reference"
+    landlord_property_id = "landlord_uprn"
    landlord_sap = None
    outcomes_filename = None
    outcomes_sheetname = None
--- a/asset_list/utils.py
+++ b/asset_list/utils.py
@ -173,6 +173,7 @@ def get_data(
    errors = []
    no_epc = []
    for _, home in tqdm(df.iterrows(), total=len(df)):
+        
        try:

            # If we have a block of flats, we cannot retrieve this data
--- a/backend/app/db/functions/address_functions.py
+++ b/backend/app/db/functions/address_functions.py
@ -20,7 +20,7 @@ def _get_associated_records(results, uprn, uprn_key="UPRN"):
    return matched_record


-def get_associated_uprns(postcode_search: PostcodeSearch, uprn: str | int):
+def get_associated_uprns(postcode_search: Optional[PostcodeSearch], uprn: str | int):
    """
    Given a postcode and UPRN, for a remote assessment, fetch all associated UPRNs, based
    on parent UPRN. This will be properties in the same building
--- a/backend/app/db/models/portfolio.py
+++ b/backend/app/db/models/portfolio.py
@ -147,6 +147,10 @@ class PropertyModel(Base):
    is_sap_points_adjusted_for_installed_measures = Column(Boolean, default=False)
    original_sap_points = Column(Float)

+    # New for re-scoring - we will need to delete some of the redundant fields but there is a ticket for this
+    lodged_sap_points = Column(Float)
+    lodged_epc_rating = Column(Enum(Epc))
+

 class FeatureRating(enum.Enum):
    VERY_GOOD = 5
@ -253,6 +257,12 @@ class PropertyDetailsEpcModel(Base):
    installed_measures_heat_demand_adjustment = Column(Float)
    is_epc_adjusted_for_installed_measures = Column(Boolean, default=False)

+    # New columns - we'll need to delete some of the redundant fields, associated to "already installed" but
+    # we have a ticket for this piece of work
+    lodged_co2_emissions = Column(Float)
+    lodged_heat_demand = Column(Float)
+    has_been_remodelled = Column(Boolean, default=False)
+

 class PropertyDetailsSpatial(Base):
    __tablename__ = "property_details_spatial"
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@ -837,41 +837,41 @@ async def model_engine(body: PlanTriggerRequest):
            extract_uprn=True
        )

-        for idx, rebaselined_prediction in rebaselining_response["retrofit-sap-baseline-predictions"].iterrows():
-            property_instance = next(p for p in input_properties if p.uprn == int(rebaselined_prediction["uprn"]))
-            new_rating = rebaselined_prediction["predictions"]
-            new_epc_rating = sap_to_epc(new_rating)
-            # Insert
+        # TODO: TEMP: Compare values
+        compare_scores = []
+        for x in rebaselining_scoring_data["uprn"].unique():
+            record = [p for p in input_properties if p.uprn == x][0].epc_record
+            original_sap = record.current_energy_efficiency
+            new_sap = rebaselining_response["retrofit-sap-baseline-predictions"][
+                rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == x
+                ]["predictions"].values[0]
+            lodgement_date = record.lodgement_date
+            compare_scores.append({
+                "uprn": x,
+                "original_sap": original_sap,
+                "new_sap": new_sap,
+                "lodgement_date": lodgement_date
+            })
+        compare_scores = pd.DataFrame(compare_scores)

-            # property_instance.data["current-energy-efficiency"] = sap_to_epc(new_rating)
+        for uprn in rebaselining_scoring_data["uprn"].unique():
+            # Get the predictions
+            sap_prediction = rebaselining_response["retrofit-sap-baseline-predictions"][
+                rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == uprn
+                ]["predictions"].values[0]

-            addr = [a for a in addresses if a.uprn == property_instance.uprn][0]
-            landlord_remapping = {
-                "total-floor-area": addr.landlord_total_floor_area_m2,  # 1m tolerance on floor area to perform remap
-                "property-type": addr.landlord_property_type,
-                "built-form": addr.landlord_built_form,
-                # Components
-                "walls-description": addr.landlord_wall_construction,
-                "roof-description": addr.landlord_roof_construction,
-                "floor-description": addr.landlord_floor_construction,
-                "windows-description": addr.landlord_windows_type,
-                "main-fuel": addr.landlord_fuel_type,
-                "mainheatcont-description": addr.landlord_heating_controls,
-                "hotwater-description": addr.landlord_hot_water_system,
-                # Efficiency
-                "walls-energy-eff": addr.landlord_wall_efficiency,
-                "roof-energy-eff": addr.landlord_roof_efficiency,
-                "windows-energy-eff": addr.landlord_windows_efficiency,
-                "mainheat-energy-eff": addr.landlord_heating_efficiency,
-                "mainheatc-energy-eff": addr.landlord_heating_controls_efficiency,
-                "hot-water-energy-eff": addr.landlord_hot_water_efficiency,
-                "multi-glaze-proportion": addr.landlord_multi_glaze_proportion * 100,  # TODO: Fix this!
-                "construction-age-band": addr.landlord_construction_age_band,
-            }
+            carbon_prediction = 1337
+            heat_demand_prediction = 1337

-        # Insert the re-baselined scores into the property data
-        for p in input_properties:
-            property_rebaselined_sap = rebaselining_response["retrofit-sap-baseline-predictions"]
+            epc_prediction = sap_to_epc(sap_prediction)
+            # We now need to insert the new values into the epc_record
+            property_instance = next(p for p in input_properties if p.uprn == int(uprn))
+            property_instance.epc_record.insert_new_performance_values(
+                new_sap=sap_prediction,
+                new_epc=epc_prediction,
+                new_carbon=carbon_prediction,
+                new_heat_demand=heat_demand_prediction,
+            )

        kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True)

@ -924,26 +924,6 @@ async def model_engine(body: PlanTriggerRequest):
        # We also make a tweak - if the property has been flagged for solar but doesn't contain
        # any panel performance, we ensure that we have a 3kWp and 4kWp option for the property

-        # TODO: Temp - test re-baselining
-        p = input_properties[0]
-        p.create_base_difference_epc_record(cleaned_lookup=cleaned)
-        scoring_data = p.base_difference_record.df
-        # We just need a recent date to trigger the right models,
-        # as we are only interested in the deltas
-        scoring_data["is_post_sap10_starting"] = True
-        # Score model - SAP re-baselining model
-        model_api.MODEL_URLS["retrofit-sap-baseline-predictions"] = "sapbaselinemodel"
-        model_api.prediction_buckets["retrofit-sap-baseline-predictions"] = "retrofit-sap-baseline-predictions-dev"
-        example_response = model_api.predict_all(
-            df=scoring_data,
-            bucket=get_settings().DATA_BUCKET,
-            model_prefixes=["retrofit-sap-baseline-predictions"],
-            extract_ids=False
-        )
-
-        input_properties[0].data["current-energy-efficiency"] = 58.8
-        input_properties[0].data["current-energy-rating"] = "D"
-
        logger.info("Identifying property recommendations")
        recommendations, recommendations_scoring_data, representative_recommendations = {}, [], {}
        for p in tqdm(input_properties):
--- a/etl/bill_savings/KwhData.py
+++ b/etl/bill_savings/KwhData.py
@ -1,4 +1,5 @@
 import re
+from dataclasses import fields
 import pandas as pd
 import numpy as np
 from datetime import datetime
@ -14,24 +15,24 @@ logger = setup_logger()


 class KwhData:
-    COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"]
+    COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"]

    CATEGORICAL_COLUMNS = [
-        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
-        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type",
-        "built-form",
-        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
-        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
+        "lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms",
+        "number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type",
+        "built_form",
+        "construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff",
+        "walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description",
        "county",
-        "windows-description", "windows-energy-eff", "flat-top-storey",
-        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
-        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating",
-        "floor-level"
+        "windows_description", "windows_energy_eff", "flat_top_storey",
+        "flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation",
+        "low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating",
+        "floor_level"
    ]

    NUMERICAL_COLUMNS = [
-        'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current',
-        'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency'
+        'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current',
+        'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency'
    ]

    def __init__(self, bucket=None, read_consumption_data=False):
@ -106,6 +107,16 @@ class KwhData:
            # If no match is found, return None or raise an exception
            return None

+    @staticmethod
+    def _normalise_epc_keys(data):
+        """
+        Return a copy of ``data`` in which every "-" in a key / column label is replaced by "_".
+
+        :param data: either a dict (its keys are rewritten) or a pandas DataFrame (its column labels
+            are rewritten); the input object itself is not modified
+        :return: a new dict, or the DataFrame produced by ``DataFrame.rename``
+        :raises TypeError: if ``data`` is neither a dict nor a DataFrame
+        """
+
+        def to_underscores(label):
+            # Single place that defines the hyphen -> underscore mapping for both input kinds
+            return label.replace("-", "_")
+
+        if isinstance(data, pd.DataFrame):
+            return data.rename(columns=to_underscores)
+
+        if not isinstance(data, dict):
+            raise TypeError("Expected dict or DataFrame")
+
+        return {to_underscores(label): item for label, item in data.items()}
+
    def combine(self):
        """
        Given the data that is collected containing the kwh values for heating and hot water, this method will combine
@ -128,9 +139,9 @@ class KwhData:
            # We check that the retrieved energy consumption sufficiently matches the EPC data
            internal_dataset = []
            for x in data:
-                epc_data = x["epc"]
-                epc_sap = epc_data["current-energy-efficiency"]
-                epc_potential_sap = epc_data["potential-energy-efficiency"]
+                epc_data = self._normalise_epc_keys(x["epc"])
+                epc_sap = epc_data["current_energy_efficiency"]
+                epc_potential_sap = epc_data["potential_energy_efficiency"]
                # Make sure this matches the extracted sap
                if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int(
                    x["potential_epc_efficiency"]
@ -171,7 +182,7 @@ class KwhData:

        # We also estimate the energy consumption reduction from this data, by band
        df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"]
-        consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index()
+        consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index()
        df = df.drop(columns=["total_consumption"])

        self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet"
@ -203,9 +214,11 @@ class KwhData:
        # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features
        #       in anticipation of the new model

-        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"])
-        data["lodgement-year"] = data["lodgement-date"].dt.year
-        data["lodgement-month"] = data["lodgement-date"].dt.month
+        data = self._normalise_epc_keys(data.copy())
+
+        data["lodgement_date"] = pd.to_datetime(data["lodgement_date"])
+        data["lodgement_year"] = data["lodgement_date"].dt.year
+        data["lodgement_month"] = data["lodgement_date"].dt.month

        # For walls, roof, floor description where we have average thermal transmittance, to avoid too many
        # categories
@ -231,8 +244,10 @@ class KwhData:
        thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)

        # Apply the lookup table to the data
-        for feature in ["walls-description", "roof-description", "floor-description"]:
-            cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
+        for feature in ["walls_description", "roof_description", "floor_description"]:
+            cleaned_df = pd.DataFrame(
+                cleaned[feature.replace("_", "-")]
+            )[["original_description", "thermal_transmittance"]]
            # Round to 2 decimal places and convert to string
            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)

@ -261,10 +276,10 @@ class KwhData:
        data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str)

        # Create new features:
-        data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area']
+        data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area']

        # Ensure this is string, because we could have mixed types
-        data["lodgement-datetime"] = data["lodgement-datetime"].astype(str)
+        data["lodgement_datetime"] = data["lodgement_datetime"].astype(str)

        if save:
            self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet"
@ -286,29 +301,39 @@ class KwhData:
        data is in the format required by the model
        :return:
        """
-
-        epc = p.data.copy()
        numeric_cols = [
-            'current-energy-efficiency',
-            'potential-energy-efficiency', 'environment-impact-current',
-            'environment-impact-potential', 'energy-consumption-current',
-            'energy-consumption-potential', 'co2-emissions-current',
-            'co2-emiss-curr-per-floor-area', 'co2-emissions-potential',
-            'lighting-cost-current', 'lighting-cost-potential',
-            'heating-cost-current', 'heating-cost-potential',
-            'hot-water-cost-current', 'hot-water-cost-potential',
-            'total-floor-area', 'multi-glaze-proportion',
-            'extension-count', 'number-habitable-rooms', 'number-heated-rooms',
-            'low-energy-lighting', 'number-open-fireplaces',
-            'wind-turbine-count', 'unheated-corridor-length',
-            'floor-height', 'photo-supply', 'fixed-lighting-outlets-count',
-            'low-energy-fixed-light-count',
+            'current_energy_efficiency',
+            'potential_energy_efficiency', 'environment_impact_current',
+            'environment_impact_potential', 'energy_consumption_current',
+            'energy_consumption_potential', 'co2_emissions_current',
+            'co2_emiss_curr_per_floor_area', 'co2_emissions_potential',
+            'lighting_cost_current', 'lighting_cost_potential',
+            'heating_cost_current', 'heating_cost_potential',
+            'hot_water_cost_current', 'hot_water_cost_potential',
+            'total_floor_area', 'multi_glaze_proportion',
+            'extension_count', 'number_habitable_rooms', 'number_heated_rooms',
+            'low_energy_lighting', 'number_open_fireplaces',
+            'wind_turbine_count', 'unheated_corridor_length',
+            'floor_height', 'photo_supply', 'fixed_lighting_outlets_count',
+            'low_energy_fixed_light_count',
        ]
+        required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [
+            "uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff"
+        ])
+
+        epc_record = p.epc_record
+        available_fields = {field.name for field in fields(epc_record)}
+        missing_fields = required_cols - available_fields
+        if missing_fields:
+            raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}")
+
+        epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols}
+
        for v in numeric_cols:
            if epc[v] is not None:
                epc[v] = float(epc[v])

-        bools_to_remap = ['mains-gas-flag', 'flat-top-storey']
+        bools_to_remap = ['mains_gas_flag', 'flat_top_storey']
        bool_map = {
            True: "Y",
            False: "N",
@ -320,8 +345,8 @@ class KwhData:
            epc[v] = bool_map[epc[v]]

        no_data = {
-            "floor-level": "NODATA!",
-            "floor-energy-eff": "NO DATA!"
+            "floor_level": "NODATA!",
+            "floor_energy_eff": "NO DATA!"
        }
        for v, fill_val in no_data.items():
            if pd.isnull(epc[v]):
@ -331,8 +356,8 @@ class KwhData:

    def prepare_epc(self, input_properties: list[Property]):
        scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties])
-        scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year
-        scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month
+        scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year
+        scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month

        scoring_data["id"] = scoring_data["uprn"].copy()

--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@ -309,6 +309,7 @@ class EPCRecord:
    # Indicates if the EPC record has been predicted. By default, false
    estimated: Optional[bool] = False
    sap_05_overwritten: Optional[bool] = False
+    has_been_remodelled: Optional[bool] = False

    # ------------------------------------------------------------------
    # MODEL FLAGS
@ -386,6 +387,35 @@ class EPCRecord:

        return

+    def insert_new_performance_values(
+        self, new_sap: float, new_epc: float, new_carbon: float, new_heat_demand: float,
+    ):
+        """
+        Overwrite this record's headline performance values with re-modelled ones and flag the record
+        as re-modelled.
+
+        The values are written into ``self._prepared_epc`` and then pushed onto the dataclass attributes
+        via ``self._expand_prepared_epc_to_attributes()``. ``has_been_remodelled`` is only set once both
+        steps have completed, so a rejected update never leaves the record flagged as re-modelled.
+
+        :param new_sap: stored under ``current_energy_efficiency``
+        :param new_epc: stored under ``current_energy_rating``
+        :param new_carbon: stored under ``co2_emissions_current``
+        :param new_heat_demand: stored under ``energy_consumption_current``
+        :raises ValueError: if any target key is absent from ``self._prepared_epc``; in that case neither
+            the prepared EPC nor the flag is modified
+        :return: None
+        """
+
+        # NOTE(review): new_epc is annotated as float but lands in current_energy_rating - confirm
+        # whether callers pass an EPC band rather than a number and adjust the annotation if so.
+        update_data = {
+            "current_energy_efficiency": new_sap,
+            "current_energy_rating": new_epc,
+            "co2_emissions_current": new_carbon,
+            "energy_consumption_current": new_heat_demand,
+        }
+
+        # Validate every key before touching any state, so a failure is side-effect free.
+        # NOTE(review): other methods of this class read hyphenated keys from _prepared_epc
+        # (e.g. "built-form") - confirm that these underscore keys are actually present there.
+        for k in update_data:
+            if k not in self._prepared_epc:
+                raise ValueError(f"Attempting to update unknown field '{k}' in prepared EPC")
+
+        self._prepared_epc.update(update_data)
+        # Propagate the new prepared values onto the dataclass attributes
+        self._expand_prepared_epc_to_attributes()
+
+        # Only now is it true that the record carries re-modelled values
+        self.has_been_remodelled = True
+
    def _apply_averages_cleaning(self) -> None:
        """
        Fills missing property dimension values using medians from cleaning_data.
@ -626,6 +656,10 @@ class EPCRecord:
                # Ignore keys that are not part of the dataclass schema
                continue

+            if value is None:
+                setattr(self, key, None)
+                continue
+
            try:
                cast_value = self._cast_value(value, field_map[key].type)
                setattr(self, key, cast_value)
@ -812,14 +846,17 @@ class EPCRecord:
            (property_dimensions["PROPERTY_TYPE"] == self._prepared_epc["property-type"])
        ]

-        if self.construction_age_band not in DATA_ANOMALY_MATCHES:
+        if (
+            (self.construction_age_band not in DATA_ANOMALY_MATCHES) and
+            (self.construction_age_band in result["CONSTRUCTION_AGE_BAND"].values)
+        ):
            result = result[
                (result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)
            ]

        if (
            self._prepared_epc["built-form"] not in DATA_ANOMALY_MATCHES
-            and self._prepared_epc["built-form"] in result["BUILT_FORM"]
+            and self._prepared_epc["built-form"] in result["BUILT_FORM"].values
        ):
            result = result[(result["BUILT_FORM"] == self._prepared_epc["built-form"])]

@ -935,7 +972,7 @@ class EPCRecord:

        self._prepared_epc["unheated-corridor-length"] = (
            float(self._prepared_epc["unheated-corridor-length"])
-            if self._prepared_epc["unheated-corridor-length"] not in ["", None]
+            if self._prepared_epc["unheated-corridor-length"] not in DATA_ANOMALY_MATCHES
            else None
        )