diff --git a/.idea/Model.iml b/.idea/Model.iml index 1e51ede4..4d94187d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -6,6 +6,7 @@ + diff --git a/asset_list/app.py b/asset_list/app.py index a97bb8e0..8becbd3e 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -73,25 +73,24 @@ def app(): Property UPRN """ - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lifespace Rentals/Missed" - # data_filename = "For Modelling - Final - reviewed.xlsx" - data_filename = "Missed Properties - with address.xlsx" + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/March 2026 SAL" + data_filename = "Domna System Review - Livewest.xlsx" sheet_name = "Sheet1" postcode_column = "Postcode" - address1_column = "address1" - address1_method = None - fulladdress_column = "address1" + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Address" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "UPRN" - landlord_property_type = "Type" - landlord_built_form = None + landlord_os_uprn = "gov UPRN" + landlord_property_type = "AssetType" + landlord_built_form = "AssetType" landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Reference" + landlord_property_id = "landlord_uprn" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/asset_list/utils.py b/asset_list/utils.py index d83a35f2..9d3ae1b6 100644 --- a/asset_list/utils.py +++ b/asset_list/utils.py @@ -173,6 +173,7 @@ def get_data( errors = [] no_epc = [] for _, home in tqdm(df.iterrows(), total=len(df)): + try: # If we have a block of flats, we cannot retrieve this data diff --git a/backend/app/db/functions/address_functions.py b/backend/app/db/functions/address_functions.py index 4b8ad5f2..42fcdcfa 100644 --- a/backend/app/db/functions/address_functions.py +++ b/backend/app/db/functions/address_functions.py @@ -20,7 +20,7 @@ def _get_associated_records(results, uprn, uprn_key="UPRN"): return matched_record -def get_associated_uprns(postcode_search: PostcodeSearch, uprn: str | int): +def get_associated_uprns(postcode_search: Optional[PostcodeSearch], uprn: str | int): """ Given a postcode and UPRN, for a remote assessment, fetch all associated UPRNs, based on parent UPRN. This will be properties in the same building diff --git a/backend/app/db/models/portfolio.py b/backend/app/db/models/portfolio.py index 9eb26597..c511b6c9 100644 --- a/backend/app/db/models/portfolio.py +++ b/backend/app/db/models/portfolio.py @@ -147,6 +147,10 @@ class PropertyModel(Base): is_sap_points_adjusted_for_installed_measures = Column(Boolean, default=False) original_sap_points = Column(Float) + # New for re-scoring - we will need to delete some of the redundant fields but there is a ticket for this + lodged_sap_points = Column(Float) + lodged_epc_rating = Column(Enum(Epc)) + class FeatureRating(enum.Enum): VERY_GOOD = 5 @@ -253,6 +257,12 @@ class PropertyDetailsEpcModel(Base): installed_measures_heat_demand_adjustment = Column(Float) is_epc_adjusted_for_installed_measures = Column(Boolean, default=False) + # New columns - we'll need to delete some of the redundant fields, associated to "already installed" but + # we have a ticket for this piece of work + lodged_co2_emissions = Column(Float) + lodged_heat_demand = Column(Float) + has_been_remodelled = Column(Boolean, default=False) + class PropertyDetailsSpatial(Base): __tablename__ = "property_details_spatial" diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 4f698e18..e1e45b47 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -837,41 +837,41 @@ async def model_engine(body: PlanTriggerRequest): extract_uprn=True ) - for idx, rebaselined_prediction in rebaselining_response["retrofit-sap-baseline-predictions"].iterrows(): - property_instance = next(p for p in input_properties if p.uprn == int(rebaselined_prediction["uprn"])) - new_rating = rebaselined_prediction["predictions"] - new_epc_rating = sap_to_epc(new_rating) - # Insert + # TODO: TEMP: Compare values + compare_scores = [] + for x in rebaselining_scoring_data["uprn"].unique(): + record = [p for p in input_properties if p.uprn == x][0].epc_record + original_sap = record.current_energy_efficiency + new_sap = rebaselining_response["retrofit-sap-baseline-predictions"][ + rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == x + ]["predictions"].values[0] + lodgement_date = record.lodgement_date + compare_scores.append({ + "uprn": x, + "original_sap": original_sap, + "new_sap": new_sap, + "lodgement_date": lodgement_date + }) + compare_scores = pd.DataFrame(compare_scores) - # property_instance.data["current-energy-efficiency"] = sap_to_epc(new_rating) + for uprn in rebaselining_scoring_data["uprn"].unique(): + # Get the predictions + sap_prediction = rebaselining_response["retrofit-sap-baseline-predictions"][ + rebaselining_response["retrofit-sap-baseline-predictions"]["uprn"] == uprn + ]["predictions"].values[0] - addr = [a for a in addresses if a.uprn == property_instance.uprn][0] - landlord_remapping = { - "total-floor-area": addr.landlord_total_floor_area_m2, # 1m tolerance on floor area to perform remap - "property-type": addr.landlord_property_type, - "built-form": addr.landlord_built_form, - # Components - "walls-description": addr.landlord_wall_construction, - "roof-description": addr.landlord_roof_construction, - "floor-description": addr.landlord_floor_construction, - "windows-description": addr.landlord_windows_type, - "main-fuel": addr.landlord_fuel_type, - "mainheatcont-description": addr.landlord_heating_controls, - "hotwater-description": addr.landlord_hot_water_system, - # Efficiency - "walls-energy-eff": addr.landlord_wall_efficiency, - "roof-energy-eff": addr.landlord_roof_efficiency, - "windows-energy-eff": addr.landlord_windows_efficiency, - "mainheat-energy-eff": addr.landlord_heating_efficiency, - "mainheatc-energy-eff": addr.landlord_heating_controls_efficiency, - "hot-water-energy-eff": addr.landlord_hot_water_efficiency, - "multi-glaze-proportion": addr.landlord_multi_glaze_proportion * 100, # TODO: Fix this! - "construction-age-band": addr.landlord_construction_age_band, - } + carbon_prediction = 1337 + heat_demand_prediction = 1337 - # Insert the re-baselined scores into the property data - for p in input_properties: - property_rebaselined_sap = rebaselining_response["retrofit-sap-baseline-predictions"] + epc_prediction = sap_to_epc(sap_prediction) + # We now need to insert the new values into the epc_record + property_instance = next(p for p in input_properties if p.uprn == int(uprn)) + property_instance.epc_record.insert_new_performance_values( + new_sap=sap_prediction, + new_epc=epc_prediction, + new_carbon=carbon_prediction, + new_heat_demand=heat_demand_prediction, + ) kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True) @@ -924,26 +924,6 @@ async def model_engine(body: PlanTriggerRequest): # We also make a tweak - if the property has been flagged for solar but doesn't contain # any panel performance, we ensure that we have a 3kWp and 4kWp option for the property - # TODO: Temp - test re-baselining - p = input_properties[0] - p.create_base_difference_epc_record(cleaned_lookup=cleaned) - scoring_data = p.base_difference_record.df - # We just need a recent date to trigger the right models, - # as we are only interested in the deltas - scoring_data["is_post_sap10_starting"] = True - # Score model - SAP re-baselining model - model_api.MODEL_URLS["retrofit-sap-baseline-predictions"] = "sapbaselinemodel" - model_api.prediction_buckets["retrofit-sap-baseline-predictions"] = "retrofit-sap-baseline-predictions-dev" - example_response = model_api.predict_all( - df=scoring_data, - bucket=get_settings().DATA_BUCKET, - model_prefixes=["retrofit-sap-baseline-predictions"], - extract_ids=False - ) - - input_properties[0].data["current-energy-efficiency"] = 58.8 - input_properties[0].data["current-energy-rating"] = "D" - logger.info("Identifying property recommendations") recommendations, recommendations_scoring_data, representative_recommendations = {}, [], {} for p in tqdm(input_properties): diff --git a/etl/bill_savings/KwhData.py b/etl/bill_savings/KwhData.py index b4bb979d..dfb0be85 100644 --- a/etl/bill_savings/KwhData.py +++ b/etl/bill_savings/KwhData.py @@ -1,4 +1,5 @@ import re +from dataclasses import fields import pandas as pd import numpy as np from datetime import datetime @@ -14,24 +15,24 @@ logger = setup_logger() class KwhData: - COLS_TO_STRINGIFY = ["main-heating-controls", "floor-level"] + COLS_TO_STRINGIFY = ["main_heating_controls", "floor_level"] CATEGORICAL_COLUMNS = [ - "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", - "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", - "built-form", - "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff", - "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", + "lodgement_year", "lodgement_month", "main_fuel", "mainheat_description", "number_heated_rooms", + "number_habitable_rooms", "mainheat_energy_eff", "mainheatcont_description", "property_type", + "built_form", + "construction_age_band", "secondheat_description", "hotwater_description", "hot_water_energy_eff", + "walls_description", "walls_energy_eff", "roof_description", "roof_energy_eff", "floor_description", "county", - "windows-description", "windows-energy-eff", "flat-top-storey", - "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", - "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating", - "floor-level" + "windows_description", "windows_energy_eff", "flat_top_storey", + "flat_storey_count", "unheated_corridor_length", "solar_water_heating_flag", "mechanical_ventilation", + "low_energy_lighting", "environment_impact_current", "energy_tariff", "current_energy_rating", + "floor_level" ] NUMERICAL_COLUMNS = [ - 'heating-cost-current', 'total-floor-area', 'co2-emissions-current', 'energy-consumption-current', - 'heating-cost-potential', 'hot-water-cost-current', 'current-energy-efficiency' + 'heating_cost_current', 'total_floor_area', 'co2_emissions_current', 'energy_consumption_current', + 'heating_cost_potential', 'hot_water_cost_current', 'current_energy_efficiency' ] def __init__(self, bucket=None, read_consumption_data=False): @@ -106,6 +107,16 @@ class KwhData: # If no match is found, return None or raise an exception return None + @staticmethod + def _normalise_epc_keys(data): + if isinstance(data, dict): + return {key.replace("-", "_"): value for key, value in data.items()} + + if isinstance(data, pd.DataFrame): + return data.rename(columns=lambda column: column.replace("-", "_")) + + raise TypeError("Expected dict or DataFrame") + def combine(self): """ Given the data that is collected containing the kwh values for heating and hot water, this method will combine @@ -128,9 +139,9 @@ class KwhData: # We check that the retrieved energy consumption sufficiently matches the EPC data internal_dataset = [] for x in data: - epc_data = x["epc"] - epc_sap = epc_data["current-energy-efficiency"] - epc_potential_sap = epc_data["potential-energy-efficiency"] + epc_data = self._normalise_epc_keys(x["epc"]) + epc_sap = epc_data["current_energy_efficiency"] + epc_potential_sap = epc_data["potential_energy_efficiency"] # Make sure this matches the extracted sap if int(epc_sap) != int(x["current_epc_efficiency"]) or int(epc_potential_sap) != int( x["potential_epc_efficiency"] @@ -171,7 +182,7 @@ class KwhData: # We also estimate the energy consumption reduction from this data, by band df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"] - consumption_averages = df.groupby("current-energy-efficiency")["total_consumption"].mean().reset_index() + consumption_averages = df.groupby("current_energy_efficiency")["total_consumption"].mean().reset_index() df = df.drop(columns=["total_consumption"]) self.consumption_averages_filepath = f"energy_consumption/{self.run_date}/consumption_averages.parquet" @@ -203,9 +214,11 @@ class KwhData: # TODO: New is a temporary parameter, which will transform the epc descriptions to their transformed features # in anticipation of the new model - data["lodgement-date"] = pd.to_datetime(data["lodgement-date"]) - data["lodgement-year"] = data["lodgement-date"].dt.year - data["lodgement-month"] = data["lodgement-date"].dt.month + data = self._normalise_epc_keys(data.copy()) + + data["lodgement_date"] = pd.to_datetime(data["lodgement_date"]) + data["lodgement_year"] = data["lodgement_date"].dt.year + data["lodgement_month"] = data["lodgement_date"].dt.month # For walls, roof, floor description where we have average thermal transmittance, to avoid too many # categories @@ -231,8 +244,10 @@ class KwhData: thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) # Apply the lookup table to the data - for feature in ["walls-description", "roof-description", "floor-description"]: - cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]] + for feature in ["walls_description", "roof_description", "floor_description"]: + cleaned_df = pd.DataFrame( + cleaned[feature.replace("_", "-")] + )[["original_description", "thermal_transmittance"]] # Round to 2 decimal places and convert to string cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) @@ -261,10 +276,10 @@ class KwhData: data[self.CATEGORICAL_COLUMNS] = data[self.CATEGORICAL_COLUMNS].astype(str) # Create new features: - data['estimate_annual_kwh'] = data['energy-consumption-current'] * data['total-floor-area'] + data['estimate_annual_kwh'] = data['energy_consumption_current'] * data['total_floor_area'] # Ensure this is string, because we could have mixed types - data["lodgement-datetime"] = data["lodgement-datetime"].astype(str) + data["lodgement_datetime"] = data["lodgement_datetime"].astype(str) if save: self.model_training_data_filepath = f"energy_consumption/{self.run_date}/training_data.parquet" @@ -286,29 +301,39 @@ class KwhData: data is in the format required by the model :return: """ - - epc = p.data.copy() numeric_cols = [ - 'current-energy-efficiency', - 'potential-energy-efficiency', 'environment-impact-current', - 'environment-impact-potential', 'energy-consumption-current', - 'energy-consumption-potential', 'co2-emissions-current', - 'co2-emiss-curr-per-floor-area', 'co2-emissions-potential', - 'lighting-cost-current', 'lighting-cost-potential', - 'heating-cost-current', 'heating-cost-potential', - 'hot-water-cost-current', 'hot-water-cost-potential', - 'total-floor-area', 'multi-glaze-proportion', - 'extension-count', 'number-habitable-rooms', 'number-heated-rooms', - 'low-energy-lighting', 'number-open-fireplaces', - 'wind-turbine-count', 'unheated-corridor-length', - 'floor-height', 'photo-supply', 'fixed-lighting-outlets-count', - 'low-energy-fixed-light-count', + 'current_energy_efficiency', + 'potential_energy_efficiency', 'environment_impact_current', + 'environment_impact_potential', 'energy_consumption_current', + 'energy_consumption_potential', 'co2_emissions_current', + 'co2_emiss_curr_per_floor_area', 'co2_emissions_potential', + 'lighting_cost_current', 'lighting_cost_potential', + 'heating_cost_current', 'heating_cost_potential', + 'hot_water_cost_current', 'hot_water_cost_potential', + 'total_floor_area', 'multi_glaze_proportion', + 'extension_count', 'number_habitable_rooms', 'number_heated_rooms', + 'low_energy_lighting', 'number_open_fireplaces', + 'wind_turbine_count', 'unheated_corridor_length', + 'floor_height', 'photo_supply', 'fixed_lighting_outlets_count', + 'low_energy_fixed_light_count', ] + required_cols = set(numeric_cols + KwhData.CATEGORICAL_COLUMNS + [ + "uprn", "lodgement_date", "lodgement_datetime", "floor_energy_eff" + ]) + + epc_record = p.epc_record + available_fields = {field.name for field in fields(epc_record)} + missing_fields = required_cols - available_fields + if missing_fields: + raise ValueError(f"Missing EPCRecord fields required by KwhData: {sorted(missing_fields)}") + + epc = {field_name: getattr(epc_record, field_name) for field_name in required_cols} + for v in numeric_cols: if epc[v] is not None: epc[v] = float(epc[v]) - bools_to_remap = ['mains-gas-flag', 'flat-top-storey'] + bools_to_remap = ['mains_gas_flag', 'flat_top_storey'] bool_map = { True: "Y", False: "N", @@ -320,8 +345,8 @@ class KwhData: epc[v] = bool_map[epc[v]] no_data = { - "floor-level": "NODATA!", - "floor-energy-eff": "NO DATA!" + "floor_level": "NODATA!", + "floor_energy_eff": "NO DATA!" } for v, fill_val in no_data.items(): if pd.isnull(epc[v]): @@ -331,8 +356,8 @@ class KwhData: def prepare_epc(self, input_properties: list[Property]): scoring_data = pd.DataFrame([self._prepare_epc(p) for p in input_properties]) - scoring_data["lodgement-year"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.year - scoring_data["lodgement-month"] = pd.to_datetime(scoring_data["lodgement-date"]).dt.month + scoring_data["lodgement_year"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.year + scoring_data["lodgement_month"] = pd.to_datetime(scoring_data["lodgement_date"]).dt.month scoring_data["id"] = scoring_data["uprn"].copy() diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 84d4d19a..0428542c 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -309,6 +309,7 @@ class EPCRecord: # Indicates if the EPC record has been predicted. By default, false estimated: Optional[bool] = False sap_05_overwritten: Optional[bool] = False + has_been_remodelled: Optional[bool] = False # ------------------------------------------------------------------ # MODEL FLAGS @@ -386,6 +387,35 @@ class EPCRecord: return + def insert_new_performance_values( + self, new_sap: float, new_epc: float, new_carbon: float, new_heat_demand: float, + ): + """ + Given re-modelling for this property, is used to insert the new values and also keep a record of the + fact that re-modelling has taken place + :param new_sap: + :param new_epc: + :param new_carbon: + :param new_heat_demand: + :return: + """ + + self.has_been_remodelled = True + # Update prepared epc + update_data = { + "current_energy_efficiency": new_sap, + "current_energy_rating": new_epc, + "co2_emissions_current": new_carbon, + "energy_consumption_current": new_heat_demand, + } + # Validate we're updating correct fields + for k in update_data: + if k not in self._prepared_epc: + raise ValueError(f"Attempting to update unknown field '{k}' in prepared EPC") + self._prepared_epc.update(update_data) + # Update dataclass attributes + self._expand_prepared_epc_to_attributes() + def _apply_averages_cleaning(self) -> None: """ Fills missing property dimension values using medians from cleaning_data. @@ -626,6 +656,10 @@ class EPCRecord: # Ignore keys that are not part of the dataclass schema continue + if value is None: + setattr(self, key, None) + continue + try: cast_value = self._cast_value(value, field_map[key].type) setattr(self, key, cast_value) @@ -812,14 +846,17 @@ class EPCRecord: (property_dimensions["PROPERTY_TYPE"] == self._prepared_epc["property-type"]) ] - if self.construction_age_band not in DATA_ANOMALY_MATCHES: + if ( + (self.construction_age_band not in DATA_ANOMALY_MATCHES) and + (self.construction_age_band in result["CONSTRUCTION_AGE_BAND"].values) + ): result = result[ (result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band) ] if ( self._prepared_epc["built-form"] not in DATA_ANOMALY_MATCHES - and self._prepared_epc["built-form"] in result["BUILT_FORM"] + and self._prepared_epc["built-form"] in result["BUILT_FORM"].values ): result = result[(result["BUILT_FORM"] == self._prepared_epc["built-form"])] @@ -935,7 +972,7 @@ class EPCRecord: self._prepared_epc["unheated-corridor-length"] = ( float(self._prepared_epc["unheated-corridor-length"]) - if self._prepared_epc["unheated-corridor-length"] not in ["", None] + if self._prepared_epc["unheated-corridor-length"] not in DATA_ANOMALY_MATCHES else None )