From 110cb8070ce78823d2bd9edcca5d5d95222a9da4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 10 Dec 2025 18:42:25 +0000 Subject: [PATCH 1/2] Increased concurrency of backend --- etl/customers/lincs_rural/prepare_data.py | 71 ++++++++++++++++--- .../data_cleanse.py | 6 ++ etl/find_my_epc/RetrieveFindMyEpc.py | 13 ++-- serverless.yml | 2 +- 4 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py diff --git a/etl/customers/lincs_rural/prepare_data.py b/etl/customers/lincs_rural/prepare_data.py index db7a9087..675179a8 100644 --- a/etl/customers/lincs_rural/prepare_data.py +++ b/etl/customers/lincs_rural/prepare_data.py @@ -1,8 +1,15 @@ """ Rough script to prepare the data for Lincs Rural project """ +from tqdm import tqdm import pandas as pd +import os +from dotenv import load_dotenv from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") data = pd.read_excel( "/Users/khalimconn-kowlessar/Downloads/MASTER LIST EPCS UPDATED November 2025 Domna Homes.xlsx", @@ -11,16 +18,58 @@ data = pd.read_excel( # We have property RRNs - we need UPRN -for _, x in data.iterrows(): - rrn = x["EPC Ref."] +standardised_ara_list = [] +missed = [] +for _, x in tqdm(data.iterrows(), total=len(data)): + try: + rrn = x["EPC Ref."] - # Fetch from find my epc - retriever = RetrieveFindMyEpc( - address="", - postcode="", - rrn=rrn, - address_postal_town="", - sap_rating=x["Actual"] - ) + # Fetch from find my epc + retriever = RetrieveFindMyEpc( + address="", + postcode="", + rrn=rrn, + address_postal_town="", + ) - find_epc_data = retriever.retrieve_all_find_my_epc_data() + find_epc_data = retriever.retrieve_newest_find_my_epc_data(rrn=rrn) + + # Find the UPRN + epc_searcher = SearchEpc( + address1=str(find_epc_data["address1"]), 
+ postcode=str(find_epc_data["postcode"]), + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=False, + full_address=",".join([find_epc_data["address1"], find_epc_data["address2"]]), + max_retries=5, + ) + epc_searcher.find_property(skip_os=True) + + # Append in format we need + # Stuff we need: + standardised_ara_list.append( + { + "landlord_property_id": x["Property Ref."], + "landlord_property_type": epc_searcher.newest_epc.get("property-type"), + "landlord_built_form": epc_searcher.newest_epc.get("built-form"), + "landlord_heating_system": epc_searcher.newest_epc.get("mainheat-description", ""), + "epc_os_uprn": epc_searcher.newest_epc.get("uprn"), + "domna_property_id": x["Property Ref."], + "domna_full_address": epc_searcher.newest_epc.get( + "address", ", ".join([ + find_epc_data["address1"], + find_epc_data["address2"], + ]) + ), + } + ) + except Exception as e: + missed.append({ + "property_ref": x["Property Ref."], + "rrn": x["EPC Ref."], + "error": str(e) + }) + +missed_df = pd.DataFrame(missed) diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py b/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py new file mode 100644 index 00000000..a1be533d --- /dev/null +++ b/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py @@ -0,0 +1,6 @@ +""" +We have found, within the Peabody data, a large volume of properties with missing and incorrect +UPRNs and incorrect address data. 
We want to flag these records and also find the missing ones where we can + +We also have duplicate UPRNS that should be flagged +""" diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index eb330948..cf6659f9 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -465,12 +465,13 @@ class RetrieveFindMyEpc: potential_rating = ratings.split(".")[1] current_sap = int(current_rating.split(' ')[-1]) - if current_sap != self.sap_rating: - # This means we likely have the wrong data. If we are in this scenario, we return nothing - return { - "epc_certificate": None, - "page_source": None, - } + if self.sap_rating: + if current_sap != self.sap_rating: + # This means we likely have the wrong data. If we are in this scenario, we return nothing + return { + "epc_certificate": None, + "page_source": None, + } # Retrieve the energy consumption bills = address_res.find('div', {'id': 'bills-affected'}) diff --git a/serverless.yml b/serverless.yml index d2d8f50a..38d8da89 100644 --- a/serverless.yml +++ b/serverless.yml @@ -66,7 +66,7 @@ functions: - sqs: arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue batchSize: 1 - maximumConcurrency: 5 # Heavily restricts concurrency to avoid overwhelming the ldmbda limits + maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the Lambda limits resources: From 8f7e9e0bdece3a0073aff017d32ebcfa3d6050a1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 10 Dec 2025 19:17:19 +0000 Subject: [PATCH 2/2] simplified fuel code --- asset_list/app.py | 34 +++++++++++++++++++++++ backend/Property.py | 24 +++------------- backend/SearchEpc.py | 2 +- etl/customers/lincs_rural/prepare_data.py | 16 +++++++++++ 4 files changed, 55 insertions(+), 21 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index cbb2cd93..dfd7aa46 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -59,6 +59,40 @@ def app(): 
Property UPRN """ + # Lambeth: + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" + data_filename = "lambeth_sw2_leigham court estate.xlsx" + sheet_name = "Sheet1" + postcode_column = 'Postcode' + address1_column = "Address" + address1_method = None + fulladdress_column = None + address_cols_to_concat = ["Address"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "row_id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + # Maps addresses to uprn in problematic cases manual_uprn_map = {} diff --git a/backend/Property.py b/backend/Property.py index cbcb9aa3..31991702 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1416,30 +1416,14 @@ class Property: if not self.is_ashp_valid(measures=["air_source_heat_pump"]): return self.current_energy_consumption - # If the property currently has an electric boiler, it will still benefit from the ASHP efficiency gain - remap_fuel_sources = [ - "Natural Gas", "LPG", "Wood Logs", "Oil", "Electricity", "Coal", "Smokeless Fuel", - "Natural Gas + Solar Thermal", "Anthracite", "Wood Pellets", "LPG + Solar Thermal", - "Natural Gas (Community Scheme)" - ] - - heating_energy_source = self.heating_energy_source - hot_water_energy_source = self.hot_water_energy_source heating_consumption = self.energy_consumption_estimates["unadjusted"]["heating"] hotwater_consumption = 
self.energy_consumption_estimates["unadjusted"]["hot_water"] - if (heating_energy_source not in remap_fuel_sources) or ( - hot_water_energy_source not in remap_fuel_sources + ["Electricity + Solar Thermal"] - ): - raise NotImplementedError("Have not implemented estimating electrical consumption for this fuel type") + # Adjust the heating consumption to reflect the expected efficiency of an ASHP - broadly 3.0 COP + heating_consumption = heating_consumption / (assumed_ashp_efficiency / 100) - if heating_energy_source in remap_fuel_sources: - # Adjust the heating consumption to reflect the expected efficiency of an ASHP - heating_consumption = heating_consumption / (assumed_ashp_efficiency / 100) - - if hot_water_energy_source in remap_fuel_sources: - # Adjust the hot water consumption to reflect the expected efficiency of an ASHP - hotwater_consumption = hotwater_consumption / (assumed_ashp_efficiency / 100) + # Adjust the hot water consumption to reflect the expected efficiency of an ASHP + hotwater_consumption = hotwater_consumption / (assumed_ashp_efficiency / 100) electric_consumption = ( heating_consumption + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index a193a65f..cb465239 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -563,7 +563,7 @@ class SearchEpc: uprn = hash(self.address1 + self.postcode) if self.fast: - return newest_epc, [], {}, "", "", None + return newest_epc, [], {}, "", "", None, "" # Retrieve postcode and address address_epc, postcode_epc, address_postal_town = self.format_address(newest_epc=newest_epc) diff --git a/etl/customers/lincs_rural/prepare_data.py b/etl/customers/lincs_rural/prepare_data.py index 675179a8..0a3be7fe 100644 --- a/etl/customers/lincs_rural/prepare_data.py +++ b/etl/customers/lincs_rural/prepare_data.py @@ -52,6 +52,8 @@ for _, x in tqdm(data.iterrows(), total=len(data)): standardised_ara_list.append( { "landlord_property_id": x["Property Ref."], + "domna_address_1": find_epc_data["address1"], 
+ "postcode": find_epc_data["postcode"], "landlord_property_type": epc_searcher.newest_epc.get("property-type"), "landlord_built_form": epc_searcher.newest_epc.get("built-form"), "landlord_heating_system": epc_searcher.newest_epc.get("mainheat-description", ""), @@ -73,3 +75,17 @@ for _, x in tqdm(data.iterrows(), total=len(data)): }) missed_df = pd.DataFrame(missed) + +# Store +standardised_ara_df = pd.DataFrame(standardised_ara_list) +standardised_ara_df.to_excel( + "/Users/khalimconn-kowlessar/Downloads/lincs_rural_standardised_ara_nov_2025.xlsx", + index=False, + sheet_name="Standardised Asset List" +) +# Store missed +missed_df.to_excel( + "/Users/khalimconn-kowlessar/Downloads/lincs_rural_missed_nov_2025.xlsx", + index=False, + sheet_name="Missed Properties" +)