From 110cb8070ce78823d2bd9edcca5d5d95222a9da4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 10 Dec 2025 18:42:25 +0000 Subject: [PATCH] [Cincreased concurrency of backend --- etl/customers/lincs_rural/prepare_data.py | 71 ++++++++++++++++--- .../data_cleanse.py | 6 ++ etl/find_my_epc/RetrieveFindMyEpc.py | 13 ++-- serverless.yml | 2 +- 4 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py diff --git a/etl/customers/lincs_rural/prepare_data.py b/etl/customers/lincs_rural/prepare_data.py index db7a9087..675179a8 100644 --- a/etl/customers/lincs_rural/prepare_data.py +++ b/etl/customers/lincs_rural/prepare_data.py @@ -1,8 +1,15 @@ """ Rough script to prepare the data for Lincs Rural project """ +from tqdm import tqdm import pandas as pd +import os +from dotenv import load_dotenv from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") data = pd.read_excel( "/Users/khalimconn-kowlessar/Downloads/MASTER LIST EPCS UPDATED November 2025 Domna Homes.xlsx", @@ -11,16 +18,58 @@ data = pd.read_excel( # We have property RRNs - we need UPRN -for _, x in data.iterrows(): - rrn = x["EPC Ref."] +standardised_ara_list = [] +missed = [] +for _, x in tqdm(data.iterrows(), total=len(data)): + try: + rrn = x["EPC Ref."] - # Fetch from find my epc - retriever = RetrieveFindMyEpc( - address="", - postcode="", - rrn=rrn, - address_postal_town="", - sap_rating=x["Actual"] - ) + # Fetch from find my epc + retriever = RetrieveFindMyEpc( + address="", + postcode="", + rrn=rrn, + address_postal_town="", + ) - find_epc_data = retriever.retrieve_all_find_my_epc_data() + find_epc_data = retriever.retrieve_newest_find_my_epc_data(rrn=rrn) + + # Find the UPRN + epc_searcher = SearchEpc( + address1=str(find_epc_data["address1"]), + 
postcode=str(find_epc_data["postcode"]), + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=False, + full_address=",".join([find_epc_data["address1"], find_epc_data["address2"]]), + max_retries=5, + ) + epc_searcher.find_property(skip_os=True) + + # Append in format we need + # Stuff we need: + standardised_ara_list.append( + { + "landlord_property_id": x["Property Ref."], + "landlord_property_type": epc_searcher.newest_epc.get("property-type"), + "landlord_built_form": epc_searcher.newest_epc.get("built-form"), + "landlord_heating_system": epc_searcher.newest_epc.get("mainheat-description", ""), + "epc_os_uprn": epc_searcher.newest_epc.get("uprn"), + "domna_property_id": x["Property Ref."], + "domna_full_address": epc_searcher.newest_epc.get( + "address", ", ".join([ + find_epc_data["address1"], + find_epc_data["address2"], + ]) + ), + } + ) + except Exception as e: + missed.append({ + "property_ref": x["Property Ref."], + "rrn": x["EPC Ref."], + "error": str(e) + }) + +missed_df = pd.DataFrame(missed) diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py b/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py new file mode 100644 index 00000000..a1be533d --- /dev/null +++ b/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py @@ -0,0 +1,6 @@ +""" +We have found, within the Peabody data, a large volume of properties with missing and incorrect +UPRNS and incorrect address data. 
We want to flag these records and also find the missing ones where we can + +We also have duplicate UPRNS that should be flagged +""" diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index eb330948..cf6659f9 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -465,12 +465,13 @@ class RetrieveFindMyEpc: potential_rating = ratings.split(".")[1] current_sap = int(current_rating.split(' ')[-1]) - if current_sap != self.sap_rating: - # This means we likely have the wrong data. If we are in this scenario, we return nothing - return { - "epc_certificate": None, - "page_source": None, - } + if self.sap_rating: + if current_sap != self.sap_rating: + # This means we likely have the wrong data. If we are in this scenario, we return nothing + return { + "epc_certificate": None, + "page_source": None, + } # Retrieve the energy consumption bills = address_res.find('div', {'id': 'bills-affected'}) diff --git a/serverless.yml b/serverless.yml index d2d8f50a..38d8da89 100644 --- a/serverless.yml +++ b/serverless.yml @@ -66,7 +66,7 @@ functions: - sqs: arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue batchSize: 1 - maximumConcurrency: 5 # Heavily restricts concurrency to avoid overwhelming the ldmbda limits + maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the lambda limits resources: