increased concurrency of backend

This commit is contained in:
Khalim Conn-Kowlessar 2025-12-10 18:42:25 +00:00
parent b271ad5c97
commit 110cb8070c
4 changed files with 74 additions and 18 deletions

View file

@ -1,8 +1,15 @@
"""
Rough script to prepare the data for Lincs Rural project
"""
from tqdm import tqdm
import pandas as pd
import os
from dotenv import load_dotenv
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
from backend.SearchEpc import SearchEpc
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
data = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/MASTER LIST EPCS UPDATED November 2025 Domna Homes.xlsx",
@ -11,16 +18,58 @@ data = pd.read_excel(
# We have property RRNs - we need UPRN
for _, x in data.iterrows():
rrn = x["EPC Ref."]
standardised_ara_list = []
missed = []
for _, x in tqdm(data.iterrows(), total=len(data)):
try:
rrn = x["EPC Ref."]
# Fetch from find my epc
retriever = RetrieveFindMyEpc(
address="",
postcode="",
rrn=rrn,
address_postal_town="",
sap_rating=x["Actual"]
)
# Fetch from find my epc
retriever = RetrieveFindMyEpc(
address="",
postcode="",
rrn=rrn,
address_postal_town="",
)
find_epc_data = retriever.retrieve_all_find_my_epc_data()
find_epc_data = retriever.retrieve_newest_find_my_epc_data(rrn=rrn)
# Find the UPRN
epc_searcher = SearchEpc(
address1=str(find_epc_data["address1"]),
postcode=str(find_epc_data["postcode"]),
auth_token=EPC_AUTH_TOKEN,
os_api_key="",
property_type=None,
fast=False,
full_address=",".join([find_epc_data["address1"], find_epc_data["address2"]]),
max_retries=5,
)
epc_searcher.find_property(skip_os=True)
# Append in format we need
# Stuff we need:
standardised_ara_list.append(
{
"landlord_property_id": x["Property Ref."],
"landlord_property_type": epc_searcher.newest_epc.get("property-type"),
"landlord_built_form": epc_searcher.newest_epc.get("built-form"),
"landlord_heating_system": epc_searcher.newest_epc.get("mainheat-description", ""),
"epc_os_uprn": epc_searcher.newest_epc.get("uprn"),
"domna_property_id": x["Property Ref."],
"domna_full_address": epc_searcher.newest_epc.get(
"address", ", ".join([
find_epc_data["address1"],
find_epc_data["address2"],
])
),
}
)
except Exception as e:
missed.append({
"property_ref": x["Property Ref."],
"rrn": x["EPC Ref."],
"error": str(e)
})
missed_df = pd.DataFrame(missed)

View file

@ -0,0 +1,6 @@
"""
We have found, within the Peabody data, a large volume of properties with missing or incorrect
UPRNs and incorrect address data. We want to flag these records and also fill in the missing UPRNs where we can.
We also have duplicate UPRNs that should be flagged.
"""

View file

@ -465,12 +465,13 @@ class RetrieveFindMyEpc:
potential_rating = ratings.split(".")[1]
current_sap = int(current_rating.split(' ')[-1])
if current_sap != self.sap_rating:
# This means we likely have the wrong data. If we are in this scenario, we return nothing
return {
"epc_certificate": None,
"page_source": None,
}
if self.sap_rating:
if current_sap != self.sap_rating:
# This means we likely have the wrong data. If we are in this scenario, we return nothing
return {
"epc_certificate": None,
"page_source": None,
}
# Retrieve the energy consumption
bills = address_res.find('div', {'id': 'bills-affected'})

View file

@ -66,7 +66,7 @@ functions:
- sqs:
arn: arn:aws:sqs:${self:provider.region}:${aws:accountId}:model-engine-queue
batchSize: 1
maximumConcurrency: 5 # Heavily restricts concurrency to avoid overwhelming the Lambda limits
maximumConcurrency: 10 # Heavily restricts concurrency to avoid overwhelming the Lambda limits
resources: