From 9e7ed1efd5b5cbc99a536023e8f7401b1f227c45 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 1 Dec 2025 09:35:07 +0000 Subject: [PATCH] making epc searching more specific --- backend/SearchEpc.py | 14 +++++++-- etl/find_my_epc/RetrieveFindMyEpc.py | 43 ++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 686843c3..5ceac5f9 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -447,11 +447,19 @@ class SearchEpc: ] elif best_match1[1] > best_match2[1]: - # Get all of the scores - rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match1[0]] + # Get all of the scores - make sure we keep uprn + rows_filtered = [ + r for r in rows if + ( + (", ".join([r["address"], r["posttown"]]) == best_match1[0]) or + (str(r["uprn"]) == str(self.uprn)) + ) + ] else: # Get all of the scores - rows_filtered = [r for r in rows if r["address"] == best_match2[0]] + rows_filtered = [ + r for r in rows if (r["address"] == best_match2[0]) or (str(r["uprn"]) == str(self.uprn)) + ] # If we have multiple, we filter on newest lodgment date if len(rows_filtered) > 1: diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index a7767273..5bb0e89c 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -22,7 +22,9 @@ class RetrieveFindMyEpc: 'Chrome/111.0.0.0 Safari/537.36' } - def __init__(self, address: str, postcode: str, rrn: str = None, address_postal_town: str = ""): + def __init__( + self, address: str, postcode: str, rrn: str = None, address_postal_town: str = "", sap_rating: int = None + ): """ This class is tasked with retrieving the latest EPC data from the find my epc website :param address: The address of the property @@ -40,6 +42,8 @@ class RetrieveFindMyEpc: if self.address_postal_town: self.address_postal_town = self.address_postal_town.replace(",", "").replace(" ", "").lower() + self.sap_rating = sap_rating + @staticmethod def extract_low_carbon_sources(soup): # Find the section header @@ -351,7 +355,7 @@ class RetrieveFindMyEpc: postcode_res = BeautifulSoup(postcode_response.text, features="html.parser") rows = postcode_res.find_all('tr', class_='govuk-table__row') - extracted_table = [] + extracted_table, backup_flat = [], [] for row in rows: # Extract the address and URL address_tag = row.find('a', class_='govuk-link') @@ -373,6 +377,17 @@ class RetrieveFindMyEpc: ) if no_primary_match and no_backup_match: + if self.address_cleaned.startswith("flat"): + # We have a flat address, so we can try and match without the flat number + flat_removed_address = self.address_cleaned[4:] + if extracted_address_cleaned.startswith(flat_removed_address): + # We have a backup match + backup_flat.append( + { + "extracted_address": extracted_address, + "extracted_address_url": extracted_address_url, + } + ) continue # If the address is a match, we can extract the data @@ -391,9 +406,12 @@ class RetrieveFindMyEpc: } ) - if not extracted_table: + if not extracted_table and not backup_flat: raise ValueError("No EPC found") + if not extracted_table: + extracted_table = deepcopy(backup_flat) + if len(extracted_table) > 1: # We take the one with the most recent expiry date extracted_table = sorted(extracted_table, key=lambda x: x['expiry_date'], reverse=True) @@ -439,6 +457,12 @@ class RetrieveFindMyEpc: potential_rating = ratings.split(".")[1] current_sap = int(current_rating.split(' ')[-1]) + if current_sap != self.sap_rating: + raise ValueError( + f"SAP rating mismatch: expected {self.sap_rating}, got {current_sap} for address {self.address}, " + f"postcode {self.postcode}" + ) + # Retrieve the energy consumption bills = address_res.find('div', {'id': 'bills-affected'}) bills_list = bills.find_all('li') @@ -736,12 +760,15 @@ class RetrieveFindMyEpc: return formatted_recommendations @classmethod - def get_from_epc(cls, epc, epc_page_source=None, rrn=None, address_postal_town=None): + def get_from_epc(cls, epc, epc_page_source=None, rrn=None, address_postal_town=None, sap_rating=None): if epc_page_source is not None and rrn is None: raise ValueError("rrn must be provided if epc_page_source is provided") - searcher = cls(address=epc["address"], postcode=epc["postcode"], address_postal_town=address_postal_town) + searcher = cls( + address=epc["address"], postcode=epc["postcode"], address_postal_town=address_postal_town, + sap_rating=sap_rating + ) find_epc_data = searcher.retrieve_newest_find_my_epc_data(epc_page_source=epc_page_source, rrn=rrn) non_invasive_recommendations = { @@ -797,11 +824,15 @@ class RetrieveFindMyEpc: modified[k] = config_address attempts.append(modified) + sap_rating = float(epc["current-energy-efficiency"]) + # Iterate attempts last_error = None for idx, attempt in enumerate(attempts, start=1): try: - return cls.get_from_epc(attempt, epc_page, rrn=rrn, address_postal_town=address_postal_town) + return cls.get_from_epc( + attempt, epc_page, rrn=rrn, address_postal_town=address_postal_town, sap_rating=sap_rating + ) except Exception as e: last_error = e logger.error(f"Attempt {idx} failed: {e}")