Added more robust address selection

2026-07-27 23:35:01 +00:00 · 2024-07-02 11:55:23 +01:00 · 2024-07-02 11:55:23 +01:00 · 3b3c6c3cc4
commit 3b3c6c3cc4
parent 1db6dfebdf
1 changed files with 69 additions and 16 deletions
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -1,4 +1,6 @@
 import time
+from datetime import datetime, timedelta
+from dateutil.relativedelta import relativedelta

 import requests
 import inspect
@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
 from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 import numpy as np
+from utils.s3 import save_pickle_to_s3

 src_file_path = inspect.getfile(lambda: None)

@ -18,7 +21,13 @@ SEARCH_POSTCODE_URL = (
 BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"


-def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
+def calculate_expiry_date(lodgement_date):  # 'YYYY-MM-DD' lodgement date string in, '%d %B %Y' expiry date string out
+    lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d')  # raises ValueError if the string is not in this format
+    expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1)  # ten calendar years on, then back one day
+    return expiry_date_dt.strftime('%d %B %Y')  # %d zero-pads the day, e.g. '01 July 2034' (English locale); NOTE(review): callers compare this by string equality - confirm the scraped format matches
+
+
+def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_expiry_date: str):
    """
    For a post code and address, we pull out all the required data from the find my epc website
    """
@ -31,22 +40,52 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_response = requests.get(postcode_search, headers=headers)

-    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
-    address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'})
-    address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in
-                     address_links_full}
-
    address_cleaned = address.replace(",", "").replace(" ", "").lower()
-    address_links_cleaned = [
-        x.replace(",", "").replace(" ", "").lower() for x in list(address_links.keys())
-    ]
+    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
+    rows = postcode_res.find_all('tr', class_='govuk-table__row')

-    index_of_address = [key.startswith(address_cleaned) for key in address_links_cleaned]
-    if sum(index_of_address) > 1:
-        # If we have two or more addresses, we can't be sure which one is the correct one so we exit for simplicity
+    extracted_table = []
+    for row in rows:
+        # Extract the address and URL
+        address_tag = row.find('a', class_='govuk-link')
+        if address_tag is None:
+            continue
+        extracted_address = None
+        extracted_address_url = None
+        if address_tag:
+            extracted_address = address_tag.text.strip()
+            extracted_address_url = address_tag['href']
+
+            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
+            if not extracted_address_cleaned.startswith(address_cleaned):
+                continue
+
+            # If the address is a match, we can extract the data
+
+        # Extract the expiry date
+        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
+        expiry_date = None
+        if expiry_date_tag is not None:
+            expiry_date = expiry_date_tag.parent.find('span').text.strip()
+
+        extracted_table.append(
+            {
+                "extracted_address": extracted_address,
+                "extracted_address_url": extracted_address_url,
+                "expiry_date": expiry_date
+            }
+        )
+
+    extracted_table = [entry for entry in extracted_table if entry['expiry_date'] == expected_expiry_date]
+
+    if len(extracted_table) > 1:
+        print("Multiple candidates found, skipping for now")
        return None
-    chosen_epc = address_links[list(address_links.keys())[np.where(index_of_address)[0][0]]]

+    if not extracted_table:
+        raise Exception("Fix me")
+
+    chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers)
@ -83,7 +122,6 @@ def app():
    :return:
    """

-    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    sample_size = 100
@ -96,6 +134,10 @@ def app():
        # Keep only rows lodged on or after the earliest-date threshold
        data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

+        data = data[~pd.isnull(data["uprn"])]
+        # Take just the newest EPC per uprn, based on lodgement-date
+        data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
+
        data = data.sample(sample_size)
        # We use the address data to find the related information

@ -107,12 +149,23 @@ def app():
            uprn = int(property_data["uprn"])
            address = property_data["address1"]
            postcode = property_data["postcode"]
+            expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])

            response = retrieve_find_my_epc_data(
                uprn=uprn,
                postcode=postcode,
-                address=address
+                address=address,
+                expected_expiry_date=expected_expiry_date
            )
+            if response is None:
+                continue
            collected_data.append(response)

-        energy_consumption_data.extend(energy_consumption_data)
+        energy_consumption_data.extend(collected_data)
+
+    # Store the pickle in s3
+    save_time = datetime.now()
+    save_pickle_to_s3(
+        energy_consumption_data, bucket_name="retrofit-datalake-dev",
+        s3_file_name=f"energy_consumption_data/{save_time}.pkl"
+    )