From 3b3c6c3cc4bd8e028efef268ac1ef797e72134ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 2 Jul 2024 11:55:23 +0100 Subject: [PATCH] Added more robust address selection --- etl/bill_savings/data_collection.py | 85 +++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 22b12c6e..793c13c4 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -1,4 +1,6 @@ import time +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta import requests import inspect @@ -8,6 +10,7 @@ from bs4 import BeautifulSoup from etl.epc.settings import EARLIEST_EPC_DATE from pathlib import Path import numpy as np +from utils.s3 import save_pickle_to_s3 src_file_path = inspect.getfile(lambda: None) @@ -18,7 +21,13 @@ SEARCH_POSTCODE_URL = ( BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk" -def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str): +def calculate_expiry_date(lodgement_date): + lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d') + expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1) + return expiry_date_dt.strftime('%d %B %Y') + + +def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_expiry_date: str): """ For a post code and address, we pull out all the required data from the find my epc website """ @@ -31,22 +40,52 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str): postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input) postcode_response = requests.get(postcode_search, headers=headers) - postcode_res = BeautifulSoup(postcode_response.text, features="html.parser") - address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'}) - address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in - address_links_full} - address_cleaned = address.replace(",", "").replace(" ", "").lower() - address_links_cleaned = [ - x.replace(",", "").replace(" ", "").lower() for x in list(address_links.keys()) - ] + postcode_res = BeautifulSoup(postcode_response.text, features="html.parser") + rows = postcode_res.find_all('tr', class_='govuk-table__row') - index_of_address = [key.startswith(address_cleaned) for key in address_links_cleaned] - if sum(index_of_address) > 1: - # If we have two or more addresses, we can't be sure which one is the correct one so we exit for simplicity + extracted_table = [] + for row in rows: + # Extract the address and URL + address_tag = row.find('a', class_='govuk-link') + if address_tag is None: + continue + extracted_address = None + extracted_address_url = None + if address_tag: + extracted_address = address_tag.text.strip() + extracted_address_url = address_tag['href'] + + extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower() + if not extracted_address_cleaned.startswith(address_cleaned): + continue + + # If the address is a match, we can extract the data + + # Extract the expiry date + expiry_date_tag = row.find('td', class_='govuk-table__cell date') + expiry_date = None + if expiry_date_tag is not None: + expiry_date = expiry_date_tag.parent.find('span').text.strip() + + extracted_table.append( + { + "extracted_address": extracted_address, + "extracted_address_url": extracted_address_url, + "expiry_date": expiry_date + } + ) + + extracted_table = [entry for entry in extracted_table if entry['expiry_date'] == expected_expiry_date] + + if len(extracted_table) > 1: + print("Multiple candidates found, skipping for now") return None - chosen_epc = address_links[list(address_links.keys())[np.where(index_of_address)[0][0]]] + if not extracted_table: + raise Exception("Fix me") + + chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url'] epc_certificate = chosen_epc.split('/')[-1] address_response = requests.get(chosen_epc, headers=headers) @@ -83,7 +122,6 @@ def app(): :return: """ - cleaned_data = {} epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] sample_size = 100 @@ -96,6 +134,10 @@ def app(): # Take just date before the date threshold data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] + data = data[~pd.isnull(data["uprn"])] + # Take just the newest EPC per uprn, based on lodgement-date + data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") + data = data.sample(sample_size) # We use the addreess data to find the related information @@ -107,12 +149,23 @@ def app(): uprn = int(property_data["uprn"]) address = property_data["address1"] postcode = property_data["postcode"] + expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"]) response = retrieve_find_my_epc_data( uprn=uprn, postcode=postcode, - address=address + address=address, + expected_expiry_date=expected_expiry_date ) + if response is None: + continue collected_data.append(response) - energy_consumption_data.extend(energy_consumption_data) + energy_consumption_data.extend(collected_data) + + # Store the pickle in s3 + save_time = datetime.now() + save_pickle_to_s3( + energy_consumption_data, bucket_name="retrofit-datalake-dev", + s3_file_name=f"energy_consumption_data/{save_time}.pkl" + )