Added more robust address selection

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-02 11:55:23 +01:00
parent 1db6dfebdf
commit 3b3c6c3cc4

View file

@ -1,4 +1,6 @@
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import requests
import inspect
@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_pickle_to_s3
src_file_path = inspect.getfile(lambda: None)
@ -18,7 +21,13 @@ SEARCH_POSTCODE_URL = (
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
def calculate_expiry_date(lodgement_date):
    """
    Work out the expiry date of an EPC from its lodgement date.

    The expiry is computed as ten years after the lodgement date, less one day.

    :param lodgement_date: lodgement date as a 'YYYY-MM-DD' string
    :return: the expiry date as '<day> <month name> <year>' with no leading zero on the day,
        e.g. '1 July 2034'
    :raises ValueError: if lodgement_date does not match the '%Y-%m-%d' format
    """
    lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d')
    expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1)
    # The returned string is compared for exact equality with the date text scraped from the
    # search results page, so the day must not be zero-padded ('%d' would give "01 July 2034").
    # '%-d' is not portable across platforms, hence the day is formatted from the integer.
    # NOTE(review): assumes the site renders days without a leading zero (GOV.UK date style) -
    # confirm against a live search result.
    return f"{expiry_date_dt.day} {expiry_date_dt.strftime('%B %Y')}"
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_expiry_date: str):
"""
For a post code and address, we pull out all the required data from the find my epc website
"""
@ -31,22 +40,52 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
postcode_response = requests.get(postcode_search, headers=headers)
postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'})
address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in
address_links_full}
address_cleaned = address.replace(",", "").replace(" ", "").lower()
address_links_cleaned = [
x.replace(",", "").replace(" ", "").lower() for x in list(address_links.keys())
]
postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
rows = postcode_res.find_all('tr', class_='govuk-table__row')
index_of_address = [key.startswith(address_cleaned) for key in address_links_cleaned]
if sum(index_of_address) > 1:
# If we have two or more addresses, we can't be sure which one is the correct one so we exit for simplicity
extracted_table = []
for row in rows:
# Extract the address and URL
address_tag = row.find('a', class_='govuk-link')
if address_tag is None:
continue
extracted_address = None
extracted_address_url = None
if address_tag:
extracted_address = address_tag.text.strip()
extracted_address_url = address_tag['href']
extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
if not extracted_address_cleaned.startswith(address_cleaned):
continue
# If the address is a match, we can extract the data
# Extract the expiry date
expiry_date_tag = row.find('td', class_='govuk-table__cell date')
expiry_date = None
if expiry_date_tag is not None:
expiry_date = expiry_date_tag.parent.find('span').text.strip()
extracted_table.append(
{
"extracted_address": extracted_address,
"extracted_address_url": extracted_address_url,
"expiry_date": expiry_date
}
)
extracted_table = [entry for entry in extracted_table if entry['expiry_date'] == expected_expiry_date]
if len(extracted_table) > 1:
print("Multiple candidates found, skipping for now")
return None
chosen_epc = address_links[list(address_links.keys())[np.where(index_of_address)[0][0]]]
if not extracted_table:
raise Exception("Fix me")
chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
epc_certificate = chosen_epc.split('/')[-1]
address_response = requests.get(chosen_epc, headers=headers)
@ -83,7 +122,6 @@ def app():
:return:
"""
cleaned_data = {}
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
sample_size = 100
@ -96,6 +134,10 @@ def app():
# Take just date before the date threshold
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
data = data[~pd.isnull(data["uprn"])]
# Take just the newest EPC per uprn, based on lodgement-date
data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
data = data.sample(sample_size)
# We use the address data to find the related information
@ -107,12 +149,23 @@ def app():
uprn = int(property_data["uprn"])
address = property_data["address1"]
postcode = property_data["postcode"]
expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
response = retrieve_find_my_epc_data(
uprn=uprn,
postcode=postcode,
address=address
address=address,
expected_expiry_date=expected_expiry_date
)
if response is None:
continue
collected_data.append(response)
energy_consumption_data.extend(energy_consumption_data)
energy_consumption_data.extend(collected_data)
# Store the pickle in s3
save_time = datetime.now()
save_pickle_to_s3(
energy_consumption_data, bucket_name="retrofit-datalake-dev",
s3_file_name=f"energy_consumption_data/{save_time}.pkl"
)