mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added more robust address selection
This commit is contained in:
parent
1db6dfebdf
commit
3b3c6c3cc4
1 changed files with 69 additions and 16 deletions
|
|
@ -1,4 +1,6 @@
|
|||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
import requests
|
||||
import inspect
|
||||
|
|
@ -8,6 +10,7 @@ from bs4 import BeautifulSoup
|
|||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
from utils.s3 import save_pickle_to_s3
|
||||
|
||||
src_file_path = inspect.getfile(lambda: None)
|
||||
|
||||
|
|
@ -18,7 +21,13 @@ SEARCH_POSTCODE_URL = (
|
|||
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
|
||||
|
||||
|
||||
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
|
||||
def calculate_expiry_date(lodgement_date):
    """
    Derive the expected expiry date string for a certificate from its lodgement date.

    The expiry is taken to be ten calendar years after lodgement, minus one day.

    :param lodgement_date: lodgement date as a 'YYYY-MM-DD' string
    :return: the expiry date formatted as day, full month name and year, e.g. '14 January 2030'

    NOTE(review): '%d' zero-pads the day (e.g. '04 March 2030'). The result is compared
    verbatim against the date text scraped from the search results page, so confirm the
    site renders single-digit days the same way.
    """
    lodged_on = datetime.strptime(lodgement_date, '%Y-%m-%d')
    # Roll forward ten calendar years first, then step back a single day
    ten_years_on = lodged_on + relativedelta(years=10)
    final_valid_day = ten_years_on - timedelta(days=1)
    return f"{final_valid_day:%d %B %Y}"
|
||||
|
||||
|
||||
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_expiry_date: str):
|
||||
"""
|
||||
For a post code and address, we pull out all the required data from the find my epc website
|
||||
"""
|
||||
|
|
@ -31,22 +40,52 @@ def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
|
|||
postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
|
||||
postcode_response = requests.get(postcode_search, headers=headers)
|
||||
|
||||
postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
|
||||
address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'})
|
||||
address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in
|
||||
address_links_full}
|
||||
|
||||
address_cleaned = address.replace(",", "").replace(" ", "").lower()
|
||||
address_links_cleaned = [
|
||||
x.replace(",", "").replace(" ", "").lower() for x in list(address_links.keys())
|
||||
]
|
||||
postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
|
||||
rows = postcode_res.find_all('tr', class_='govuk-table__row')
|
||||
|
||||
index_of_address = [key.startswith(address_cleaned) for key in address_links_cleaned]
|
||||
if sum(index_of_address) > 1:
|
||||
# If we have two or more addresses, we can't be sure which one is the correct one so we exit for simplicity
|
||||
extracted_table = []
|
||||
for row in rows:
|
||||
# Extract the address and URL
|
||||
address_tag = row.find('a', class_='govuk-link')
|
||||
if address_tag is None:
|
||||
continue
|
||||
extracted_address = None
|
||||
extracted_address_url = None
|
||||
if address_tag:
|
||||
extracted_address = address_tag.text.strip()
|
||||
extracted_address_url = address_tag['href']
|
||||
|
||||
extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
|
||||
if not extracted_address_cleaned.startswith(address_cleaned):
|
||||
continue
|
||||
|
||||
# If the address is a match, we can extract the data
|
||||
|
||||
# Extract the expiry date
|
||||
expiry_date_tag = row.find('td', class_='govuk-table__cell date')
|
||||
expiry_date = None
|
||||
if expiry_date_tag is not None:
|
||||
expiry_date = expiry_date_tag.parent.find('span').text.strip()
|
||||
|
||||
extracted_table.append(
|
||||
{
|
||||
"extracted_address": extracted_address,
|
||||
"extracted_address_url": extracted_address_url,
|
||||
"expiry_date": expiry_date
|
||||
}
|
||||
)
|
||||
|
||||
extracted_table = [entry for entry in extracted_table if entry['expiry_date'] == expected_expiry_date]
|
||||
|
||||
if len(extracted_table) > 1:
|
||||
print("Multiple candidates found, skipping for now")
|
||||
return None
|
||||
chosen_epc = address_links[list(address_links.keys())[np.where(index_of_address)[0][0]]]
|
||||
|
||||
if not extracted_table:
|
||||
raise Exception("Fix me")
|
||||
|
||||
chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
|
||||
epc_certificate = chosen_epc.split('/')[-1]
|
||||
|
||||
address_response = requests.get(chosen_epc, headers=headers)
|
||||
|
|
@ -83,7 +122,6 @@ def app():
|
|||
:return:
|
||||
"""
|
||||
|
||||
cleaned_data = {}
|
||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
sample_size = 100
|
||||
|
|
@ -96,6 +134,10 @@ def app():
|
|||
# Keep only records lodged on or after the earliest-date threshold
|
||||
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
data = data[~pd.isnull(data["uprn"])]
|
||||
# Take just the newest EPC per uprn, based on lodgement-date
|
||||
data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
|
||||
|
||||
data = data.sample(sample_size)
|
||||
# We use the address data to find the related information
|
||||
|
||||
|
|
@ -107,12 +149,23 @@ def app():
|
|||
uprn = int(property_data["uprn"])
|
||||
address = property_data["address1"]
|
||||
postcode = property_data["postcode"]
|
||||
expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
|
||||
|
||||
response = retrieve_find_my_epc_data(
|
||||
uprn=uprn,
|
||||
postcode=postcode,
|
||||
address=address
|
||||
address=address,
|
||||
expected_expiry_date=expected_expiry_date
|
||||
)
|
||||
if response is None:
|
||||
continue
|
||||
collected_data.append(response)
|
||||
|
||||
energy_consumption_data.extend(energy_consumption_data)
|
||||
energy_consumption_data.extend(collected_data)
|
||||
|
||||
# Store the pickle in s3
|
||||
save_time = datetime.now()
|
||||
save_pickle_to_s3(
|
||||
energy_consumption_data, bucket_name="retrofit-datalake-dev",
|
||||
s3_file_name=f"energy_consumption_data/{save_time}.pkl"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue