mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
227 lines
8.3 KiB
Python
227 lines
8.3 KiB
Python
import time
|
||
from datetime import datetime, timedelta
|
||
from dateutil.relativedelta import relativedelta
|
||
|
||
import requests
|
||
import inspect
|
||
import pandas as pd
|
||
from tqdm import tqdm
|
||
from bs4 import BeautifulSoup
|
||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||
from pathlib import Path
|
||
import numpy as np
|
||
from utils.s3 import save_pickle_to_s3
|
||
|
||
# Root of the find-energy-certificate service; relative certificate links are appended to it.
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

# Postcode search page. ``postcode_input`` is substituted with ``str.format``.
SEARCH_POSTCODE_URL = (
    "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
)

# Location of this source file, taken from a lambda that is defined right here.
src_file_path = inspect.getfile(lambda: None)

# Folder next to this file whose sub-directories are iterated by ``app``.
EPC_DIRECTORY = Path(src_file_path).parent.joinpath("local_data", "all-domestic-certificates")
|
||
|
||
|
||
def calculate_expiry_date(lodgement_date):
    """
    Work out the expiry date of an EPC from its lodgement date, formatted like "9 March 2025" so it can be
    compared with the expiry date text scraped from the find my epc website.

    :param lodgement_date: lodgement date as a 'YYYY-MM-DD' string
    :return: the day before the tenth anniversary of the lodgement date, as '<day> <month name> <year>' with no
        leading zero on the day
    """
    lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d')
    expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1)
    # '%-d' (day without zero padding) is a platform-specific strftime extension that is rejected on Windows,
    # so the day number is taken from the datetime itself and only the month and year go through strftime.
    return f"{expiry_date_dt.day} {expiry_date_dt.strftime('%B %Y')}"
|
||
|
||
|
||
def retrieve_find_my_epc_data(
    uprn: int, postcode: str, address: str, expected_expiry_date: str, timeout: float = 30.0
):
    """
    For a post code and address, we pull out all the required data from the find my epc website.

    The postcode search page is scanned for table rows whose address starts with ``address`` (ignoring commas,
    spaces and case) and whose expiry date text equals ``expected_expiry_date``. When exactly one row matches,
    the linked certificate page is fetched and parsed.

    :param uprn: identifier of the property; copied into the result, not used for the lookup
    :param postcode: postcode to search for
    :param address: address text that a search result must start with to be considered
    :param expected_expiry_date: expiry date text the matching search result must show
    :param timeout: seconds to wait on each HTTP request before ``requests`` gives up
    :return: dict with the certificate id, ratings, bill texts and assessment details, or None when no search
        result or more than one search result matches
    :raises ValueError: if the certificate page lacks an expected section or assessment field
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    # Without a timeout, requests waits indefinitely on a stalled connection
    postcode_response = requests.get(postcode_search, headers=headers, timeout=timeout)

    address_cleaned = address.replace(",", "").replace(" ", "").lower()
    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
    rows = postcode_res.find_all('tr', class_='govuk-table__row')

    extracted_table = []
    for row in rows:
        # Extract the address and URL; rows without a link carry no certificate
        address_tag = row.find('a', class_='govuk-link')
        if address_tag is None:
            continue
        extracted_address = address_tag.text.strip()
        extracted_address_url = address_tag['href']

        extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
        if not extracted_address_cleaned.startswith(address_cleaned):
            continue

        # If the address is a match, we can extract the expiry date
        expiry_date = None
        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
        if expiry_date_tag is not None:
            expiry_date_span = expiry_date_tag.parent.find('span')
            # A row without the date text cannot match the expected date, so it is left as None
            if expiry_date_span is not None:
                expiry_date = expiry_date_span.text.strip()

        extracted_table.append(
            {
                "extracted_address": extracted_address,
                "extracted_address_url": extracted_address_url,
                "expiry_date": expiry_date
            }
        )

    extracted_table = [entry for entry in extracted_table if entry['expiry_date'] == expected_expiry_date]

    if len(extracted_table) > 1:
        print("Multiple candidates found, skipping for now")
        return None

    if not extracted_table:
        print("No candidates found, skipping for now")
        return None

    chosen_epc = BASE_ENERGY_URL + extracted_table[0]['extracted_address_url']
    # The last path segment of the certificate URL is kept as the certificate identifier
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=timeout)
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    ratings_tag = address_res.find('desc', {'id': 'svg-desc'})
    if ratings_tag is None:
        raise ValueError("Missing section: svg-desc")
    # The description is split into sentences: the first is the current rating, the second the potential rating
    rating_sentences = ratings_tag.text.split(".")
    current_rating = rating_sentences[0]
    potential_rating = rating_sentences[1]

    # Retrieve the energy consumption
    bills = address_res.find('div', {'id': 'bills-affected'})
    if bills is None:
        raise ValueError("Missing section: bills-affected")
    bills_list = bills.find_all('li')
    if not bills_list:
        # If this is the case, it's usually because the EPC was very old. Early EPCs did not have this information
        heating_text = None
        hot_water_text = None
    else:
        heating_text = bills_list[0].text
        hot_water_text = bills_list[1].text

    # Search for the assessment information
    assessment_information = address_res.find('div', {'id': 'information'})
    if assessment_information is None:
        raise ValueError("Missing section: information")
    # Parse this information
    rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
    # Create a dictionary to hold the parsed information
    assessment_data = {}
    for row in rows:
        key = row.find('dt').text.strip()
        if key == "Type of assessment":
            # We dont reliably extract this
            continue
        value_tag = row.find('dd')

        # Check if value contains a link (email)
        if value_tag.find('a'):
            value = value_tag.find('a').text.strip()
        elif value_tag.find('summary'):
            value = value_tag.find('span').text.strip()
        else:
            value = value_tag.text.strip()

        assessment_data[key] = value

    expected_keys = [
        'Assessor’s name', 'Telephone', 'Email', 'Accreditation scheme', 'Assessor’s ID', 'Assessor’s declaration',
        'Date of assessment', 'Date of certificate'
    ]
    # Check we have all the expected keys
    for key in expected_keys:
        if key not in assessment_data:
            raise ValueError(f"Missing key: {key}")

    # The rating letter is the sixth word from the end of each sentence and the score is the last word
    resulting_data = {
        'extracted_uprn': uprn,
        'extracted_address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_rating.split(' ')[-6],
        'current_epc_efficiency': int(current_rating.split(' ')[-1]),
        'potential_epc_rating': potential_rating.split(' ')[-6],
        "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
        **assessment_data
    }

    return resulting_data
|
||
|
||
|
||
def app(sample_size: int = 500, first_directory_index: int = 256):
    """
    This application is tasked with pulling a large quantity of data from the find my epc website, containing the
    estimated energy consumption for properties.

    For every sub-directory of EPC_DIRECTORY from ``first_directory_index`` onwards, the newest certificate per
    uprn is taken from certificates.csv, a random sample of properties is looked up on the website, and all
    collected results are pickled to s3 at the end.

    :param sample_size: maximum number of properties to look up per directory
    :param first_directory_index: directories with a lower position in the listing are skipped
    :return:
    """

    # NOTE(review): iterdir() does not guarantee an order, so the positions used for skipping may differ
    # between machines or runs - confirm this is acceptable
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        # Skip the directories before the starting position
        if i < first_directory_index:
            continue

        try:
            data = pd.read_csv(directory / "certificates.csv", low_memory=False)
            # Rename the columns to the same format as the api returns
            data.columns = [c.replace("_", "-").lower() for c in data.columns]

            # Take just the certificates lodged on or after the date threshold
            data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

            data = data[~pd.isnull(data["uprn"])]
            # Take just the newest EPC per uprn, based on lodgement-date
            data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")

            # Sampling without replacement raises if more rows are requested than exist, so small
            # directories are sampled in full instead of being dropped
            data = data.sample(min(sample_size, len(data)), replace=False)

            # We use the address data to find the related information
            for _, property_data in data.iterrows():
                # Random pause between lookups
                time.sleep(np.random.uniform(0.2, 1.5))

                uprn = int(property_data["uprn"])
                address = property_data["address1"]
                postcode = property_data["postcode"]
                expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])

                response = retrieve_find_my_epc_data(
                    uprn=uprn,
                    postcode=postcode,
                    address=address,
                    expected_expiry_date=expected_expiry_date
                )
                if response is None:
                    continue
                # Stored straight away so an error later in this directory does not discard what was
                # already collected for it
                energy_consumption_data.append(
                    {
                        **response,
                        "epc": property_data.to_dict(),
                        "epc_directory": str(directory)
                    }
                )
        except Exception as e:
            print(f"Error for directory {directory}: {e}")
            # If we have an error, then we wait for a bit since it's likely due to timeout
            time.sleep(300)

    # Store the pickle in s3
    save_time = datetime.now()
    save_pickle_to_s3(
        energy_consumption_data, bucket_name="retrofit-datalake-dev",
        s3_file_name=f"energy_consumption_data/{save_time}.pkl"
    )
|