Model/etl/bill_savings/data_collection.py
2024-09-30 11:53:04 +01:00

227 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import requests
import inspect
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
from utils.s3 import save_pickle_to_s3
# Path of this module on disk, taken from the code object of a throwaway lambda.
src_file_path = inspect.getfile(lambda: None)

# Directory of EPC certificate data, kept under "local_data" next to this module.
EPC_DIRECTORY = Path(src_file_path).parent.joinpath("local_data", "all-domestic-certificates")

# Root of the energy certificate service; relative certificate links are appended to it.
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

# Postcode search URL template; `postcode_input` is substituted with str.format.
SEARCH_POSTCODE_URL = (
    "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
)
def calculate_expiry_date(lodgement_date):
    """
    Work out an EPC's expiry date from its lodgement date, formatted like "29 September 2024".

    The expiry date is the day before the tenth anniversary of the lodgement date.

    :param lodgement_date: lodgement date as a "YYYY-MM-DD" string
    :return: the expiry date as "<day> <month name> <year>", with no zero padding on the day
    :raises ValueError: if lodgement_date does not match "%Y-%m-%d"
    """
    lodgement_date_dt = datetime.strptime(lodgement_date, '%Y-%m-%d')
    anniversary_year = lodgement_date_dt.year + 10
    try:
        tenth_anniversary = lodgement_date_dt.replace(year=anniversary_year)
    except ValueError:
        # Only reachable for 29 February: a leap year plus ten is never a leap year, so the
        # anniversary is clamped to 28 February (the same result relativedelta(years=10) gives).
        tenth_anniversary = lodgement_date_dt.replace(year=anniversary_year, day=28)
    expiry_date_dt = tenth_anniversary - timedelta(days=1)
    # The day is taken from the attribute rather than strftime('%-d'): the '-' flag is a
    # glibc extension and raises ValueError on platforms such as Windows.
    # NOTE: '%B' follows the process locale, so the month is English only under a C/English locale.
    return f"{expiry_date_dt.day} {expiry_date_dt.strftime('%B')} {expiry_date_dt.year}"
def _normalise_address(text):
    """Strip commas and spaces and lower-case an address so two spellings can be compared."""
    return text.replace(",", "").replace(" ", "").lower()


def _find_candidate_certificates(search_page, address_cleaned):
    """
    Collect the rows of a postcode search result whose address starts with `address_cleaned`.

    :param search_page: parsed BeautifulSoup document of the postcode search page
    :param address_cleaned: target address, already passed through _normalise_address
    :return: list of dicts with keys "extracted_address", "extracted_address_url", "expiry_date"
    """
    candidates = []
    for row in search_page.find_all('tr', class_='govuk-table__row'):
        # Rows without a certificate link carry no address to match against
        address_tag = row.find('a', class_='govuk-link')
        if address_tag is None:
            continue
        extracted_address = address_tag.text.strip()
        if not _normalise_address(extracted_address).startswith(address_cleaned):
            continue
        # The address matches, so pull out the expiry date shown on the same row
        expiry_date = None
        if row.find('td', class_='govuk-table__cell date') is not None:
            expiry_span = row.find('span')
            if expiry_span is not None:
                expiry_date = expiry_span.text.strip()
        candidates.append(
            {
                "extracted_address": extracted_address,
                "extracted_address_url": address_tag['href'],
                "expiry_date": expiry_date
            }
        )
    return candidates


def _parse_assessment_information(section):
    """
    Turn the summary-list rows of the assessment information section into a {label: value} dict.

    :param section: BeautifulSoup tag containing 'govuk-summary-list__row' divs
    :return: dict keyed by each row's <dt> text, excluding "Type of assessment"
    """
    parsed = {}
    for row in section.find_all('div', class_='govuk-summary-list__row'):
        key = row.find('dt').text.strip()
        if key == "Type of assessment":
            # We dont reliably extract this
            continue
        value_tag = row.find('dd')
        link_tag = value_tag.find('a')
        if link_tag is not None:
            # The value is wrapped in a link (email)
            value = link_tag.text.strip()
        elif value_tag.find('summary') is not None:
            # The value sits inside an expandable block; take its first span
            value = value_tag.find('span').text.strip()
        else:
            value = value_tag.text.strip()
        parsed[key] = value
    return parsed


def retrieve_find_my_epc_data(
    uprn: int, postcode: str, address: str, expected_expiry_date: str, timeout: float = 30.0
):
    """
    For a post code and address, we pull out all the required data from the find my epc website

    The postcode search page is fetched and narrowed down to the single row whose address starts
    with `address` and whose expiry date equals `expected_expiry_date`. That certificate's page is
    then fetched and parsed.

    :param uprn: property reference, copied into the result as 'extracted_uprn'
    :param postcode: postcode to search for; spaces are replaced with '+' in the query string
    :param address: address to match (commas, spaces and case are ignored)
    :param expected_expiry_date: expiry date text exactly as displayed in the search results
    :param timeout: seconds to wait on each HTTP request before requests raises a timeout error
    :return: dict of extracted fields, or None when zero or several search rows match
    :raises ValueError: if the certificate page lacks the rating description, the assessment
        information section, or any of the expected assessment fields
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    # Without a timeout a stalled connection would block the whole scrape indefinitely
    postcode_response = requests.get(postcode_search, headers=headers, timeout=timeout)
    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")

    candidates = _find_candidate_certificates(postcode_res, _normalise_address(address))
    candidates = [entry for entry in candidates if entry['expiry_date'] == expected_expiry_date]
    if len(candidates) > 1:
        print("Multiple candidates found, skipping for now")
        return None
    if not candidates:
        print("No candidates found, skipping for now")
        return None

    chosen_epc = BASE_ENERGY_URL + candidates[0]['extracted_address_url']
    epc_certificate = chosen_epc.split('/')[-1]
    address_response = requests.get(chosen_epc, headers=headers, timeout=timeout)
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    ratings_tag = address_res.find('desc', {'id': 'svg-desc'})
    if ratings_tag is None:
        raise ValueError(f"No energy rating description found for certificate {epc_certificate}")
    # First sentence describes the current rating, second the potential rating
    rating_sentences = ratings_tag.text.split(".")
    current_rating = rating_sentences[0]
    potential_rating = rating_sentences[1]

    # Retrieve the energy consumption
    # Early EPCs did not have this information, so a missing section or entry yields None
    bills = address_res.find('div', {'id': 'bills-affected'})
    bills_list = bills.find_all('li') if bills is not None else []
    heating_text = bills_list[0].text if len(bills_list) > 0 else None
    hot_water_text = bills_list[1].text if len(bills_list) > 1 else None

    # Search for the assessment information
    assessment_information = address_res.find('div', {'id': 'information'})
    if assessment_information is None:
        raise ValueError(f"No assessment information found for certificate {epc_certificate}")
    assessment_data = _parse_assessment_information(assessment_information)

    expected_keys = [
        'Assessors name', 'Telephone', 'Email', 'Accreditation scheme', 'Assessors ID', 'Assessors declaration',
        'Date of assessment', 'Date of certificate'
    ]
    # Check we have all the expected keys
    for key in expected_keys:
        if key not in assessment_data:
            raise ValueError(f"Missing key: {key}")

    # The rating letter is read as the sixth word from the end of each sentence and the score as
    # the last word (presumably "... <letter> with a score of <n>" - verify against the live page)
    current_words = current_rating.split(' ')
    potential_words = potential_rating.split(' ')
    resulting_data = {
        'extracted_uprn': uprn,
        'extracted_address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_words[-6],
        'current_epc_efficiency': int(current_words[-1]),
        'potential_epc_rating': potential_words[-6],
        "potential_epc_efficiency": int(potential_words[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
        **assessment_data
    }
    return resulting_data
def app(start_index: int = 256, sample_size: int = 500):
    """
    This application is tasked with pulling a large quantity of data from the find my epc website, containing the
    estimated energy consumption for properties

    For every sub-directory of EPC_DIRECTORY, the newest certificate per UPRN is taken from its
    certificates.csv, a random sample of properties is looked up on the website, and everything
    collected is uploaded to S3 as a single pickle once all directories have been visited.

    :param start_index: number of leading directories (in sorted order) to skip, e.g. to resume a run
    :param sample_size: maximum number of properties to look up per directory
    :return:
    """
    # iterdir() yields entries in arbitrary order; sorting makes start_index refer to the
    # same directories on every run
    epc_directories = sorted(entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir())
    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        # Skip the directories before the requested starting point
        if i < start_index:
            continue
        try:
            data = pd.read_csv(directory / "certificates.csv", low_memory=False)
            # Rename the columns to the same format as the api returns
            data.columns = [c.replace("_", "-").lower() for c in data.columns]
            # Keep only certificates lodged on or after the date threshold
            data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
            data = data[~pd.isnull(data["uprn"])]
            # Take just the newest EPC per uprn, based on lodgement-date
            data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
            # Sampling without replacement raises if asked for more rows than exist, so cap it
            data = data.sample(min(sample_size, len(data)), replace=False)
            # We use the address data to find the related information
            for _, property_data in data.iterrows():
                # Random pause between lookups so requests are not sent back to back
                time.sleep(np.random.uniform(0.2, 1.5))
                response = retrieve_find_my_epc_data(
                    uprn=int(property_data["uprn"]),
                    postcode=property_data["postcode"],
                    address=property_data["address1"],
                    expected_expiry_date=calculate_expiry_date(property_data["lodgement-date"])
                )
                if response is None:
                    continue
                # Appended straight to the overall list so that a failure part-way through a
                # directory does not throw away the properties already collected for it
                energy_consumption_data.append(
                    {
                        **response,
                        "epc": property_data.to_dict(),
                        "epc_directory": str(directory)
                    }
                )
        except Exception as e:
            print(f"Error for directory {directory}: {e}")
            # If we have an error, then we wait for a bit since it's likely due to timeout
            time.sleep(300)
    # Store the pickle in s3
    save_time = datetime.now()
    save_pickle_to_s3(
        energy_consumption_data, bucket_name="retrofit-datalake-dev",
        s3_file_name=f"energy_consumption_data/{save_time}.pkl"
    )