import inspect
import time
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

from etl.epc.settings import EARLIEST_EPC_DATE
from utils.s3 import save_pickle_to_s3

src_file_path = inspect.getfile(lambda: None)
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
SEARCH_POSTCODE_URL = (
    "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
)
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

# Upper bound (seconds) on every HTTP request, so a stalled connection cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30


def calculate_expiry_date(lodgement_date: str) -> str:
    """
    Return the expiry date of a certificate lodged on ``lodgement_date``.

    The expiry is computed as the lodgement date plus ten years minus one day.

    :param lodgement_date: lodgement date formatted as ``YYYY-MM-DD``
    :return: the expiry date formatted as day (no leading zero), full month name and year,
        e.g. ``"4 March 2031"``
    """
    lodgement_date_dt = datetime.strptime(lodgement_date, "%Y-%m-%d")
    expiry_date_dt = lodgement_date_dt + relativedelta(years=10) - timedelta(days=1)
    # "%-d" is a platform-specific strftime flag (it fails on Windows), so the
    # un-padded day is formatted explicitly instead.
    return f"{expiry_date_dt.day} {expiry_date_dt.strftime('%B %Y')}"


def _normalise_address(text: str) -> str:
    """Strip commas and spaces and lowercase, so addresses can be compared loosely."""
    return text.replace(",", "").replace(" ", "").lower()


def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str, expected_expiry_date: str):
    """
    For a post code and address, we pull out all the required data from the find my epc website

    The postcode search results are filtered to rows whose address starts with ``address``
    (ignoring commas, spaces and case) and whose expiry date equals ``expected_expiry_date``.
    The certificate page of the single remaining row is then parsed.

    :param uprn: property reference, copied into the result as ``extracted_uprn``
    :param postcode: postcode to search for
    :param address: address prefix used to pick the row from the search results
    :param expected_expiry_date: expiry date text the matching row must show
    :return: a dict with the ratings, bill texts and assessment details, or ``None`` when zero
        or more than one search result matches
    :raises ValueError: if the certificate page lacks the rating description or one of the
        expected assessment fields
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }

    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_response = requests.get(postcode_search, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

    address_cleaned = _normalise_address(address)

    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
    rows = postcode_res.find_all('tr', class_='govuk-table__row')

    candidates = []
    for row in rows:
        # Rows without a certificate link (e.g. the table header) carry no address
        address_tag = row.find('a', class_='govuk-link')
        if address_tag is None:
            continue

        extracted_address = address_tag.text.strip()
        extracted_address_url = address_tag['href']
        if not _normalise_address(extracted_address).startswith(address_cleaned):
            continue

        # The address matches, so read the expiry date shown on the same row
        expiry_date = None
        expiry_date_tag = row.find('td', class_='govuk-table__cell date')
        if expiry_date_tag is not None:
            expiry_span = expiry_date_tag.parent.find('span')
            if expiry_span is not None:
                expiry_date = expiry_span.text.strip()

        # Only the certificate with the expected expiry date is of interest
        if expiry_date != expected_expiry_date:
            continue

        candidates.append(
            {
                "extracted_address": extracted_address,
                "extracted_address_url": extracted_address_url,
                "expiry_date": expiry_date
            }
        )

    if len(candidates) > 1:
        print("Multiple candidates found, skipping for now")
        return None

    if not candidates:
        print("No candidates found, skipping for now")
        return None

    chosen_epc = BASE_ENERGY_URL + candidates[0]['extracted_address_url']
    # The certificate identifier is the last path segment of the certificate URL
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    ratings_tag = address_res.find('desc', {'id': 'svg-desc'})
    if ratings_tag is None:
        raise ValueError("Missing energy rating description (svg-desc) on the certificate page")
    # The description holds two sentences: the current rating, then the potential rating
    rating_sentences = ratings_tag.text.split(".")
    current_rating = rating_sentences[0]
    potential_rating = rating_sentences[1]

    # Retrieve the energy consumption
    bills = address_res.find('div', {'id': 'bills-affected'})
    bills_list = bills.find_all('li') if bills is not None else []
    if not bills_list:
        # If this is the case, it's usually because the EPC was very old.
        # Early EPCs did not have this information
        heating_text = None
        hot_water_text = None
    else:
        heating_text = bills_list[0].text
        hot_water_text = bills_list[1].text

    # Search for the assessment information. When the section is absent there are no rows,
    # and the expected-key check below reports the first missing field.
    assessment_information = address_res.find('div', {'id': 'information'})
    if assessment_information is None:
        summary_rows = []
    else:
        summary_rows = assessment_information.find_all('div', class_='govuk-summary-list__row')

    # Collect the summary list into a {label: value} dictionary
    assessment_data = {}
    for summary_row in summary_rows:
        key = summary_row.find('dt').text.strip()
        if key == "Type of assessment":
            # We dont reliably extract this
            continue

        value_tag = summary_row.find('dd')
        link_tag = value_tag.find('a')
        if link_tag is not None:
            # The value is wrapped in a link (email)
            value = link_tag.text.strip()
        elif value_tag.find('summary') is not None:
            value = value_tag.find('span').text.strip()
        else:
            value = value_tag.text.strip()

        assessment_data[key] = value

    expected_keys = [
        'Assessor’s name',
        'Telephone',
        'Email',
        'Accreditation scheme',
        'Assessor’s ID',
        'Assessor’s declaration',
        'Date of assessment',
        'Date of certificate'
    ]
    # Check we have all the expected keys
    for key in expected_keys:
        if key not in assessment_data:
            raise ValueError(f"Missing key: {key}")

    # Within each rating sentence the last word is the numeric score and the
    # sixth word from the end is the rating letter.
    current_words = current_rating.split(' ')
    potential_words = potential_rating.split(' ')

    resulting_data = {
        'extracted_uprn': uprn,
        'extracted_address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_words[-6],
        'current_epc_efficiency': int(current_words[-1]),
        'potential_epc_rating': potential_words[-6],
        "potential_epc_efficiency": int(potential_words[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
        **assessment_data
    }

    return resulting_data


def app():
    """
    This application is tasked with pulling a large quantity of data from the find my epc website,
    containing the estimated energy consumption for properties

    For every directory under ``EPC_DIRECTORY`` a random sample of the newest certificate per
    UPRN is looked up on the website; the accumulated results are pickled to S3.

    :return: None
    """
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

    sample_size = 500
    # NOTE(review): hard-coded resume point - presumably the directories before this index
    # were handled by an earlier run; confirm before running from scratch.
    directories_to_skip = 256

    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        if i < directories_to_skip:
            continue

        try:
            data = pd.read_csv(directory / "certificates.csv", low_memory=False)
            # Rename the columns to the same format as the api returns
            data.columns = [c.replace("_", "-").lower() for c in data.columns]

            # Keep only certificates lodged on or after the earliest allowed date
            data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
            data = data[~pd.isnull(data["uprn"])]
            # Take just the newest EPC per uprn, based on lodgement-date
            data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
            # Cap the sample at the rows available: sampling without replacement raises
            # when asked for more rows than exist.
            data = data.sample(min(sample_size, len(data)), replace=False)

            # We use the address data to find the related information
            for _, property_data in data.iterrows():
                # Random pause between lookups to avoid hammering the website
                time.sleep(np.random.uniform(0.2, 1.5))

                response = retrieve_find_my_epc_data(
                    uprn=int(property_data["uprn"]),
                    postcode=property_data["postcode"],
                    address=property_data["address1"],
                    expected_expiry_date=calculate_expiry_date(property_data["lodgement-date"])
                )
                if response is None:
                    continue

                # Appended straight to the overall result so that rows already scraped
                # survive an error on a later property of the same directory.
                energy_consumption_data.append(
                    {
                        **response,
                        "epc": property_data.to_dict(),
                        "epc_directory": str(directory)
                    }
                )
        except Exception as e:
            print(f"Error for directory {directory}: {e}")
            # If we have an error, then we wait for a bit since it's likely due to timeout
            time.sleep(300)
            continue

        # Store the pickle in s3
        # NOTE(review): kept inside the directory loop so the `continue` above skips it after
        # an error and progress is checkpointed per directory - confirm this placement.
        save_time = datetime.now()
        save_pickle_to_s3(
            energy_consumption_data,
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"energy_consumption_data/{save_time}.pkl"
        )