From 1db6dfebdfa29854315ea2896e33bf779a0a9ddb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Tue, 2 Jul 2024 11:06:11 +0100
Subject: [PATCH] created basic data collection process

---
 etl/bill_savings/data_collection.py | 129 +++++++++++++++++++++++-----
 1 file changed, 106 insertions(+), 23 deletions(-)

diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index 25023894..22b12c6e 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -1,13 +1,96 @@
+import time
+
+import requests
 import inspect
 import pandas as pd
 from tqdm import tqdm
-from etl.epc_clean.EpcClean import EpcClean
+from bs4 import BeautifulSoup
 from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
+import numpy as np
 
 src_file_path = inspect.getfile(lambda: None)
 EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
 
+SEARCH_POSTCODE_URL = (
+    "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
+)
+BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
+# Seconds to wait for the certificate service before giving up on a request
+REQUEST_TIMEOUT = 30
+
+
+def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
+    """
+    Scrape the find-energy-certificate service for one property.
+
+    The postcode search page is fetched, the result whose link text starts with ``address`` (ignoring commas,
+    spaces and case) is followed, and the ratings and bill text are read from the certificate page.
+
+    :param uprn: Property reference, copied into the result unchanged
+    :param postcode: Postcode to search for; spaces are sent as "+"
+    :param address: First line of the address, used to pick the certificate out of the search results
+    :return: Dict of the scraped certificate data, or None when the address does not match exactly one search
+        result or the certificate page is missing the elements that are parsed
+    """
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/111.0.0.0 Safari/537.36'
+    }
+    postcode_input = postcode.replace(" ", "+")
+    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
+    postcode_response = requests.get(postcode_search, headers=headers, timeout=REQUEST_TIMEOUT)
+
+    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
+    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
+    address_links = {element.text.strip(): BASE_ENERGY_URL + element['href'] for element in address_links_full}
+
+    address_cleaned = address.replace(",", "").replace(" ", "").lower()
+    matching_links = [
+        link for link_text, link in address_links.items()
+        if link_text.replace(",", "").replace(" ", "").lower().startswith(address_cleaned)
+    ]
+    if len(matching_links) != 1:
+        # With no match there is nothing to follow; with two or more we can't be sure which one is the correct one,
+        # so we exit for simplicity
+        return None
+    chosen_epc = matching_links[0]
+
+    epc_certificate = chosen_epc.split('/')[-1]
+
+    address_response = requests.get(chosen_epc, headers=headers, timeout=REQUEST_TIMEOUT)
+    address_res = BeautifulSoup(address_response.text, features="html.parser")
+
+    ratings_tag = address_res.find('desc', {'id': 'svg-desc'})
+    bills = address_res.find('div', {'id': 'bills-affected'})
+    if ratings_tag is None or bills is None:
+        # The page does not have the elements the parsing below relies on
+        return None
+
+    # NOTE(review): assumes the current rating sentence comes first, then the potential one - confirm on a live page
+    rating_sentences = ratings_tag.text.split(".")
+    current_rating = rating_sentences[0]
+    potential_rating = rating_sentences[1]
+
+    # Retrieve the energy consumption
+    bill_items = bills.find_all('li')
+    heating_text = bill_items[0].text
+    hot_water_text = bill_items[1].text
+
+    resulting_data = {
+        'uprn': uprn,
+        'address': address,
+        'epc_certificate': epc_certificate,
+        'current_epc_rating': current_rating.split(' ')[-6],
+        'current_epc_efficiency': int(current_rating.split(' ')[-1]),
+        'potential_epc_rating': potential_rating.split(' ')[-6],
+        "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
+        "heating_text": heating_text,
+        "hot_water_text": hot_water_text,
+    }
+
+    return resulting_data
 
 
 def app():
@@ -20,7 +103,9 @@ def app():
     cleaned_data = {}
     epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
 
-    data = []
+    sample_size = 100
+
+    energy_consumption_data = []
     for directory in tqdm(epc_directories):
         data = pd.read_csv(directory / "certificates.csv", low_memory=False)
         # Rename the columns to the same format as the api returns
@@ -28,29 +113,27 @@ def app():
 
         # Take just date before the date threshold
         data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
         data = data[~pd.isnull(data["uprn"])]
-        data = data[data["mains-gas-flag"] == "N"]
-        data = data[data["main-fuel"] == "electricity (not community)"]
-        data[data["current-energy-efficiency"].astype(float) > 80]["uprn"].astype(int)
+        # Cap the sample at the rows available; DataFrame.sample raises when asked for more rows than exist
+        data = data.sample(min(sample_size, len(data)))
+        # We use the address data to find the related information
 
-        # Convert to list of dictioaries as returned by the api
-        data = data.to_dict("records")
+        collected_data = []
+        for _, property_data in data.iterrows():
+            # Sleep for a random time between 0.1 and 1.5 seconds
+            time.sleep(np.random.uniform(0.1, 1.5))
 
-        # Incorporate input data into cleaning
-        cleaner = EpcClean(data)
+            uprn = int(property_data["uprn"])
+            address = property_data["address1"]
+            postcode = property_data["postcode"]
 
-        cleaner.clean()
-        # Extended cleaned_data
-        for k, data in cleaner.cleaned.items():
-            if k not in cleaned_data:
-                cleaned_data[k] = data
-            else:
-                existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
-                new_data = [x for x in data if x["original_description"] not in existing_descriptions]
-                cleaned_data[k].extend(new_data)
+            response = retrieve_find_my_epc_data(
+                uprn=uprn,
+                postcode=postcode,
+                address=address
+            )
+            if response is not None:
+                # None means no single usable certificate was found, so there is nothing to keep
+                collected_data.append(response)
 
-    # Basic check to make sure all descriptions are unique
-    for _, cleaned in cleaned_data.items():
-        descriptions = [x["original_description"] for x in cleaned]
-        if len(descriptions) != len(set(descriptions)):
-            raise ValueError("Duplicated descriptions found, check me")
+        energy_consumption_data.extend(collected_data)