created basic data collection process

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-02 11:06:11 +01:00
parent eac2046765
commit 1db6dfebdf

View file

@ -1,13 +1,79 @@
import time
import requests
import inspect
import pandas as pd
from tqdm import tqdm
from etl.epc_clean.EpcClean import EpcClean
from bs4 import BeautifulSoup
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
import numpy as np
# Path of this source file: inspect.getfile on a lambda defined here reports the
# file that contains the lambda.
src_file_path = inspect.getfile(lambda: None)
# Folder holding one sub-directory per EPC extract, located in "local_data"
# next to this source file.
EPC_DIRECTORY = Path(src_file_path).parent.joinpath("local_data", "all-domestic-certificates")
# Root URL that relative certificate links are appended to.
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
# Postcode search page; fill `postcode_input` in with str.format.
SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
def _normalise_address(text):
    """Return *text* lower-cased with commas and spaces removed, for loose address matching."""
    return text.replace(",", "").replace(" ", "").lower()


def _parse_rating_sentence(sentence):
    """Return the ``(band, score)`` pair taken from one sentence of the rating description.

    The band is the sixth space-separated token from the end and the score is the last
    token, converted to ``int``. Presumably the sentence ends in
    "<band> with a score of <score>" - confirm against the live page if parsing fails.
    """
    tokens = sentence.split(' ')
    return tokens[-6], int(tokens[-1])


def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
    """
    For a post code and address, we pull out all the required data from the find my epc website

    The postcode search page is fetched and its certificate links are collected. The link whose
    text (ignoring case, commas and spaces) starts with ``address`` is followed, and the ratings
    and the first two "bills-affected" list items are read from the certificate page.

    :param uprn: Property reference; it is only copied into the returned record.
    :param postcode: Postcode to search for; spaces are sent as ``+``.
    :param address: Start of the address used to pick the certificate from the search results.
    :return: A dict with the keys ``uprn``, ``address``, ``epc_certificate``,
        ``current_epc_rating``, ``current_epc_efficiency``, ``potential_epc_rating``,
        ``potential_epc_efficiency``, ``heating_text`` and ``hot_water_text``; or ``None``
        when the search results contain no matching address or more than one.
    :raises requests.RequestException: If a request fails or times out.
    :raises AttributeError, IndexError, ValueError: If the certificate page does not contain
        the elements or wording this parser relies on.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the calling loop forever
    request_timeout = 30

    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_response = requests.get(postcode_search, headers=headers, timeout=request_timeout)
    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")

    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
    # Keyed by the visible link text, so links with identical text collapse into one entry
    address_links = {
        element.text.strip(): BASE_ENERGY_URL + element['href'] for element in address_links_full
    }

    address_cleaned = _normalise_address(address)
    matching_urls = [
        url for link_text, url in address_links.items()
        if _normalise_address(link_text).startswith(address_cleaned)
    ]
    if len(matching_urls) != 1:
        # With no match there is nothing to fetch, and with two or more we can't be sure
        # which one is the correct one, so we exit for simplicity
        return None

    chosen_epc = matching_urls[0]
    # The certificate identifier is the final path segment of the certificate URL
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=request_timeout)
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    # First sentence of the description is read as the current rating, second as the potential
    rating_sentences = address_res.find('desc', {'id': 'svg-desc'}).text.split(".")
    current_band, current_score = _parse_rating_sentence(rating_sentences[0])
    potential_band, potential_score = _parse_rating_sentence(rating_sentences[1])

    # Retrieve the energy consumption
    bill_items = address_res.find('div', {'id': 'bills-affected'}).find_all('li')
    heating_text = bill_items[0].text
    hot_water_text = bill_items[1].text

    return {
        'uprn': uprn,
        'address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_band,
        'current_epc_efficiency': current_score,
        'potential_epc_rating': potential_band,
        "potential_epc_efficiency": potential_score,
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
    }
def app():
@ -20,7 +86,9 @@ def app():
cleaned_data = {}
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
data = []
sample_size = 100
energy_consumption_data = []
for directory in tqdm(epc_directories):
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
@ -28,29 +96,23 @@ def app():
# Take just date before the date threshold
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
data = data[~pd.isnull(data["uprn"])]
data = data[data["mains-gas-flag"] == "N"]
data = data[data["main-fuel"] == "electricity (not community)"]
data[data["current-energy-efficiency"].astype(float) > 80]["uprn"].astype(int)
data = data.sample(sample_size)
# We use the address data to find the related information
# Convert to a list of dictionaries as returned by the api
data = data.to_dict("records")
collected_data = []
for _, property_data in data.iterrows():
# Sleep for a random time between 0.1 and 1.5 seconds
time.sleep(np.random.uniform(0.1, 1.5))
# Incorporate input data into cleaning
cleaner = EpcClean(data)
uprn = int(property_data["uprn"])
address = property_data["address1"]
postcode = property_data["postcode"]
cleaner.clean()
# Extended cleaned_data
for k, data in cleaner.cleaned.items():
if k not in cleaned_data:
cleaned_data[k] = data
else:
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
cleaned_data[k].extend(new_data)
response = retrieve_find_my_epc_data(
uprn=uprn,
postcode=postcode,
address=address
)
collected_data.append(response)
# Basic check to make sure all descriptions are unique
for _, cleaned in cleaned_data.items():
descriptions = [x["original_description"] for x in cleaned]
if len(descriptions) != len(set(descriptions)):
raise ValueError("Duplicated descriptions found, check me")
energy_consumption_data.extend(energy_consumption_data)