mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
created basic data collection process
This commit is contained in:
parent
eac2046765
commit
1db6dfebdf
1 changed files with 86 additions and 24 deletions
|
|
@ -1,13 +1,79 @@
|
|||
import time
|
||||
|
||||
import requests
|
||||
import inspect
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from etl.epc_clean.EpcClean import EpcClean
|
||||
from bs4 import BeautifulSoup
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
# Path of this source file: inspect.getfile reports the file in which the
# throwaway lambda was defined, which is this module.
src_file_path = inspect.getfile(lambda: None)
|
||||
|
||||
# Folder "local_data/all-domestic-certificates" located next to this file; app()
# iterates its sub-directories and reads a "certificates.csv" from each one.
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
|
||||
# URL template for the postcode search page; "{postcode_input}" is filled in
# by retrieve_find_my_epc_data with the postcode (spaces replaced by "+").
SEARCH_POSTCODE_URL = (
|
||||
"https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
|
||||
)
|
||||
# Site root; prepended to the href of each search-result link to build the
# certificate page URL.
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
|
||||
|
||||
|
||||
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
    """
    Scrape the find-energy-certificate website for one property's EPC summary.

    The postcode search page is fetched first. The search result whose link text
    starts with ``address`` (ignoring case, commas and spaces) is then followed and
    the certificate page behind it is parsed.

    :param uprn: Property reference; copied unchanged into the result.
    :param postcode: Postcode to search for; spaces are sent as ``+``.
    :param address: Leading part of the address as shown in the search results.
    :return: Dict with the keys ``uprn``, ``address``, ``epc_certificate``,
        ``current_epc_rating``, ``current_epc_efficiency``, ``potential_epc_rating``,
        ``potential_epc_efficiency``, ``heating_text`` and ``hot_water_text``, or
        ``None`` when the address does not match exactly one search result.
    :raises requests.HTTPError: If either page responds with an error status.
    :raises requests.Timeout: If either request exceeds the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    # requests never times out by default, so a stalled connection would block the whole run
    request_timeout = 30

    def _normalise(text):
        # Compare addresses without commas, spaces or case differences
        return text.replace(",", "").replace(" ", "").lower()

    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_response = requests.get(postcode_search, headers=headers, timeout=request_timeout)
    # Fail loudly on errors (e.g. rate limiting) instead of treating the error page as "no results"
    postcode_response.raise_for_status()

    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
    address_links = {
        element.text.strip(): BASE_ENERGY_URL + element['href'] for element in address_links_full
    }

    address_cleaned = _normalise(address)
    matching_urls = [
        url for link_text, url in address_links.items() if _normalise(link_text).startswith(address_cleaned)
    ]
    if len(matching_urls) != 1:
        # Zero matches: nothing to fetch. Two or more: we can't be sure which one is the
        # correct one, so we exit for simplicity.
        return None
    chosen_epc = matching_urls[0]

    # The certificate number is the last path segment of the certificate URL
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=request_timeout)
    address_response.raise_for_status()
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    # The rating chart description holds two sentences: current rating, then potential rating
    ratings = address_res.find('desc', {'id': 'svg-desc'}).text
    rating_sentences = ratings.split(".")
    current_rating = rating_sentences[0]
    potential_rating = rating_sentences[1]

    # Retrieve the energy consumption
    bills = address_res.find('div', {'id': 'bills-affected'})
    bill_items = bills.find_all('li')
    heating_text = bill_items[0].text
    hot_water_text = bill_items[1].text

    # In each sentence the rating band is the 6th word from the end and the score is the last word
    current_words = current_rating.split(' ')
    potential_words = potential_rating.split(' ')

    resulting_data = {
        'uprn': uprn,
        'address': address,
        'epc_certificate': epc_certificate,
        'current_epc_rating': current_words[-6],
        'current_epc_efficiency': int(current_words[-1]),
        'potential_epc_rating': potential_words[-6],
        "potential_epc_efficiency": int(potential_words[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
    }

    return resulting_data
|
||||
|
||||
|
||||
def app():
|
||||
|
|
@ -20,7 +86,9 @@ def app():
|
|||
cleaned_data = {}
|
||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
data = []
|
||||
sample_size = 100
|
||||
|
||||
energy_consumption_data = []
|
||||
for directory in tqdm(epc_directories):
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
# Rename the columns to the same format as the api returns
|
||||
|
|
@ -28,29 +96,23 @@ def app():
|
|||
# Keep only certificates lodged on or after the earliest allowed date
|
||||
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
data = data[~pd.isnull(data["uprn"])]
|
||||
data = data[data["mains-gas-flag"] == "N"]
|
||||
data = data[data["main-fuel"] == "electricity (not community)"]
|
||||
data[data["current-energy-efficiency"].astype(float) > 80]["uprn"].astype(int)
|
||||
data = data.sample(sample_size)
|
||||
# We use the address data to find the related information
|
||||
|
||||
# Convert to a list of dictionaries, as returned by the API
|
||||
data = data.to_dict("records")
|
||||
collected_data = []
|
||||
for _, property_data in data.iterrows():
|
||||
# Sleep for a random time between 0.1 and 1.5 seconds
|
||||
time.sleep(np.random.uniform(0.1, 1.5))
|
||||
|
||||
# Incorporate input data into cleaning
|
||||
cleaner = EpcClean(data)
|
||||
uprn = int(property_data["uprn"])
|
||||
address = property_data["address1"]
|
||||
postcode = property_data["postcode"]
|
||||
|
||||
cleaner.clean()
|
||||
# Extended cleaned_data
|
||||
for k, data in cleaner.cleaned.items():
|
||||
if k not in cleaned_data:
|
||||
cleaned_data[k] = data
|
||||
else:
|
||||
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
|
||||
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
||||
cleaned_data[k].extend(new_data)
|
||||
response = retrieve_find_my_epc_data(
|
||||
uprn=uprn,
|
||||
postcode=postcode,
|
||||
address=address
|
||||
)
|
||||
collected_data.append(response)
|
||||
|
||||
# Basic check to make sure all descriptions are unique
|
||||
for _, cleaned in cleaned_data.items():
|
||||
descriptions = [x["original_description"] for x in cleaned]
|
||||
if len(descriptions) != len(set(descriptions)):
|
||||
raise ValueError("Duplicated descriptions found, check me")
|
||||
energy_consumption_data.extend(energy_consumption_data)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue