mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
created basic data collection process
This commit is contained in:
parent
eac2046765
commit
1db6dfebdf
1 changed file with 86 additions and 24 deletions
|
|
@ -1,13 +1,79 @@
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
import inspect
|
import inspect
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from etl.epc_clean.EpcClean import EpcClean
|
from bs4 import BeautifulSoup
|
||||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
src_file_path = inspect.getfile(lambda: None)
|
src_file_path = inspect.getfile(lambda: None)
|
||||||
|
|
||||||
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
|
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
|
||||||
|
SEARCH_POSTCODE_URL = (
|
||||||
|
"https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
|
||||||
|
)
|
||||||
|
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
|
||||||
|
|
||||||
|
|
||||||
|
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
    """
    For a post code and address, we pull out all the required data from the find my epc website.

    The postcode search page is fetched first and every certificate link on it is
    collected. The link whose text (ignoring commas, spaces and case) starts with the
    given address is then followed, and the ratings and the first two "bills affected"
    entries are read from the certificate page.

    :param uprn: Property reference; it is not used for the lookup, only copied into the result.
    :param postcode: Postcode to search for; spaces are replaced by "+" in the query string.
    :param address: First address line used to pick the certificate from the search results.
    :return: A dict with the keys ``uprn``, ``address``, ``epc_certificate``,
        ``current_epc_rating``, ``current_epc_efficiency``, ``potential_epc_rating``,
        ``potential_epc_efficiency``, ``heating_text`` and ``hot_water_text``, or ``None``
        when the address does not match exactly one search result.
    :raises requests.RequestException: If a request times out or returns an HTTP error status.
    :raises AttributeError, IndexError, ValueError: If the certificate page does not contain
        the elements or wording this parser expects.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/111.0.0.0 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the whole collection run.
    request_timeout = 30

    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_response = requests.get(postcode_search, headers=headers, timeout=request_timeout)
    # Fail loudly on HTTP errors so an error page is never mistaken for "no matching address".
    postcode_response.raise_for_status()

    postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
    # Keyed by link text, so identical link texts collapse to the last link seen.
    address_links = {
        element.text.strip(): BASE_ENERGY_URL + element['href'] for element in address_links_full
    }

    address_cleaned = _normalise_address(address)
    matching_links = [
        link for text, link in address_links.items()
        if _normalise_address(text).startswith(address_cleaned)
    ]
    if len(matching_links) != 1:
        # No match means there is nothing to fetch; two or more matches mean we can't be
        # sure which one is the correct one, so we exit for simplicity.
        return None
    chosen_epc = matching_links[0]

    # The certificate identifier is the last path segment of the certificate link.
    epc_certificate = chosen_epc.split('/')[-1]

    address_response = requests.get(chosen_epc, headers=headers, timeout=request_timeout)
    address_response.raise_for_status()
    address_res = BeautifulSoup(address_response.text, features="html.parser")

    # NOTE(review): assumes the description holds two sentences, current rating first and
    # potential rating second, each ending "<band> with a score of <number>" - confirm
    # against the live page.
    ratings = address_res.find('desc', {'id': 'svg-desc'}).text
    rating_sentences = ratings.split(".")
    current_words = rating_sentences[0].split(' ')
    potential_words = rating_sentences[1].split(' ')

    # Retrieve the energy consumption
    bills = address_res.find('div', {'id': 'bills-affected'})
    bill_items = bills.find_all('li')
    heating_text = bill_items[0].text
    hot_water_text = bill_items[1].text

    resulting_data = {
        'uprn': uprn,
        'address': address,
        'epc_certificate': epc_certificate,
        # Sixth word from the end is taken as the band, the last word as the score.
        'current_epc_rating': current_words[-6],
        'current_epc_efficiency': int(current_words[-1]),
        'potential_epc_rating': potential_words[-6],
        "potential_epc_efficiency": int(potential_words[-1]),
        "heating_text": heating_text,
        "hot_water_text": hot_water_text,
    }

    return resulting_data


def _normalise_address(text: str) -> str:
    """
    Remove commas and spaces and lowercase the text so addresses can be compared loosely.
    """
    return text.replace(",", "").replace(" ", "").lower()
|
||||||
|
|
||||||
|
|
||||||
def app():
|
def app():
|
||||||
|
|
@ -20,7 +86,9 @@ def app():
|
||||||
cleaned_data = {}
|
cleaned_data = {}
|
||||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||||
|
|
||||||
data = []
|
sample_size = 100
|
||||||
|
|
||||||
|
energy_consumption_data = []
|
||||||
for directory in tqdm(epc_directories):
|
for directory in tqdm(epc_directories):
|
||||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||||
# Rename the columns to the same format as the api returns
|
# Rename the columns to the same format as the api returns
|
||||||
|
|
@ -28,29 +96,23 @@ def app():
|
||||||
# Take just date before the date threshold
|
# Take just date before the date threshold
|
||||||
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
|
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
|
||||||
|
|
||||||
data = data[~pd.isnull(data["uprn"])]
|
data = data.sample(sample_size)
|
||||||
data = data[data["mains-gas-flag"] == "N"]
|
# We use the address data to find the related information
|
||||||
data = data[data["main-fuel"] == "electricity (not community)"]
|
|
||||||
data[data["current-energy-efficiency"].astype(float) > 80]["uprn"].astype(int)
|
|
||||||
|
|
||||||
# Convert to list of dictionaries as returned by the api
|
collected_data = []
|
||||||
data = data.to_dict("records")
|
for _, property_data in data.iterrows():
|
||||||
|
# Sleep for a random time between 0.1 and 1.5 seconds
|
||||||
|
time.sleep(np.random.uniform(0.1, 1.5))
|
||||||
|
|
||||||
# Incorporate input data into cleaning
|
uprn = int(property_data["uprn"])
|
||||||
cleaner = EpcClean(data)
|
address = property_data["address1"]
|
||||||
|
postcode = property_data["postcode"]
|
||||||
|
|
||||||
cleaner.clean()
|
response = retrieve_find_my_epc_data(
|
||||||
# Extended cleaned_data
|
uprn=uprn,
|
||||||
for k, data in cleaner.cleaned.items():
|
postcode=postcode,
|
||||||
if k not in cleaned_data:
|
address=address
|
||||||
cleaned_data[k] = data
|
)
|
||||||
else:
|
collected_data.append(response)
|
||||||
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
|
|
||||||
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
|
||||||
cleaned_data[k].extend(new_data)
|
|
||||||
|
|
||||||
# Basic check to make sure all descriptions are unique
|
energy_consumption_data.extend(energy_consumption_data)
|
||||||
for _, cleaned in cleaned_data.items():
|
|
||||||
descriptions = [x["original_description"] for x in cleaned]
|
|
||||||
if len(descriptions) != len(set(descriptions)):
|
|
||||||
raise ValueError("Duplicated descriptions found, check me")
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue