add find my epc pipeline

This commit is contained in:
Michael Duong 2024-05-16 18:00:37 +01:00
parent 162c1200e2
commit 2c8f0c2251

View file

@ -0,0 +1,149 @@
# This script takes in a list of properties,
# each identified by its postcode and address.
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
# Root of the GOV.UK "find an energy certificate" service; relative links
# scraped from result pages are appended to this.
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
# Postcode search endpoint; fill in with ``.format(postcode_input=...)``.
SEARCH_POSTCODE_URL = (
    BASE_ENERGY_URL
    + "/find-a-certificate/search-by-postcode?postcode={postcode_input}"
)
def _normalise_address(text: str) -> str:
    """Collapse runs of whitespace and fold case so near-identical labels compare equal."""
    return " ".join(text.split()).casefold()


def _fetch_soup(url: str, headers: dict, timeout: float):
    """Download *url* and return it parsed, raising on HTTP error statuses."""
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here rather than parsing an error page and crashing later on a
    # missing element.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(response.text, "html.parser")


def _rating_and_score(sentence: str):
    """Return the (6th-from-last, last) space-separated words of *sentence*.

    NOTE(review): presumably the sentence ends with
    "<band> with a score of <number>" -- confirm against the live page.
    """
    words = sentence.split(' ')
    return words[-6], words[-1]


def retrieve_find_my_epc_data(postcode: str, address: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    For a post code and address, we pull out all the required data from the find my epc website

    The postcode search page is fetched, the link whose text equals
    ``"<address>, <postcode>"`` is followed, and the certificate page is
    scraped for the rating summary and the assessor contact details.

    Args:
        postcode: Postcode to search for; spaces are sent as ``+``.
        address: Address text as listed on the search page, without the
            trailing postcode.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A one-row DataFrame with columns ``address``, ``epc_certificate``,
        ``current_epc_rating``, ``current_epc_efficiency``,
        ``potential_epc_rating``, ``potential_epc_efficiency``,
        ``assessor_name``, ``assessor_number`` and ``assessor_email``.

    Raises:
        requests.HTTPError: If either page responds with an error status.
        KeyError: If no search result matches the address and postcode.
        ValueError: If the certificate page lacks the expected elements.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    postcode_input = postcode.replace(" ", "+")
    postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
    postcode_res = _fetch_soup(postcode_search, headers, timeout)

    # Map each listed address label to the absolute URL of its certificate.
    address_links_full = postcode_res.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
    address_links = {
        element.text.strip(): BASE_ENERGY_URL + element['href']
        for element in address_links_full
    }

    # Prefer an exact match; otherwise tolerate differences in case and
    # whitespace only.
    wanted = address + ', ' + postcode
    chosen_epc = address_links.get(wanted)
    if chosen_epc is None:
        wanted_key = _normalise_address(wanted)
        chosen_epc = next(
            (
                url
                for label, url in address_links.items()
                if _normalise_address(label) == wanted_key
            ),
            None,
        )
    if chosen_epc is None:
        raise KeyError(
            f"No EPC certificate listed for {wanted!r}; "
            f"the postcode search returned: {sorted(address_links)}"
        )

    # The certificate identifier is the final path segment of the link.
    epc_certificate = chosen_epc.split('/')[-1]
    address_res = _fetch_soup(chosen_epc, headers, timeout)

    print("## Energy rating - current and potential")
    ratings_tag = address_res.find('desc', {'id': 'svg-desc'})
    if ratings_tag is None:
        raise ValueError(f"No energy rating summary found on {chosen_epc}")
    # First sentence describes the current rating, the second the potential one.
    sentences = ratings_tag.text.split(".")
    if len(sentences) < 2:
        raise ValueError(f"Unexpected energy rating summary on {chosen_epc}: {ratings_tag.text!r}")
    current_rating = sentences[0]
    potential_rating = sentences[1]
    print('### Current EPC rating')
    print("##### " + current_rating)
    print('### Potential EPC rating')
    print("##### " + potential_rating)
    current_band, current_score = _rating_and_score(current_rating)
    potential_band, potential_score = _rating_and_score(potential_rating)

    print("Find assessor")
    assessor_block = address_res.find('div', {'class': 'epc-contact-assessor'})
    if assessor_block is None:
        raise ValueError(f"No assessor contact block found on {chosen_epc}")
    assessor_fields = assessor_block.find_all('dd', {"class": 'govuk-summary-list__value govuk-!-width-one-half'})
    if len(assessor_fields) < 3:
        raise ValueError(
            f"Expected at least 3 assessor fields on {chosen_epc}, found {len(assessor_fields)}"
        )
    # The first three values are read as name, number and email, in that order.
    assessor_name, assessor_number, assessor_email = (
        field.text.strip() for field in assessor_fields[:3]
    )

    return pd.DataFrame(
        {
            'address': [address],
            'epc_certificate': [epc_certificate],
            'current_epc_rating': [current_band],
            'current_epc_efficiency': [current_score],
            'potential_epc_rating': [potential_band],
            "potential_epc_efficiency": [potential_score],
            'assessor_name': [assessor_name],
            'assessor_number': [assessor_number],
            'assessor_email': [assessor_email],
        }
    )
# print('### Changes that can be made:')
# improvements = address_res.find('div', {"class": "govuk-body printable-area epb-recommended-improvements"})
# if improvements is None:
# print("No changes suggested")
# else:
# changes = improvements.find_all('h3')
# changes_impact = improvements.find_all('dl', {"class": 'govuk-summary-list'})
# for element in zip(changes, changes_impact):
# improvement_header = element[0].text
# print("#### " + improvement_header)
# improvement_text = element[1].text
# print(improvement_text)
# col_name = improvement_header.split(":")[1]
# cost = element[1].find('dd', {"class": "govuk-summary-list__value"}).text.lstrip().rstrip()
# impact = element[1].find('text', {"class": "govuk-!-font-weight-bold"}).text.split(" ")
# impact_num = impact[0]
# impact_cat = impact[1]
# print(cost)
# new_property_df[col_name] = True
# # cost_column = col_name + '-cost'
# # new_property_df.assign(cost_column=cost)
# new_property_df[col_name + '-cost'] = cost
# new_property_df[col_name + '-impact_num'] = impact_num
# new_property_df[col_name + '-impact_cat'] = impact_cat
# data = pd.concat([data, new_property_df])
# data.to_csv('./portfolio.csv')
def main(
    input_path: str = "places_for_people_EPC_data.xlsx",
    output_path: str = 'find_my_epc_data.parquet',
    limit=2,
):
    """
    Main pipeline function to take in a predefined list of properties and extract names of contractors

    Reads the property list from an Excel workbook, looks up every selected
    row with ``retrieve_find_my_epc_data`` and writes the combined result to
    a parquet file.

    Args:
        input_path: Excel workbook with ``POSTCODE`` and
            ``Matched EPC Address`` columns.
        output_path: Destination parquet file.
        limit: Number of leading rows to process, or ``None`` for all rows.
            The default of 2 keeps the original behaviour.

    Raises:
        ValueError: If there are no rows to process.
    """
    # Load in list of properties
    addresses_df = pd.read_excel(input_path)
    selected_rows = addresses_df if limit is None else addresses_df.head(limit)

    find_my_epc_data_list = [
        retrieve_find_my_epc_data(
            postcode=row['POSTCODE'],
            address=row['Matched EPC Address'],
        )
        for _, row in selected_rows.iterrows()
    ]
    if not find_my_epc_data_list:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No properties to process in {input_path!r}")

    # Each per-property frame has index 0; renumber so the saved table does
    # not carry a meaningless duplicated index.
    find_my_epc_data = pd.concat(find_my_epc_data_list, ignore_index=True)
    find_my_epc_data.to_parquet(output_path)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()