# Find-my-EPC pipeline (etl/epc_recommendations/find_my_epc_pipeline.py).
#
# This script takes in a list of properties (postcode and address), looks each
# one up on find-energy-certificate.service.gov.uk and collects the EPC ratings
# and assessor contact details shown on the certificate page.

from typing import Optional
from urllib.parse import quote_plus

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"

# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30
# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed (and to avoid BeautifulSoup's warning).
HTML_PARSER = "html.parser"


def _fetch_page(url: str) -> BeautifulSoup:
    """Download *url* and return its parsed HTML.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(
        url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, HTML_PARSER)


def _normalise_address(text: str) -> str:
    """Collapse runs of whitespace and ignore case so addresses compare loosely."""
    return " ".join(text.split()).casefold()


def _find_certificate_url(address_links: dict, address: str, postcode: str) -> str:
    """Return the certificate URL listed for ``"<address>, <postcode>"``.

    An exact match on the link text is tried first; if that fails, the
    comparison is repeated ignoring case and whitespace differences.

    Raises:
        KeyError: if no listed address matches.
    """
    wanted = address + ', ' + postcode
    if wanted in address_links:
        return address_links[wanted]

    normalised_links = {
        _normalise_address(label): url for label, url in address_links.items()
    }
    try:
        return normalised_links[_normalise_address(wanted)]
    except KeyError:
        raise KeyError(
            f"No EPC certificate listed for {wanted!r} "
            f"({len(address_links)} addresses found for postcode {postcode!r})"
        ) from None


def _split_rating(sentence: str) -> tuple:
    """Return ``(rating, efficiency)`` taken from one rating sentence.

    The rating is the sixth-from-last and the efficiency the last
    space-separated token of *sentence*.
    NOTE(review): this presumably relies on each sentence ending in
    "<band> with a score of <number>" - confirm against the live page.

    Raises:
        ValueError: if the sentence has fewer than six tokens.
    """
    tokens = sentence.split(' ')
    if len(tokens) < 6:
        raise ValueError(f"Unexpected EPC rating text: {sentence!r}")
    return tokens[-6], tokens[-1]


def retrieve_find_my_epc_data(postcode: str, address: str) -> pd.DataFrame:
    """
    For a postcode and address, pull out all the required data from the find my epc website.

    The postcode search page is fetched first, the link whose text equals
    ``"<address>, <postcode>"`` is followed, and the certificate page is
    scraped for the current/potential rating and the assessor's details.

    Args:
        postcode: Postcode to search for, e.g. ``"BB1 1XD"``.
        address: Address as listed on the search results page, without the
            trailing postcode.

    Returns:
        A one-row DataFrame with the columns ``address``, ``epc_certificate``,
        ``current_epc_rating``, ``current_epc_efficiency``,
        ``potential_epc_rating``, ``potential_epc_efficiency``,
        ``assessor_name``, ``assessor_number`` and ``assessor_email``.

    Raises:
        requests.HTTPError: if either page request returns an error status.
        KeyError: if the address is not listed for the postcode.
        ValueError: if the certificate page lacks the expected rating
            description or assessor details.
    """
    # quote_plus turns spaces into "+" (as before) and also escapes any other
    # character that is not safe inside a query string.
    search_url = SEARCH_POSTCODE_URL.format(postcode_input=quote_plus(postcode))
    search_page = _fetch_page(search_url)

    # Map each listed address (the link text) to its absolute certificate URL.
    link_tags = search_page.find_all('a', {'class': 'govuk-link', 'rel': 'nofollow'})
    address_links = {
        tag.text.strip(): BASE_ENERGY_URL + tag['href'] for tag in link_tags
    }

    chosen_epc = _find_certificate_url(address_links, address, postcode)

    # The certificate identifier is the last path segment of the URL.
    epc_certificate = chosen_epc.split('/')[-1]

    certificate_page = _fetch_page(chosen_epc)

    print("## Energy rating - current and potential")
    rating_desc = certificate_page.find('desc', {'id': 'svg-desc'})
    if rating_desc is None:
        raise ValueError(f"No energy rating description found on {chosen_epc}")

    # The first sentence is used as the current rating, the second as the
    # potential rating.
    sentences = rating_desc.text.split(".")
    if len(sentences) < 2:
        raise ValueError(f"Unexpected EPC rating text: {rating_desc.text!r}")
    current_rating, potential_rating = sentences[0], sentences[1]

    print('### Current EPC rating')
    print("##### " + current_rating)

    print('### Potential EPC rating')
    print("##### " + potential_rating)

    current_band, current_efficiency = _split_rating(current_rating)
    potential_band, potential_efficiency = _split_rating(potential_rating)

    print("Find assessor")
    assessor_block = certificate_page.find('div', {'class': 'epc-contact-assessor'})
    if assessor_block is None:
        raise ValueError(f"No assessor contact details found on {chosen_epc}")
    assessor_fields = assessor_block.find_all(
        'dd', {"class": 'govuk-summary-list__value govuk-!-width-one-half'}
    )
    if len(assessor_fields) < 3:
        raise ValueError(
            f"Expected name, number and email for the assessor on {chosen_epc}, "
            f"found {len(assessor_fields)} field(s)"
        )
    # The first three fields are read as name, number and email, in that order.
    assessor_name, assessor_number, assessor_email = (
        field.text.strip() for field in assessor_fields[:3]
    )

    new_property_df = pd.DataFrame(
        {
            'address': [address],
            'epc_certificate': [epc_certificate],
            'current_epc_rating': [current_band],
            'current_epc_efficiency': [current_efficiency],
            'potential_epc_rating': [potential_band],
            'potential_epc_efficiency': [potential_efficiency],
            'assessor_name': [assessor_name],
            'assessor_number': [assessor_number],
            'assessor_email': [assessor_email],
        }
    )

    return new_property_df

    # TODO: work in progress, not yet enabled - scrape the recommended
    # improvements listed on the certificate page.
    #
    # print('### Changes that can be made:')
    # improvements = address_res.find('div', {"class": "govuk-body printable-area epb-recommended-improvements"})

    # if improvements is None:
    #     print("No changes suggested")
    # else:
    #     changes = improvements.find_all('h3')
    #     changes_impact = improvements.find_all('dl', {"class": 'govuk-summary-list'})

    #     for element in zip(changes, changes_impact):
    #         improvement_header = element[0].text
    #         print("#### " + improvement_header)

    #         improvement_text = element[1].text
    #         print(improvement_text)

    #         col_name = improvement_header.split(":")[1]
    #         cost = element[1].find('dd', {"class": "govuk-summary-list__value"}).text.lstrip().rstrip()

    #         impact = element[1].find('text', {"class": "govuk-!-font-weight-bold"}).text.split(" ")
    #         impact_num = impact[0]
    #         impact_cat = impact[1]
    #         print(cost)
    #         new_property_df[col_name] = True
    #         # cost_column = col_name + '-cost'
    #         # new_property_df.assign(cost_column=cost)
    #         new_property_df[col_name + '-cost'] = cost
    #         new_property_df[col_name + '-impact_num'] = impact_num
    #         new_property_df[col_name + '-impact_cat'] = impact_cat

    # data = pd.concat([data, new_property_df])
    # data.to_csv('./portfolio.csv')


def main(
    input_path: str = "places_for_people_EPC_data.xlsx",
    output_path: str = "find_my_epc_data.parquet",
    limit: Optional[int] = 2,
) -> None:
    """
    Main pipeline function to take in a predefined list of properties and extract names of contractors.

    Args:
        input_path: Excel workbook with one property per row; the columns
            ``POSTCODE`` and ``Matched EPC Address`` are read.
        output_path: Parquet file the combined results are written to.
        limit: Only the first ``limit`` rows are processed (default 2, as in
            the original script); pass ``None`` to process every row.

    Raises:
        ValueError: if there are no properties to process.
    """

    # Load in list of properties
    # addresses = [
    #     {
    #         "postcode": "BB1 1XD",
    #         "address": "5 Wasdale Avenue, Blackburn"
    #     },
    #     {
    #         "postcode": "BB1 8ED",
    #         "address": "21 Carlton Road"
    #     }
    # ]

    addresses_df = pd.read_excel(input_path)
    if limit is not None:
        addresses_df = addresses_df.head(limit)

    find_my_epc_data_list = [
        retrieve_find_my_epc_data(
            postcode=row['POSTCODE'],
            address=row['Matched EPC Address'],
        )
        for _, row in addresses_df.iterrows()
    ]

    # for address in tqdm(addresses):
    #     address_data = retrieve_find_my_epc_data(
    #         postcode=address['postcode'],
    #         address=address['address']
    #     )

    #     find_my_epc_data_list.append(address_data)

    if not find_my_epc_data_list:
        raise ValueError(f"No properties to process in {input_path!r}")

    # Every per-property frame has the single index label 0; renumber so each
    # row of the combined table gets its own index.
    find_my_epc_data = pd.concat(find_my_epc_data_list, ignore_index=True)

    find_my_epc_data.to_parquet(output_path)


if __name__ == "__main__":
    main()