mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
add find my epc pipeline
This commit is contained in:
parent
162c1200e2
commit
2c8f0c2251
1 changed files with 149 additions and 0 deletions
149
etl/epc_recommendations/find_my_epc_pipeline.py
Normal file
149
etl/epc_recommendations/find_my_epc_pipeline.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
# This script takes in a list of properties
|
||||
# Will be postcode and address
|
||||
|
||||
import requests
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm
|
||||
|
||||
SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
|
||||
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
|
||||
|
||||
def retrieve_find_my_epc_data(postcode: str, address: str):
|
||||
"""
|
||||
For a post code and address, we pull out all the required data from the find my epc website
|
||||
"""
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
|
||||
postcode_input = postcode.replace(" ", "+")
|
||||
postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
|
||||
postcode_response = requests.get(postcode_search, headers=headers)
|
||||
|
||||
postcode_res = BeautifulSoup(postcode_response.text)
|
||||
address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'})
|
||||
address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in address_links_full}
|
||||
|
||||
chosen_epc = address_links[address + ', ' + postcode]
|
||||
|
||||
# # TODO: get the address match working properly
|
||||
# chosen_epc = address_links[list(address_links.keys())[2]]
|
||||
|
||||
epc_certificate = chosen_epc.split('/')[-1]
|
||||
|
||||
address_response = requests.get(chosen_epc, headers=headers)
|
||||
address_res = BeautifulSoup(address_response.text)
|
||||
|
||||
|
||||
print("## Energy rating - current and potential")
|
||||
ratings = address_res.find('desc', {'id': 'svg-desc'}).text
|
||||
|
||||
print('### Current EPC rating')
|
||||
current_rating = ratings.split(".")[0]
|
||||
print("##### " + current_rating)
|
||||
|
||||
print('### Potential EPC rating')
|
||||
potential_rating = ratings.split(".")[1]
|
||||
print("##### " + potential_rating)
|
||||
|
||||
new_property_df = pd.DataFrame(
|
||||
{'address': [address],
|
||||
'epc_certificate': [epc_certificate],
|
||||
'current_epc_rating': [current_rating.split(' ')[-6]],
|
||||
'current_epc_efficiency': [current_rating.split(' ')[-1]],
|
||||
'potential_epc_rating': [potential_rating.split(' ')[-6]],
|
||||
"potential_epc_efficiency": [potential_rating.split(' ')[-1]]}
|
||||
)
|
||||
|
||||
print("Find assessor")
|
||||
assessor_block = address_res.find('div', {'class': 'epc-contact-assessor'})
|
||||
assessor_fields = assessor_block.find_all('dd', {"class": 'govuk-summary-list__value govuk-!-width-one-half'})
|
||||
assessor_name = assessor_fields[0].text.strip()
|
||||
assessor_number = assessor_fields[1].text.strip()
|
||||
assessor_email = assessor_fields[2].text.strip()
|
||||
|
||||
new_property_df['assessor_name'] = assessor_name
|
||||
new_property_df['assessor_number'] = assessor_number
|
||||
new_property_df['assessor_email'] = assessor_email
|
||||
|
||||
return new_property_df
|
||||
|
||||
# print('### Changes that can be made:')
|
||||
# improvements = address_res.find('div', {"class": "govuk-body printable-area epb-recommended-improvements"})
|
||||
|
||||
# if improvements is None:
|
||||
# print("No changes suggested")
|
||||
# else:
|
||||
# changes = improvements.find_all('h3')
|
||||
# changes_impact = improvements.find_all('dl', {"class": 'govuk-summary-list'})
|
||||
|
||||
# for element in zip(changes, changes_impact):
|
||||
# improvement_header = element[0].text
|
||||
# print("#### " + improvement_header)
|
||||
|
||||
# improvement_text = element[1].text
|
||||
# print(improvement_text)
|
||||
|
||||
# col_name = improvement_header.split(":")[1]
|
||||
# cost = element[1].find('dd', {"class": "govuk-summary-list__value"}).text.lstrip().rstrip()
|
||||
|
||||
# impact = element[1].find('text', {"class": "govuk-!-font-weight-bold"}).text.split(" ")
|
||||
# impact_num = impact[0]
|
||||
# impact_cat = impact[1]
|
||||
# print(cost)
|
||||
# new_property_df[col_name] = True
|
||||
# # cost_column = col_name + '-cost'
|
||||
# # new_property_df.assign(cost_column=cost)
|
||||
# new_property_df[col_name + '-cost'] = cost
|
||||
# new_property_df[col_name + '-impact_num'] = impact_num
|
||||
# new_property_df[col_name + '-impact_cat'] = impact_cat
|
||||
|
||||
|
||||
# data = pd.concat([data, new_property_df])
|
||||
# data.to_csv('./portfolio.csv')
|
||||
|
||||
|
||||
|
||||
def main():
    """
    Pipeline entry point: read the property spreadsheet, look up the EPC data for
    each selected row and write the combined result to a parquet file
    """

    # Hard-coded sample input that can stand in for the spreadsheet:
    # addresses = [
    #     {
    #         "postcode": "BB1 1XD",
    #         "address": "5 Wasdale Avenue, Blackburn"
    #     },
    #     {
    #         "postcode": "BB1 8ED",
    #         "address": "21 Carlton Road"
    #     }
    # ]

    properties = pd.read_excel("places_for_people_EPC_data.xlsx")

    # Only the first two spreadsheet rows are looked up (head(2)).
    scraped_frames = [
        retrieve_find_my_epc_data(
            postcode=record['POSTCODE'],
            address=record['Matched EPC Address'],
        )
        for _, record in properties.head(2).iterrows()
    ]

    # Alternative loop over the sample list above, with a progress bar:
    # for address in tqdm(addresses):
    #     address_data = retrieve_find_my_epc_data(
    #         postcode=address['postcode'],
    #         address=address['address']
    #     )
    #
    #     find_my_epc_data_list.append(address_data)

    # One row per property, stacked into a single frame and persisted.
    pd.concat(scraped_frames).to_parquet('find_my_epc_data.parquet')
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger any scraping.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue