From 0da0d5480f9e83ee321c95aa3e85d6fd59b752c9 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 16 May 2024 18:48:56 +0100 Subject: [PATCH] add better logic --- .../find_my_epc_pipeline.py | 28 ++----------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/etl/epc_recommendations/find_my_epc_pipeline.py b/etl/epc_recommendations/find_my_epc_pipeline.py index 1001e8f5..4971c1b4 100644 --- a/etl/epc_recommendations/find_my_epc_pipeline.py +++ b/etl/epc_recommendations/find_my_epc_pipeline.py @@ -24,10 +24,8 @@ def retrieve_find_my_epc_data(postcode: str, address: str): address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'}) address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in address_links_full} - chosen_epc = address_links[address + ', ' + postcode] - - # # TODO: get the address match working properly - # chosen_epc = address_links[list(address_links.keys())[2]] + index_of_address = [key.startswith(address) for key in list(address_links.keys())] + chosen_epc = address_links[list(address_links.keys())[np.where(index_of_address)[0][0]]] epc_certificate = chosen_epc.split('/')[-1] @@ -110,21 +108,10 @@ def main(): """ # Load in list of properties - # addresses = [ - # { - # "postcode": "BB1 1XD", - # "address": "5 Wasdale Avenue, Blackburn" - # }, - # { - # "postcode": "BB1 8ED", - # "address": "21 Carlton Road" - # } - # ] - addresses_df = pd.read_excel("places_for_people_EPC_data.xlsx") find_my_epc_data_list = [] - for i, row in addresses_df.head(2).iterrows(): + for i, row in tqdm(addresses_df.iterrows()): address_data = retrieve_find_my_epc_data( postcode=row['POSTCODE'], address=row['Matched EPC Address'] @@ -132,15 +119,6 @@ def main(): find_my_epc_data_list.append(address_data) - - # for address in tqdm(addresses): - # address_data = retrieve_find_my_epc_data( - # postcode=address['postcode'], - # address=address['address'] - # ) - - # find_my_epc_data_list.append(address_data) - find_my_epc_data = pd.concat(find_my_epc_data_list) find_my_epc_data.to_parquet('find_my_epc_data.parquet')