diff --git a/etl/epc_recommendations/find_my_epc_pipeline.py b/etl/epc_recommendations/find_my_epc_pipeline.py index 778df390..e7b55d58 100644 --- a/etl/epc_recommendations/find_my_epc_pipeline.py +++ b/etl/epc_recommendations/find_my_epc_pipeline.py @@ -10,7 +10,7 @@ from tqdm import tqdm SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}" BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk" -def retrieve_find_my_epc_data(postcode: str, address: str): +def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str): """ For a post code and address, we pull out all the required data from the find my epc website """ @@ -46,12 +46,15 @@ def retrieve_find_my_epc_data(postcode: str, address: str): print("##### " + potential_rating) new_property_df = pd.DataFrame( - {'address': [address], - 'epc_certificate': [epc_certificate], - 'current_epc_rating': [current_rating.split(' ')[-6]], - 'current_epc_efficiency': [current_rating.split(' ')[-1]], - 'potential_epc_rating': [potential_rating.split(' ')[-6]], - "potential_epc_efficiency": [potential_rating.split(' ')[-1]]} + { + 'uprn': [uprn], + 'address': [address], + 'epc_certificate': [epc_certificate], + 'current_epc_rating': [current_rating.split(' ')[-6]], + 'current_epc_efficiency': [current_rating.split(' ')[-1]], + 'potential_epc_rating': [potential_rating.split(' ')[-6]], + "potential_epc_efficiency": [potential_rating.split(' ')[-1]] + } ) print("Find assessor") @@ -109,11 +112,17 @@ def main(): """ # Load in list of properties - addresses_df = pd.read_excel("places_for_people_EPC_data.xlsx") + base_addresses_df = pd.read_excel("places_for_people_EPC_data.xlsx") + + addresses_df = base_addresses_df[~base_addresses_df['uprn'].isnull()].copy().reset_index(drop=True) + + addresses_df['uprn'] = addresses_df['uprn'].astype(int) find_my_epc_data_list = [] - for i, row in tqdm(addresses_df.iterrows()): + for i, row in tqdm(addresses_df.tail(3).iterrows()): + address_data = retrieve_find_my_epc_data( + uprn=row['uprn'], postcode=row['POSTCODE'], address=row['Matched EPC Address'] ) @@ -124,5 +133,9 @@ def main(): find_my_epc_data.to_parquet('find_my_epc_data.parquet') + final_df = pd.merge(left=base_addresses_df, right=find_my_epc_data, left_on='Matched EPC Address', right_on='address', how='left') + + final_df.to_parquet('final_df.parquet') + if __name__ == "__main__": main() \ No newline at end of file