Add fix for missing UPRNs

This commit is contained in:
Michael Duong 2024-05-16 19:48:53 +01:00
parent c9891b8023
commit 982499962f

View file

@ -10,7 +10,7 @@ from tqdm import tqdm
SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
def retrieve_find_my_epc_data(postcode: str, address: str):
def retrieve_find_my_epc_data(uprn: int, postcode: str, address: str):
"""
For a post code and address, we pull out all the required data from the find my epc website
"""
@ -46,12 +46,15 @@ def retrieve_find_my_epc_data(postcode: str, address: str):
print("##### " + potential_rating)
new_property_df = pd.DataFrame(
{'address': [address],
'epc_certificate': [epc_certificate],
'current_epc_rating': [current_rating.split(' ')[-6]],
'current_epc_efficiency': [current_rating.split(' ')[-1]],
'potential_epc_rating': [potential_rating.split(' ')[-6]],
"potential_epc_efficiency": [potential_rating.split(' ')[-1]]}
{
'uprn': [uprn],
'address': [address],
'epc_certificate': [epc_certificate],
'current_epc_rating': [current_rating.split(' ')[-6]],
'current_epc_efficiency': [current_rating.split(' ')[-1]],
'potential_epc_rating': [potential_rating.split(' ')[-6]],
"potential_epc_efficiency": [potential_rating.split(' ')[-1]]
}
)
print("Find assessor")
@ -109,11 +112,17 @@ def main():
"""
# Load in list of properties
addresses_df = pd.read_excel("places_for_people_EPC_data.xlsx")
base_addresses_df = pd.read_excel("places_for_people_EPC_data.xlsx")
addresses_df = base_addresses_df[~base_addresses_df['uprn'].isnull()].copy().reset_index(drop=True)
addresses_df['uprn'] = addresses_df['uprn'].astype(int)
find_my_epc_data_list = []
for i, row in tqdm(addresses_df.iterrows()):
for i, row in tqdm(addresses_df.tail(3).iterrows()):
address_data = retrieve_find_my_epc_data(
uprn=row['uprn'],
postcode=row['POSTCODE'],
address=row['Matched EPC Address']
)
@ -124,5 +133,9 @@ def main():
find_my_epc_data.to_parquet('find_my_epc_data.parquet')
final_df = pd.merge(left=base_addresses_df, right=find_my_epc_data, left_on='Matched EPC Address', right_on='address', how='left')
final_df.to_parquet('final_df.parquet')
if __name__ == "__main__":
main()