From d65ce731c06e8f31f4d6c495da9b9ec86531faf6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 10 Aug 2024 02:18:27 +0100 Subject: [PATCH] minor --- etl/bill_savings/data_collection.py | 91 +++++++++++++++-------------- etl/bill_savings/training.py | 2 +- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 0341b885..a073a70e 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -132,51 +132,56 @@ def app(): energy_consumption_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): - - # Skip the first 50 - if i < 18: - continue - - data = pd.read_csv(directory / "certificates.csv", low_memory=False) - # Rename the columns to the same format as the api returns - data.columns = [c.replace("_", "-").lower() for c in data.columns] - - # Take just date before the date threshold - data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] - - data = data[~pd.isnull(data["uprn"])] - # Take just the newest EPC per uprn, based on lodgement-date - data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") - - data = data.sample(sample_size, replace=False) - # We use the addreess data to find the related information - - collected_data = [] - for _, property_data in data.iterrows(): - time.sleep(np.random.uniform(0.2, 1.5)) - - uprn = int(property_data["uprn"]) - address = property_data["address1"] - postcode = property_data["postcode"] - expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"]) - - response = retrieve_find_my_epc_data( - uprn=uprn, - postcode=postcode, - address=address, - expected_expiry_date=expected_expiry_date - ) - if response is None: + try: + # Skip the first 40 directories + if i < 40: continue - collected_data.append( - { - **response, - "epc": property_data.to_dict(), - "epc_directory": str(directory) - } - ) - 
energy_consumption_data.extend(collected_data) + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + # Rename the columns to the same format as the api returns + data.columns = [c.replace("_", "-").lower() for c in data.columns] + + # Keep only EPCs lodged on or after the date threshold + data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] + + data = data[~pd.isnull(data["uprn"])] + # Take just the newest EPC per uprn, based on lodgement-date + data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") + + data = data.sample(sample_size, replace=False) + # We use the address data to find the related information + + collected_data = [] + for _, property_data in data.iterrows(): + time.sleep(np.random.uniform(0.2, 1.5)) + + uprn = int(property_data["uprn"]) + address = property_data["address1"] + postcode = property_data["postcode"] + expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"]) + + response = retrieve_find_my_epc_data( + uprn=uprn, + postcode=postcode, + address=address, + expected_expiry_date=expected_expiry_date + ) + if response is None: + continue + collected_data.append( + { + **response, + "epc": property_data.to_dict(), + "epc_directory": str(directory) + } + ) + + energy_consumption_data.extend(collected_data) + except Exception as e: + print(f"Error for directory {directory}: {e}") + # If we have an error, then we wait for a bit since it's likely due to timeout + time.sleep(300) + continue # Store the pickle in s3 save_time = datetime.now() diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py index 5d89a79e..df60298b 100644 --- a/etl/bill_savings/training.py +++ b/etl/bill_savings/training.py @@ -1,7 +1,7 @@ from pprint import pprint import msgpack from utils.s3 import read_from_s3 -from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel +from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel def handler():