From d65ce731c06e8f31f4d6c495da9b9ec86531faf6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 10 Aug 2024 02:18:27 +0100
Subject: [PATCH] Add per-directory error handling to EPC data collection and fix model import

---
 etl/bill_savings/data_collection.py | 91 +++++++++++++++--------------
 etl/bill_savings/training.py        |  2 +-
 2 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index 0341b885..a073a70e 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -132,51 +132,56 @@ def app():
 
     energy_consumption_data = []
     for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
-
-        # Skip the first 50
-        if i < 18:
-            continue
-
-        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
-        # Rename the columns to the same format as the api returns
-        data.columns = [c.replace("_", "-").lower() for c in data.columns]
-
-        # Take just date before the date threshold
-        data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
-
-        data = data[~pd.isnull(data["uprn"])]
-        # Take just the newest EPC per uprn, based on lodgement-date
-        data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
-
-        data = data.sample(sample_size, replace=False)
-        # We use the addreess data to find the related information
-
-        collected_data = []
-        for _, property_data in data.iterrows():
-            time.sleep(np.random.uniform(0.2, 1.5))
-
-            uprn = int(property_data["uprn"])
-            address = property_data["address1"]
-            postcode = property_data["postcode"]
-            expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
-
-            response = retrieve_find_my_epc_data(
-                uprn=uprn,
-                postcode=postcode,
-                address=address,
-                expected_expiry_date=expected_expiry_date
-            )
-            if response is None:
+        try:
+            # Skip the first 40 directories
+            if i < 40:
                 continue
-            collected_data.append(
-                {
-                    **response,
-                    "epc": property_data.to_dict(),
-                    "epc_directory": str(directory)
-                }
-            )
 
-        energy_consumption_data.extend(collected_data)
+            data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+            # Rename the columns to the same format as the api returns
+            data.columns = [c.replace("_", "-").lower() for c in data.columns]
+
+            # Keep only rows lodged on or after the earliest date threshold
+            data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
+
+            data = data[~pd.isnull(data["uprn"])]
+            # Take just the newest EPC per uprn, based on lodgement-date
+            data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
+
+            data = data.sample(sample_size, replace=False)
+            # We use the address data to find the related information
+
+            collected_data = []
+            for _, property_data in data.iterrows():
+                time.sleep(np.random.uniform(0.2, 1.5))
+
+                uprn = int(property_data["uprn"])
+                address = property_data["address1"]
+                postcode = property_data["postcode"]
+                expected_expiry_date = calculate_expiry_date(property_data["lodgement-date"])
+
+                response = retrieve_find_my_epc_data(
+                    uprn=uprn,
+                    postcode=postcode,
+                    address=address,
+                    expected_expiry_date=expected_expiry_date
+                )
+                if response is None:
+                    continue
+                collected_data.append(
+                    {
+                        **response,
+                        "epc": property_data.to_dict(),
+                        "epc_directory": str(directory)
+                    }
+                )
+
+            energy_consumption_data.extend(collected_data)
+        except Exception as e:
+            print(f"Error for directory {directory}: {e}")
+            # If we have an error, then we wait for a bit since it's likely due to timeout
+            time.sleep(300)
+            continue
 
     # Store the pickle in s3
     save_time = datetime.now()
diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py
index 5d89a79e..df60298b 100644
--- a/etl/bill_savings/training.py
+++ b/etl/bill_savings/training.py
@@ -1,7 +1,7 @@
 from pprint import pprint
 import msgpack
 from utils.s3 import read_from_s3
-from training_data.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
+from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
 
 
 def handler():