From 7790822e76cd968e0af10b76ff451e73d2362b56 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 2 Jul 2024 16:06:01 +0100
Subject: [PATCH] making the data objects dictionaries for different targets

---
 etl/bill_savings/EnergyConsumptionModel.py | 109 +++++++++++++++------
 etl/bill_savings/data_collection.py        |   6 +-
 2 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py
index 2ca88da5..d2c77e48 100644
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -3,51 +3,87 @@ from datetime import datetime
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_squared_error, r2_score
-from utils.s3 import save_pickle_to_s3, read_pickle_from_s3
+from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
 
 
 class EnergyConsumptionModel:
-    FEATURES = ['feature_1', 'feature_2']
+    FEATURES = {
+        "heating_kwh": [
+            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
+            "heating-cost-current",
+        ],
+        "hot_water_kwh": [
+            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
+            "hot-water-cost-current"
+        ]
+    }
     TARGETS = ['heating_kwh', 'hot_water_kwh']
+    CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
+    NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
+                         "hot-water-cost-current"]
 
     def __init__(self, model_paths=None):
         self.models = {}
         self.model_paths = model_paths or {}
         self.data = None
+        self.dummy_columns = None
 
-        self.X_train = None
-        self.X_test = None
-        self.y_train = None
-        self.y_test = None
+        self.x_train = {}
+        self.x_test = {}
+        self.y_train = {}
+        self.y_test = {}
 
         if model_paths:
             for target, path in model_paths.items():
                 self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)
 
     def read_dataset(self, file_path):
-        self.data = pd.read_csv(file_path)
+        self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
 
     def feature_engineering(self):
-        # Example feature engineering steps
-        self.data['feature_1'] = self.data['original_feature_1'] ** 2
-        self.data['feature_2'] = self.data['original_feature_2'] ** 0.5
-        # Add more feature engineering steps as required
+        # Extract date features
+        self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
+        self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
+        self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
+
+        # Convert data types
+        self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
+        self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
+
+        # Convert categorical columns to dummies
+        self.data = pd.get_dummies(self.data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
+
+        # Store the dummy columns
+        self.dummy_columns = {}
+        for target in self.TARGETS:
+            target_features = self.FEATURES[target]
+            dummy_feature_columns = []
+            for feature in target_features:
+                if feature in self.CATEGORICAL_COLUMNS:
+                    dummy_feature_columns.extend([col for col in self.data.columns if col.startswith(feature + '_')])
+                else:
+                    dummy_feature_columns.append(feature)
+            self.dummy_columns[target] = dummy_feature_columns
 
     def split_dataset(self, target, test_size=0.2, random_state=42):
-        X = self.data[self.FEATURES]
+
+        if target not in self.TARGETS:
+            raise ValueError(f"Target {target} not in {self.TARGETS}")
+
+        x = self.data[self.dummy_columns[target]]
         y = self.data[target]
-        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
-            X, y, test_size=test_size, random_state=random_state
+        self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
+            x, y, test_size=test_size, random_state=random_state
         )
 
     def fit_model(self, target):
         self.models[target] = LinearRegression()
-        self.models[target].fit(self.X_train, self.y_train)
+        self.models[target].fit(self.x_train[target], self.y_train[target])
 
     def evaluate_model(self, target):
-        y_pred = self.models[target].predict(self.X_test)
-        mse = mean_squared_error(self.y_test, y_pred)
-        r2 = r2_score(self.y_test, y_pred)
+        y_pred = self.models[target].predict(self.x_test[target])
+        mse = mean_squared_error(self.y_test[target], y_pred)
+        r2 = r2_score(self.y_test[target], y_pred)
         return {'MSE': mse, 'R2': r2}
 
     def save_model(self, target):
@@ -67,23 +103,32 @@ class EnergyConsumptionModel:
 
     def transform_new_data(self, new_data):
         # Apply the same transformations as in feature_engineering
-        new_data['feature_1'] = new_data['original_feature_1'] ** 2
-        new_data['feature_2'] = new_data['original_feature_2'] ** 0.5
-        return new_data[self.FEATURES]
+        new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
+        new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
+        new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
+
+        # Convert categorical columns to dummies
+        new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
+
+        # Align new data with the dummy columns from training data
+        new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
+
+        return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
+
 
 # Example usage:
-# model = EnergyConsumptionModel()
-# model.read_dataset('/mnt/data/energy_consumption_dataset.csv')
-# model.feature_engineering()
+model = EnergyConsumptionModel()
+model.read_dataset('energy_consumption/2024-07-02/energy_consumption_dataset.parquet')
+model.feature_engineering()
 
 # For heating_kwh
-# model.split_dataset(target='heating_kwh')
-# model.fit_model(target='heating_kwh')
-# print(model.evaluate_model(target='heating_kwh'))
-# model.save_model(target='heating_kwh')
+model.split_dataset(target='heating_kwh')
+model.fit_model(target='heating_kwh')
+print(model.evaluate_model(target='heating_kwh'))
+model.save_model(target='heating_kwh')
 
 # For hot_water_kwh
-# model.split_dataset(target='hot_water_kwh')
-# model.fit_model(target='hot_water_kwh')
-# print(model.evaluate_model(target='hot_water_kwh'))
-# model.save_model(target='hot_water_kwh')
+model.split_dataset(target='hot_water_kwh')
+model.fit_model(target='hot_water_kwh')
+print(model.evaluate_model(target='hot_water_kwh'))
+model.save_model(target='hot_water_kwh')
diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index 3b503122..79afa936 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -132,6 +132,9 @@ def app():
 
     energy_consumption_data = []
     for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
+        # Skip the first 50
+        if i < 50:
+            continue
 
         data = pd.read_csv(directory / "certificates.csv", low_memory=False)
         # Rename the columns to the same format as the api returns
@@ -148,8 +151,7 @@ def app():
 
         collected_data = []
         for _, property_data in data.iterrows():
-            # Sleep for a random time between 0.1 and 1.4 seconds
-            time.sleep(np.random.uniform(0.1, 1.4))
+            time.sleep(np.random.uniform(0.3, 2))
 
             uprn = int(property_data["uprn"])
             address = property_data["address1"]