From 523ca28b686da9ef292fcb2dd3f88cb268936d52 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 5 Jul 2024 12:49:23 +0100
Subject: [PATCH] Add new score_new_data function and set up training
 script

---
 etl/bill_savings/EnergyConsumptionModel.py | 104 ++++-----------------
 etl/bill_savings/training.py               |   5 +
 2 files changed, 24 insertions(+), 85 deletions(-)
 create mode 100644 etl/bill_savings/training.py

diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py
index b616be08..14ece803 100644
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -26,8 +26,6 @@ class EnergyConsumptionModel:
             "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
             "low-energy-lighting", "environment-impact-current", "energy-tariff",
             "county", "construction-age-band", "co2-emissions-current",
-            # TODO: Testing
-            "lighting-cost-current", "hot-water-cost-current", "current-energy-rating"
         ],
         "hot_water_kwh": [
             "lodgement-year", "lodgement-month",
@@ -144,9 +142,9 @@ class EnergyConsumptionModel:
             self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
 
         # Modify number of heated rooms and number of habitable rooms
-        self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
-            lambda x: "16_or_more" if x > 15 else str(x)
-        )
+        # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
+        #     lambda x: "16_or_more" if x > 15 else str(x)
+        # )
         # self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
         #     lambda x: "10+" if x > 10 else str(x)
         # )
@@ -398,93 +396,29 @@ class EnergyConsumptionModel:
         if target not in self.models:
             raise ValueError(f"Model for target {target} not loaded or trained")
 
-        new_data_transformed = self.transform_new_data(new_data, target)
-        return self.models[target].predict(new_data_transformed)
+        # Verify that self.data is None
+        if self.data is not None:
+            raise ValueError("self.data is not None. Ensure that self.data is reset before scoring new data.")
 
-    def transform_new_data(self, new_data, target):
-        """Applies the same transformations to new data as were applied to the training data."""
+        # Temporarily set self.data to new data
+        self.data = new_data.copy()
 
-        # TODO THis should jsut use our other transformation function
-        new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
-        new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
-        new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
+        # Run feature engineering
+        self.feature_engineering()
 
-        # Convert categorical columns to dummies
-        new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
+        # Select the transformed data
+        new_data_transformed = self.data[self.dummy_columns[target]]
 
-        # Align new data with the dummy columns from training data
-        new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)
+        # Ensure the columns match the selected features
+        new_data_transformed = new_data_transformed[self.selected_features[target]]
 
-        # Select the features used by the model
-        new_data = new_data[self.selected_features[target]]
+        # Generate predictions
+        predictions = self.models[target].predict(new_data_transformed)
 
-        return new_data
+        # Reset self.data to None
+        self.data = None
 
-    def error_analysis(self, target, top_n=10, unique_threshold=0.8):
-        """
-        Perform error analysis on the provided model and dataset.
-
-        Parameters:
-        - target: The target variable to analyze.
-        - top_n: Number of top residuals to consider for analysis.
-        - unique_threshold: Threshold to exclude columns with high unique values.
-
-        Returns:
-        - summary: Dictionary summarizing common features among poorly performing rows.
-        """
-
-        # Calculate predictions and residuals
-        y_train_pred = self.models[target].predict(self.x_train[target])
-        y_test_pred = self.models[target].predict(self.x_test[target])
-
-        train_residuals = self.y_train[target] - y_train_pred
-        test_residuals = self.y_test[target] - y_test_pred
-
-        # Identify top N poorly performing rows by absolute residuals
-        top_train_indices = train_residuals.abs().nlargest(top_n).index
-        top_test_indices = test_residuals.abs().nlargest(top_n).index
-
-        top_train_data = self.input_data.loc[top_train_indices]
-        top_test_data = self.input_data.loc[top_test_indices]
-
-        # Automatically detect and exclude columns
-        def exclude_columns(data, threshold):
-            exclude_cols = []
-            num_rows = data.shape[0]
-            for col in data.columns:
-                if data[col].dtype == 'object' and data[col].nunique() / num_rows >= threshold:
-                    exclude_cols.append(col)
-            return exclude_cols
-
-        exclude_cols = exclude_columns(top_train_data, unique_threshold)
-
-        top_train_data = top_train_data.drop(columns=exclude_cols)
-        top_test_data = top_test_data.drop(columns=exclude_cols)
-
-        # One-hot encode categorical variables
-        categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist()
-        top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True)
-        top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True)
-
-        # Ensure all original columns are included in the encoded data
-        top_train_data_encoded = top_train_data_encoded.reindex(columns=self.input_data.columns, fill_value=0)
-        top_test_data_encoded = top_test_data_encoded.reindex(columns=self.input_data.columns, fill_value=0)
-
-        # Correlation analysis with residuals
-        train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices])
-        test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices])
-
-        # Return summaries
-        summary = {
-            "train_summary": top_train_data.describe(include='all').T,
-            "test_summary": top_test_data.describe(include='all').T,
-            "train_corr": train_corr,
-            "test_corr": test_corr,
-            "top_train_data": top_train_data,
-            "top_test_data": top_test_data
-        }
-
-        return summary
+        return predictions
 
 
 # Usage:
diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py
new file mode 100644
index 00000000..2c29c317
--- /dev/null
+++ b/etl/bill_savings/training.py
@@ -0,0 +1,5 @@
+def hanlder():
+    """
+    Intended to train the model and store the final models in S3 as pickles (no implementation yet).
+    :return: None. NOTE(review): the name looks like a typo of "handler" -- confirm before wiring it up.
+    """