From 14417c37dfe9dcbe5ba717d84e83199c2d58181f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 2 Jul 2024 18:13:23 +0100
Subject: [PATCH] error analysis - not working though

---
 etl/bill_savings/EnergyConsumptionModel.py | 60 +++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py
index 51972a36..27fcc518 100644
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -15,7 +15,8 @@ class EnergyConsumptionModel:
     FEATURES = {
         "heating_kwh": [
             "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
+            "heating-cost-current", "total-floor-area", "number-heated-rooms",
+            # "number-habitable-rooms",
             # "mainheat-energy-eff", "mainheat-description", "main-fuel",
         ],
         "hot_water_kwh": [
@@ -214,6 +215,63 @@ class EnergyConsumptionModel:
 
         return new_data
 
+    def error_analysis(self, target, top_n=10, unique_threshold=0.8):
+        """
+        Correlate model features with the residuals of the `top_n` worst-predicted train and test rows.
+
+        :param target: key into `self.models` and the `self.x_*` / `self.y_*` train and test splits
+        :param top_n: number of rows with the largest absolute residual to inspect per split
+        :param unique_threshold: drop object columns whose unique/row ratio in the top train rows reaches this
+        :return: dict with "train_corr", "test_corr", "top_train_data" and "top_test_data"
+        """
+        y_train_pred = self.models[target].predict(self.x_train[target])
+        y_test_pred = self.models[target].predict(self.x_test[target])
+        train_residuals = self.y_train[target] - y_train_pred
+        test_residuals = self.y_test[target] - y_test_pred
+
+        # Identify top N poorly performing rows by absolute residuals
+        top_train_indices = train_residuals.abs().nlargest(top_n).index
+        top_test_indices = test_residuals.abs().nlargest(top_n).index
+        top_train_data = self.input_data.loc[top_train_indices]
+        top_test_data = self.input_data.loc[top_test_indices]
+
+        def exclude_columns(data, threshold):
+            num_rows = data.shape[0]
+            if num_rows == 0:
+                # Nothing selected (e.g. top_n=0): the ratio below would raise ZeroDivisionError
+                return []
+            return [
+                col for col in data.columns
+                if data[col].dtype == 'object' and data[col].nunique() / num_rows >= threshold
+            ]
+
+        exclude_cols = exclude_columns(top_train_data, unique_threshold)
+        top_train_data = top_train_data.drop(columns=exclude_cols)
+        top_test_data = top_test_data.drop(columns=exclude_cols)
+
+        # TODO: Not working
+        # NOTE(review): possible cause - reindex() below fills every model column missing from the encoded
+        # frame with a constant 0, and a constant column has no defined correlation (NaN). Confirm.
+        # One-hot encode categorical variables
+        categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist()
+        top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True)
+        top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True)
+
+        # Align the encoded data with the training data
+        top_train_data_encoded = top_train_data_encoded.reindex(columns=self.x_train[target].columns, fill_value=0)
+        top_test_data_encoded = top_test_data_encoded.reindex(columns=self.x_test[target].columns, fill_value=0)
+
+        # Correlation analysis with residuals
+        train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices])
+        test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices])
+
+        return {
+            "train_corr": train_corr,
+            "test_corr": test_corr,
+            "top_train_data": top_train_data,
+            "top_test_data": top_test_data,
+        }
+
 
 # Example usage:
 model = EnergyConsumptionModel()