error analysis - not working though

2026-07-27 23:35:01 +00:00 · 2024-07-02 18:13:23 +01:00 · 2024-07-02 18:13:23 +01:00 · 14417c37df
commit 14417c37df
parent 0a1f728f37
1 changed files with 59 additions and 1 deletions
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@ -15,7 +15,8 @@ class EnergyConsumptionModel:
    FEATURES = {
        "heating_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
+            "heating-cost-current", "total-floor-area", "number-heated-rooms",
+            # "number-habitable-rooms",
            # "mainheat-energy-eff", "mainheat-description", "main-fuel",
        ],
        "hot_water_kwh": [
@ -214,6 +215,63 @@ class EnergyConsumptionModel:

        return new_data

+    def error_analysis(self, target, top_n=10, unique_threshold=0.8):
+        """
+        Perform error analysis on the provided model and dataset.
+        """
+
+        # Calculate predictions and residuals
+        y_train_pred = self.models[target].predict(self.x_train[target])
+        y_test_pred = self.models[target].predict(self.x_test[target])
+
+        train_residuals = self.y_train[target] - y_train_pred
+        test_residuals = self.y_test[target] - y_test_pred
+
+        # Identify top N poorly performing rows by absolute residuals
+        top_train_indices = train_residuals.abs().nlargest(top_n).index
+        top_test_indices = test_residuals.abs().nlargest(top_n).index
+
+        top_train_data = self.input_data.loc[top_train_indices]
+        top_test_data = self.input_data.loc[top_test_indices]
+
+        def exclude_columns(data, threshold):
+            exclude_cols = []
+            num_rows = data.shape[0]
+            for col in data.columns:
+                if data[col].dtype == 'object' and data[col].nunique() / num_rows >= threshold:
+                    exclude_cols.append(col)
+            return exclude_cols
+
+        exclude_cols = exclude_columns(top_train_data, unique_threshold)
+
+        top_train_data = top_train_data.drop(columns=exclude_cols)
+        top_test_data = top_test_data.drop(columns=exclude_cols)
+
+        # TODO: Not working
+
+        # One-hot encode categorical variables
+        categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist()
+        top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True)
+        top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True)
+
+        # Align the encoded data with the training data
+        top_train_data_encoded = top_train_data_encoded.reindex(columns=self.x_train[target].columns, fill_value=0)
+        top_test_data_encoded = top_test_data_encoded.reindex(columns=self.x_test[target].columns, fill_value=0)
+
+        # Correlation analysis with residuals
+        train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices])
+        test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices])
+
+        # Return summaries
+        summary = {
+            "train_corr": train_corr,
+            "test_corr": test_corr,
+            "top_train_data": top_train_data,
+            "top_test_data": top_test_data
+        }
+
+        return summary
+

 # Example usage:
 model = EnergyConsumptionModel()