# Method of ``EnergyConsumptionModel`` (etl/bill_savings/EnergyConsumptionModel.py),
# recovered from the whitespace-collapsed patch "error analysis - not working
# though". The same patch also comments out "number-habitable-rooms" in
# FEATURES["heating_kwh"]; that hunk is not reproduced here.
def error_analysis(self, target, top_n=10, unique_threshold=0.8):
    """Summarise the rows on which the model for ``target`` performs worst.

    For both the train and the test split, the rows with the largest
    absolute residual (actual minus predicted) are selected. Their residuals
    are correlated with the model's own feature columns, and the matching
    rows of ``self.input_data`` are returned for manual inspection.

    Args:
        target: Key into ``self.models``, ``self.x_train``, ``self.x_test``,
            ``self.y_train`` and ``self.y_test``.
        top_n: Number of worst rows to keep per split.
        unique_threshold: An ``object`` column of the selected training rows
            is dropped from the returned raw data when its number of distinct
            values divided by the number of selected rows reaches this value
            (near-unique text such as identifiers).

    Returns:
        dict with the keys ``"train_corr"`` and ``"test_corr"`` (Series
        indexed by the feature columns of the respective split; ``NaN`` for
        columns that are non-numeric or constant within the selected rows)
        and ``"top_train_data"`` / ``"top_test_data"`` (rows of
        ``self.input_data``, worst first, minus the near-unique columns).
    """
    model = self.models[target]

    def worst_rows(features, actual):
        """Return (residuals, feature rows) of the ``top_n`` worst rows."""
        residuals = actual - model.predict(features)
        # Rank by position rather than by label: the subtraction above
        # already pairs row i of `actual` with row i of `features`, so
        # positions are the one alignment that is certain to be valid.
        ranked = residuals.abs().reset_index(drop=True).nlargest(top_n)
        positions = list(ranked.index)
        worst_residuals = residuals.iloc[positions]
        # pd.DataFrame(...) leaves a DataFrame as it is and also accepts a
        # plain 2-D array of features.
        worst_features = pd.DataFrame(features).iloc[positions]
        return worst_residuals, worst_features

    def residual_correlations(worst_residuals, worst_features):
        """Correlate every usable feature column with the residuals."""
        numeric = worst_features.select_dtypes(include=["number", "bool"])
        numeric = numeric.astype(float).reset_index(drop=True)
        errors = worst_residuals.reset_index(drop=True)
        # A column that is constant within the selection has no defined
        # correlation; skip it instead of producing NaN plus a warning.
        varying = numeric.loc[:, (numeric.nunique() > 1).to_numpy()]
        if varying.shape[1] == 0 or len(errors) < 2:
            correlations = pd.Series(dtype=float)
        else:
            correlations = varying.corrwith(errors)
        # Report one entry per model feature, NaN where nothing was computed.
        return correlations.reindex(worst_features.columns)

    def near_unique_text_columns(frame):
        """List ``object`` columns that are (almost) unique per row."""
        row_count = len(frame)
        if row_count == 0:
            # Nothing selected: avoid dividing by zero below.
            return []
        return [
            name
            for name in frame.columns
            if frame[name].dtype == "object"
            and frame[name].nunique() / row_count >= unique_threshold
        ]

    train_residuals, train_features = worst_rows(
        self.x_train[target], self.y_train[target]
    )
    test_residuals, test_features = worst_rows(
        self.x_test[target], self.y_test[target]
    )

    # The correlations use the feature matrices the model was fitted and
    # evaluated on. Re-encoding only the selected raw rows (the previous
    # approach) yields dummy columns that depend on which rows were selected
    # and therefore do not line up with the model's columns.
    train_corr = residual_correlations(train_residuals, train_features)
    test_corr = residual_correlations(test_residuals, test_features)

    # Raw rows for inspection; the residual index labels are looked up in
    # self.input_data.
    top_train_data = self.input_data.loc[train_residuals.index]
    top_test_data = self.input_data.loc[test_residuals.index]

    # The columns to drop are decided on the training selection and applied
    # to both frames so that they keep identical columns.
    dropped = near_unique_text_columns(top_train_data)
    top_train_data = top_train_data.drop(columns=dropped)
    top_test_data = top_test_data.drop(columns=dropped)

    return {
        "train_corr": train_corr,
        "test_corr": test_corr,
        "top_train_data": top_train_data,
        "top_test_data": top_test_data,
    }