From c59aff412cf00ea77fda5734f12d522f2a3e6b73 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Jul 2023 16:28:58 +0100 Subject: [PATCH] testing rf importance and permutation importance --- model_data/analysis/SapModel.py | 85 ++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index ca2d45b2..87b71cf6 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -156,6 +156,32 @@ class SapModel: return model_data + @staticmethod + def clean_missings(model_data): + # Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is + # potentially + # a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating + + model_data["construction-age-band"].value_counts() + + model_data["mechanical-ventilation"] = np.where( + model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"] + ) + + model_data["solar-water-heating-flag"] = np.where( + model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"] + ) + + model_data["glazed-type"] = np.where( + model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"] + ) + + model_data["glazed-area"] = np.where( + model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"] + ) + + return model_data + def create_dataset(self): model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES] model_data = model_data.reset_index(drop=True) @@ -164,38 +190,7 @@ class SapModel: # Append on u-values model_data = self._append_cleaned_data(model_data) - def clean_missings(model_data): - CLEANING_COLS = ["mechanical-ventilation", "energy-tariff", "solar-water-heating-flag", "glazed-type", ""] - model_data["construction-age-band"].value_counts() - - model_data["mechanical-ventilation"] = np.where( - model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"] - ) - - # REVIEW THIS - # model_data["energy-tariff"] = np.where( - # model_data["energy-tariff"] == "", "Unknown", model_data["mechanical-ventilation"] - # ) - # - model_data["solar-water-heating-flag"] = np.where( - model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"] - ) - - model_data["glazed-type"] = np.where( - model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"] - ) - - model_data["glazed-area"] = np.where( - model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"] - ) - - # model_data["construction-age-band"] = np.where( - # model_data["construction-age-band"] == "", "NO DATA!", model_data["construction-age-band"] - # ) - - return model_data - - model_data = clean_missings(model_data) + model_data = self.clean_missings(model_data) # Convert transaction_type model_data = self._convert_transaction_type(model_data) @@ -292,6 +287,32 @@ class SapModel: train_x = train_x.drop(columns=to_drop) test_x = test_x[train_x.columns] + from sklearn.ensemble import RandomForestRegressor + from sklearn.inspection import permutation_importance + + rf = RandomForestRegressor(random_state=self.random_state) + rf.fit(train_x, self.train_y) + + # Print the name and importance of each feature + importance_df = [] + for feature, importance in zip(train_x.columns, rf.feature_importances_): + importance_df.append( + { + "Feature": feature, + "rf_importance": importance + } + ) + importance_df = pd.DataFrame(importance_df) + importance_df = importance_df.sort_values(by="rf_importance", ascending=False) + + perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error') + perm_importance_df = pd.DataFrame( + { + "Feature": test_x.columns, + "perm_importance": perm_importance.importances_mean + } + ).sort_values(by="perm_importance", ascending=False) + # make regression model model = sm.OLS(self.train_y, train_x) # fit model and print results