testing rf importance and permutation importance

2026-07-27 23:35:01 +00:00 · 2023-07-04 16:28:58 +01:00 · 2023-07-04 16:28:58 +01:00 · c59aff412c
commit c59aff412c
parent 58edd9a255
1 changed files with 53 additions and 32 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -156,6 +156,32 @@ class SapModel:

        return model_data

+    @staticmethod
+    def clean_missings(model_data):
+        """Replace empty-string missings in selected columns with an explicit placeholder value.

+        energy-tariff and construction-age-band are deliberately left untouched: cleaning them hurt
+        prediction performance, which suggests a "" missing and a "NO DATA!" missing carry different
+        information and are worth keeping distinct.

+        :param model_data: DataFrame holding every column named in ``fill_values``; its columns are
+            reassigned on the object passed in.
+        :return: the same DataFrame, with "" replaced column by column.
+        """

+        # One table drives every replacement so each column is refilled from its own values
+        # ("glazed-area" was previously overwritten with the contents of "glazed-type").
+        fill_values = {
+            "mechanical-ventilation": "NO DATA!",
+            "solar-water-heating-flag": "N",
+            "glazed-type": "NO DATA!",
+            "glazed-area": "NO DATA!",
+        }
+        for column, fill_value in fill_values.items():
+            model_data[column] = np.where(model_data[column] == "", fill_value, model_data[column])

+        return model_data
+
    def create_dataset(self):
        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
        model_data = model_data.reset_index(drop=True)
@ -164,38 +190,7 @@ class SapModel:
        # Append on u-values
        model_data = self._append_cleaned_data(model_data)

-        def clean_missings(model_data):
-            CLEANING_COLS = ["mechanical-ventilation", "energy-tariff", "solar-water-heating-flag", "glazed-type", ""]
-            model_data["construction-age-band"].value_counts()
-
-            model_data["mechanical-ventilation"] = np.where(
-                model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
-            )
-
-            # REVIEW THIS
-            # model_data["energy-tariff"] = np.where(
-            #     model_data["energy-tariff"] == "", "Unknown", model_data["mechanical-ventilation"]
-            # )
-            #
-            model_data["solar-water-heating-flag"] = np.where(
-                model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
-            )
-
-            model_data["glazed-type"] = np.where(
-                model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
-            )
-
-            model_data["glazed-area"] = np.where(
-                model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
-            )
-
-            # model_data["construction-age-band"] = np.where(
-            #     model_data["construction-age-band"] == "", "NO DATA!", model_data["construction-age-band"]
-            # )
-
-            return model_data
-
-        model_data = clean_missings(model_data)
+        model_data = self.clean_missings(model_data)

        # Convert transaction_type
        model_data = self._convert_transaction_type(model_data)
@ -292,6 +287,32 @@ class SapModel:
        train_x = train_x.drop(columns=to_drop)
        test_x = test_x[train_x.columns]

+        from sklearn.ensemble import RandomForestRegressor
+        from sklearn.inspection import permutation_importance
+
+        rf = RandomForestRegressor(random_state=self.random_state)
+        rf.fit(train_x, self.train_y)
+
+        # Print the name and importance of each feature
+        importance_df = []
+        for feature, importance in zip(train_x.columns, rf.feature_importances_):
+            importance_df.append(
+                {
+                    "Feature": feature,
+                    "rf_importance": importance
+                }
+            )
+        importance_df = pd.DataFrame(importance_df)
+        importance_df = importance_df.sort_values(by="rf_importance", ascending=False)
+
+        perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error')
+        perm_importance_df = pd.DataFrame(
+            {
+                "Feature": test_x.columns,
+                "perm_importance": perm_importance.importances_mean
+            }
+        ).sort_values(by="perm_importance", ascending=False)
+
        # make regression model
        model = sm.OLS(self.train_y, train_x)
        # fit model and print results