From c59aff412cf00ea77fda5734f12d522f2a3e6b73 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 4 Jul 2023 16:28:58 +0100
Subject: [PATCH] testing rf importance and permutation importance

---
 model_data/analysis/SapModel.py | 85 ++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 32 deletions(-)

diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py
index ca2d45b2..87b71cf6 100644
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -156,6 +156,32 @@ class SapModel:
 
         return model_data
 
+    @staticmethod
+    def clean_missings(model_data):
+        """Replace empty-string missings in selected columns with explicit values and return model_data.
+
+        energy-tariff and construction-age-band are deliberately left untouched: cleaning them hurt
+        prediction performance, suggesting an empty-string missing differs notably from "NO DATA!".
+        """
+        model_data["mechanical-ventilation"] = np.where(
+            model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
+        )
+
+        model_data["solar-water-heating-flag"] = np.where(
+            model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
+        )
+
+        model_data["glazed-type"] = np.where(
+            model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
+        )
+
+        # Keep existing glazed-area values; the else-branch previously copied glazed-type by mistake
+        model_data["glazed-area"] = np.where(
+            model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-area"]
+        )
+
+        return model_data
+
     def create_dataset(self):
         model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
         model_data = model_data.reset_index(drop=True)
@@ -164,38 +190,7 @@ class SapModel:
         # Append on u-values
         model_data = self._append_cleaned_data(model_data)
 
-        def clean_missings(model_data):
-            CLEANING_COLS = ["mechanical-ventilation", "energy-tariff", "solar-water-heating-flag", "glazed-type", ""]
-            model_data["construction-age-band"].value_counts()
-
-            model_data["mechanical-ventilation"] = np.where(
-                model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
-            )
-
-            # REVIEW THIS
-            # model_data["energy-tariff"] = np.where(
-            #     model_data["energy-tariff"] == "", "Unknown", model_data["mechanical-ventilation"]
-            # )
-            #
-            model_data["solar-water-heating-flag"] = np.where(
-                model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
-            )
-
-            model_data["glazed-type"] = np.where(
-                model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
-            )
-
-            model_data["glazed-area"] = np.where(
-                model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
-            )
-
-            # model_data["construction-age-band"] = np.where(
-            #     model_data["construction-age-band"] == "", "NO DATA!", model_data["construction-age-band"]
-            # )
-
-            return model_data
-
-        model_data = clean_missings(model_data)
+        model_data = self.clean_missings(model_data)
 
         # Convert transaction_type
         model_data = self._convert_transaction_type(model_data)
@@ -292,6 +287,32 @@ class SapModel:
         train_x = train_x.drop(columns=to_drop)
         test_x = test_x[train_x.columns]
 
+        from sklearn.ensemble import RandomForestRegressor
+        from sklearn.inspection import permutation_importance
+
+        rf = RandomForestRegressor(random_state=self.random_state)
+        rf.fit(train_x, self.train_y)
+
+        # Collect the random-forest importance of each feature into a DataFrame
+        importance_df = []
+        for feature, importance in zip(train_x.columns, rf.feature_importances_):
+            importance_df.append(
+                {
+                    "Feature": feature,
+                    "rf_importance": importance
+                }
+            )
+        importance_df = pd.DataFrame(importance_df)
+        importance_df = importance_df.sort_values(by="rf_importance", ascending=False)
+
+        perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error')
+        perm_importance_df = pd.DataFrame(
+            {
+                "Feature": test_x.columns,
+                "perm_importance": perm_importance.importances_mean
+            }
+        ).sort_values(by="perm_importance", ascending=False)
+
         # make regression model
         model = sm.OLS(self.train_y, train_x)
         # fit model and print results