Fix bug in cleaning code: empty-string replacement filled every column from "photo-supply" instead of the column being cleaned

2026-07-27 23:35:01 +00:00 · 2023-07-04 16:35:08 +01:00 · 2023-07-04 16:35:08 +01:00 · 68e903e492
commit 68e903e492
parent c59aff412c
1 changed files with 29 additions and 8 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -9,7 +9,6 @@ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, e
    median_absolute_error, mean_absolute_percentage_error
 from sklearn.linear_model import Lasso
 from sklearn.preprocessing import StandardScaler
-from sklearn.linear_model import LinearRegression
 import xgboost as xgb

 with open("all_data.pkl", "rb") as f:
@ -151,7 +150,7 @@ class SapModel:

        for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
            model_data[col] = np.where(
-                model_data[col] == "", "0", model_data["photo-supply"]
+                model_data[col] == "", "0", model_data[col]
            ).astype(float)

        return model_data
@ -343,13 +342,13 @@ class SapModel:
        ).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")

        # temp hardcoded values
-        best_fit = {'MAPE': 0.042768242654695386, 'Mean Squared Error': 21.606875710236896,
-                    'Mean Absolute Error': 3.293776606279645, 'R2 Score': 0.7930242722318233,
-                    'Explained Variance Score': 0.7930242722318233, 'Median Absolute Error': 2.47686604239054}
+        best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226,
+                    'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128,
+                    'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914}

-        best_predict = {'MAPE': 0.04397538047202114, 'Mean Squared Error': 22.582856696398935,
-                        'Mean Absolute Error': 3.384549163877968, 'R2 Score': 0.7515887251149801,
-                        'Explained Variance Score': 0.7516508219403573, 'Median Absolute Error': 2.4624472128668344}
+        best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142,
+                        'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001,
+                        'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855}

        def check_successes(experiment_error, best_error):

@ -641,3 +640,25 @@ self = SapModel(
    data=all_data["data"],
    cleaner=all_data["cleaner"]
 )
+
+from sklearn.ensemble import RandomForestRegressor
+
+rf = RandomForestRegressor(random_state=self.random_state)
+X = self.df.drop(columns=self.RESPONSE)
+for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
+    X[col] = np.where(  # empty strings become "0" so the column can be cast to float
+        X[col] == "", "0", X[col]
+    ).astype(float)
+
+Y = self.df[self.RESPONSE]
+rf.fit(X, Y)
+
+# Collect the name and importance of each feature, pairing importances with the columns of X that rf was fitted on
+importance_df = []
+for feature, importance in zip(X.columns, rf.feature_importances_):
+    importance_df.append(
+        {
+            "Feature": feature,
+            "rf_importance": importance
+        }
+    )