Added final model predictions

2026-07-27 23:35:01 +00:00 · 2023-07-05 09:51:39 +01:00 · 2023-07-05 09:51:39 +01:00 · 308eb99afb
commit 308eb99afb
parent 710f446ebc
1 changed files with 74 additions and 22 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -103,6 +103,12 @@ class SapModel:
        'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
    }

+    BEST_FINAL = {
+        'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
+        'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
+        'Median Absolute Error': 1.9487883489495985
+    }
+
    BUCKET_VARIABLES = [
        "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
    ]
@@ -118,14 +124,25 @@ class SapModel:
        self.train_y = None
        self.test_x = None
        self.test_y = None
-        self.results = None
-        self.model_data = None
+
+        self.test_model = None
+        self.final_model = None
+
        self.fit_error = None
        self.predict_error = None
-        self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
+        self.final_error = None
+        self.worst = {
+            "fit_errors": pd.DataFrame(),
+            "prediction_errors": pd.DataFrame(),
+            "fit_x": pd.DataFrame(),
+            "prediction_x": pd.DataFrame(),
+            "final_errors": pd.DataFrame(),
+            "final_x": pd.DataFrame(),
+        }

        self.fit_df = None
        self.predict_df = None
+        self.final_fit_df = None
        self.diagnosis = {}

    def run(self, plot=False):
@@ -307,18 +324,31 @@ class SapModel:
            random_state=self.random_state
        )

-    def remove_zero_std_cols(self, threshold=1e-3):
+    @staticmethod
+    def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
+        """
+        Utility function to remove columns that have zero standard deviation from both test and train sets
+        :param train_x: Training data to remove columns from
+        :param test_x: If provided, remove the same columns from the test data
+        :param threshold: float value, if the standard deviation is below this threshold, the column is considered
+                             to have zero standard deviation
+        :return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
+        """
        # Compute standard deviations
-        std_devs = self.train_x.std()
+        std_devs = train_x.std()

        # Find columns with zero or near-zero standard deviation
        zero_std_cols = std_devs[std_devs <= threshold].index

        # Drop these columns from the training data
-        self.train_x = self.train_x.drop(zero_std_cols, axis=1)
+        train_x = train_x.drop(zero_std_cols, axis=1)

-        # Ensure the test data has the same columns
-        self.test_x = self.test_x[self.train_x.columns]
+        if test_x is not None:
+            # Ensure the test data has the same columns
+            test_x = test_x[train_x.columns]
+            return train_x, test_x
+
+        return train_x, None

    def fit_model(self):
        """
@@ -338,9 +368,7 @@ class SapModel:

        # Create the training and test sets for each run
        self.make_training_test(x)
-
-        self.remove_zero_std_cols()
-
+        self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
        self.detect_multi_collinearity()

        # Add a constant to the independent value
@@ -354,14 +382,10 @@ class SapModel:
        # make regression model
        model = sm.OLS(self.train_y, train_x)
        # fit model and print results
-        self.results = model.fit()
+        self.test_model = model.fit()

-        train_predictions = self.results.fittedvalues
-        test_predictions = self.results.predict(test_x)
-
-        diagnose = self.test_x.copy()
-        diagnose["predictions"] = test_predictions
-        diagnose["actual"] = self.test_y.values
+        train_predictions = self.test_model.fittedvalues
+        test_predictions = self.test_model.predict(test_x)

        self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
            y_true=self.train_y, y_pred=train_predictions
@@ -375,13 +399,14 @@ class SapModel:
        fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
        predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)

-        self.model_data['fit'] = self.results.fittedvalues
+        self.model_data['fit'] = self.test_model.fittedvalues
        # The worst errors over index heavily for flats
-        self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
+        self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
+        self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]

        self.fit_df = pd.DataFrame(
            {
-                "fit": self.results.fittedvalues,
+                "fit": train_predictions,
                "actual": self.train_y,
                "idx": train_idx
            }
@@ -398,9 +423,36 @@ class SapModel:
        self.diagnosis = {
            "fit_success": fit_success,
            "predict_success": predict_success,
-            "summary": self.results.summary()
+            "summary": self.test_model.summary()
        }

+        # We're now ready to fit the final model
+        # For the moment, the pre-processing at the top of this function merely removes columns, so we
+        # just need to remove the columns that were removed from the training data from the final model
+
+        x = sm.add_constant(x)
+        y = x[self.RESPONSE]
+        x = x[self.train_x.columns]
+        idx = x["idx"].copy()
+        x = x.drop(columns=["idx"])
+
+        final_model = sm.OLS(y, x)
+        # fit model and print results
+        self.final_model = final_model.fit()
+        final_predictions = self.final_model.fittedvalues
+
+        self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
+            y_true=y, y_pred=final_predictions
+        )
+
+        self.final_fit_df = pd.DataFrame(
+            {
+                "fit": final_predictions,
+                "actual": y,
+                "idx": idx
+            }
+        ).sort_values("actual", ascending=True)
+
    @staticmethod
    def check_successes(experiment_error, best_error):
        """