diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index 287b8495..0336125a 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -103,6 +103,12 @@ class SapModel: 'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828 } + BEST_FINAL = { + 'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157, + 'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832, + 'Median Absolute Error': 1.9487883489495985 + } + BUCKET_VARIABLES = [ "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion' ] @@ -118,14 +124,25 @@ class SapModel: self.train_y = None self.test_x = None self.test_y = None - self.results = None - self.model_data = None + + self.test_model = None + self.final_model = None + self.fit_error = None self.predict_error = None - self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()} + self.final_error = None + self.worst = { + "fit_errors": pd.DataFrame(), + "prediction_errors": pd.DataFrame(), + "fit_x": pd.DataFrame(), + "prediction_x": pd.DataFrame(), + "final_errors": pd.DataFrame(), + "final_x": pd.DataFrame(), + } self.fit_df = None self.predict_df = None + self.final_fit_df = None self.diagnosis = {} def run(self, plot=False): @@ -307,18 +324,31 @@ class SapModel: random_state=self.random_state ) - def remove_zero_std_cols(self, threshold=1e-3): + @staticmethod + def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3): + """ + Utility function to remove columns that have zero standard deviation from both test and train sets + :param train_x: Training data to remove columns from + :param test_x: If provided, remove the same columns from the test data + :param threshold: float value, if the standard deviation is below this threshold, the column is considered + to have zero standard deviation + 
:return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned + """ # Compute standard deviations - std_devs = self.train_x.std() + std_devs = train_x.std() # Find columns with zero or near-zero standard deviation zero_std_cols = std_devs[std_devs <= threshold].index # Drop these columns from the training data - self.train_x = self.train_x.drop(zero_std_cols, axis=1) + train_x = train_x.drop(zero_std_cols, axis=1) - # Ensure the test data has the same columns - self.test_x = self.test_x[self.train_x.columns] + if test_x is not None: + # Ensure the test data has the same columns + test_x = test_x[train_x.columns] + return train_x, test_x + + return train_x, None def fit_model(self): """ @@ -338,9 +368,7 @@ class SapModel: # Create the training and test sets for each run self.make_training_test(x) - - self.remove_zero_std_cols() - + self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x) self.detect_multi_collinearity() # Add a constant to the independent value @@ -354,14 +382,10 @@ class SapModel: # make regression model model = sm.OLS(self.train_y, train_x) # fit model and print results - self.results = model.fit() + self.test_model = model.fit() - train_predictions = self.results.fittedvalues - test_predictions = self.results.predict(test_x) - - diagnose = self.test_x.copy() - diagnose["predictions"] = test_predictions - diagnose["actual"] = self.test_y.values + train_predictions = self.test_model.fittedvalues + test_predictions = self.test_model.predict(test_x) self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics( y_true=self.train_y, y_pred=train_predictions @@ -375,13 +399,14 @@ class SapModel: fit_success = self.check_successes(self.fit_error, self.BEST_FIT) predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT) - self.model_data['fit'] = self.results.fittedvalues + self.model_data['fit'] = self.test_model.fittedvalues # The worst errors over 
index heavily for flats - self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)] + self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)] + self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)] self.fit_df = pd.DataFrame( { - "fit": self.results.fittedvalues, + "fit": train_predictions, "actual": self.train_y, "idx": train_idx } @@ -398,9 +423,36 @@ class SapModel: self.diagnosis = { "fit_success": fit_success, "predict_success": predict_success, - "summary": self.results.summary() + "summary": self.test_model.summary() } + # We're now ready to fit the final model + # For the moment, the pre-processing at the top of this function merely removes columns, so we + # just need to remove the columns that were removed from the training data from the final model + + x = sm.add_constant(x) + y = x[self.RESPONSE] + x = x[self.train_x.columns] + idx = x["idx"].copy() + x = x.drop(columns=["idx"]) + + final_model = sm.OLS(y, x) + # fit model and print results + self.final_model = final_model.fit() + final_predictions = self.final_model.fittedvalues + + self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics( + y_true=y, y_pred=final_predictions + ) + + self.final_fit_df = pd.DataFrame( + { + "fit": final_predictions, + "actual": y, + "idx": idx + } + ).sort_values("actual", ascending=True) + @staticmethod def check_successes(experiment_error, best_error): """