additional cleanup

2026-07-27 23:35:01 +00:00 · 2023-07-05 09:53:45 +01:00 · 2023-07-05 09:53:45 +01:00 · a79c2852cc
commit a79c2852cc
parent 308eb99afb
1 changed file with 7 additions and 2 deletions
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@ -12,6 +12,9 @@ from sklearn.inspection import permutation_importance

 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from tqdm import tqdm
+from model_data.utils import setup_logger
+
+logger = setup_logger()

 with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)
@ -271,6 +274,7 @@ class SapModel:
        return model_data

    def create_dataset(self):
+        logger.info("Creating modelling dataset")
        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
        model_data = model_data.reset_index(drop=True)
        model_data["idx"] = model_data.index.copy()
@ -353,7 +357,6 @@ class SapModel:
    def fit_model(self):
        """
        Main function to fit the model and produce accuracy metrics
-        :return:
        """

        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
@ -369,6 +372,7 @@ class SapModel:
        # Create the training and test sets for each run
        self.make_training_test(x)
        self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
+        logger.info("Detecting multi-collinearity in training dataset")
        self.detect_multi_collinearity()

        # Add a constant to the independent value
@ -379,6 +383,7 @@ class SapModel:
        train_x = train_x.drop(columns=["idx"])
        test_x = test_x.drop(columns=["idx"])

+        logger.info("Fitting testing model")
        # make regression model
        model = sm.OLS(self.train_y, train_x)
        # fit model and print results
@ -429,7 +434,7 @@ class SapModel:
        # We're now ready to fit the final model
        # For the moment, the pre-processing at the top of this function merely removes columns, so we
        # just need to remove the columns that were removed from the training data from the final model
-
+        logger.info("Fitting final model")
        x = sm.add_constant(x)
        y = x[self.RESPONSE]
        x = x[self.train_x.columns]