additional cleanup

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-05 09:53:45 +01:00
parent 308eb99afb
commit a79c2852cc

View file

@ -12,6 +12,9 @@ from sklearn.inspection import permutation_importance
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
from model_data.utils import setup_logger
logger = setup_logger()
with open("all_data.pkl", "rb") as f:
all_data = pickle.load(f)
@ -271,6 +274,7 @@ class SapModel:
return model_data
def create_dataset(self):
logger.info("Creating modelling dataset")
model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
model_data = model_data.reset_index(drop=True)
model_data["idx"] = model_data.index.copy()
@ -353,7 +357,6 @@ class SapModel:
def fit_model(self):
"""
Main function to fit the model and produce accuracy metrics
:return:
"""
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
@ -369,6 +372,7 @@ class SapModel:
# Create the training and test sets for each run
self.make_training_test(x)
self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
logger.info("Detecting multi-collinearity in training dataset")
self.detect_multi_collinearity()
# Add a constant to the independent value
@ -379,6 +383,7 @@ class SapModel:
train_x = train_x.drop(columns=["idx"])
test_x = test_x.drop(columns=["idx"])
logger.info("Fitting testing model")
# make regression model
model = sm.OLS(self.train_y, train_x)
# fit model and print results
@ -429,7 +434,7 @@ class SapModel:
# We're now ready to fit the final model
# For the moment, the pre-processing at the top of this function merely removes columns, so we
# just need to remove the columns that were removed from the training data from the final model
logger.info("Fitting final model")
x = sm.add_constant(x)
y = x[self.RESPONSE]
x = x[self.train_x.columns]