Added final model predictions

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-05 09:51:39 +01:00
parent 710f446ebc
commit 308eb99afb

View file

@ -103,6 +103,12 @@ class SapModel:
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
}
# Regression metrics recorded for the final model.
# NOTE(review): presumably used as a benchmark in the same way BEST_FIT / BEST_PREDICT are passed to
# check_successes - confirm against the caller.
BEST_FINAL = {
    "MAPE": 0.04841470773386795,
    "Mean Squared Error": 21.323052316630914,
    "Mean Absolute Error": 2.988547998636157,
    "R2 Score": 0.7633662459299112,
    "Explained Variance Score": 0.7633785339028832,
    "Median Absolute Error": 1.9487883489495985,
}

# Column names grouped under the "bucket" label.
# NOTE(review): how these columns are bucketed is not visible here - confirm in the pre-processing code.
BUCKET_VARIABLES = [
    "number-open-fireplaces",
    "fixed-lighting-outlets-count",
    "extension-count",
    "multi-glaze-proportion",
]
@ -118,14 +124,25 @@ class SapModel:
self.train_y = None
self.test_x = None
self.test_y = None
self.results = None
self.model_data = None
self.test_model = None
self.final_model = None
self.fit_error = None
self.predict_error = None
self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
self.final_error = None
self.worst = {
"fit_errors": pd.DataFrame(),
"prediction_errors": pd.DataFrame(),
"fit_x": pd.DataFrame(),
"prediction_x": pd.DataFrame(),
"final_errors": pd.DataFrame(),
"final_x": pd.DataFrame(),
}
self.fit_df = None
self.predict_df = None
self.final_fit_df = None
self.diagnosis = {}
def run(self, plot=False):
@ -307,18 +324,31 @@ class SapModel:
random_state=self.random_state
)
@staticmethod
def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
    """
    Utility function to remove columns that have zero (or near-zero) standard deviation from the training
    data and, when test data is supplied, to restrict the test data to the surviving columns.

    The inputs are not modified; new frames are returned.

    :param train_x: Training data (pandas DataFrame) to remove columns from
    :param test_x: If provided, it is reduced to exactly the columns kept in train_x
    :param threshold: float value, if the standard deviation is at or below this threshold, the column is
        considered to have zero standard deviation
    :return: Tuple of (train_x, test_x). If test_x is not provided, the second element is None
    """
    # Compute the per-column standard deviations of the training data
    std_devs = train_x.std()

    # Find columns with zero or near-zero standard deviation
    zero_std_cols = std_devs[std_devs <= threshold].index

    # Drop these columns from the training data (drop returns a new frame)
    train_x = train_x.drop(zero_std_cols, axis=1)

    if test_x is not None:
        # Ensure the test data has the same columns, in the same order, as the training data
        test_x = test_x[train_x.columns]

    # test_x is still None when the caller did not pass one, so a single return covers both cases
    return train_x, test_x
def fit_model(self):
"""
@ -338,9 +368,7 @@ class SapModel:
# Create the training and test sets for each run
self.make_training_test(x)
self.remove_zero_std_cols()
self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
self.detect_multi_collinearity()
# Add a constant to the independent value
@ -354,14 +382,10 @@ class SapModel:
# make regression model
model = sm.OLS(self.train_y, train_x)
# fit model and print results
self.results = model.fit()
self.test_model = model.fit()
train_predictions = self.results.fittedvalues
test_predictions = self.results.predict(test_x)
diagnose = self.test_x.copy()
diagnose["predictions"] = test_predictions
diagnose["actual"] = self.test_y.values
train_predictions = self.test_model.fittedvalues
test_predictions = self.test_model.predict(test_x)
self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
y_true=self.train_y, y_pred=train_predictions
@ -375,13 +399,14 @@ class SapModel:
fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
self.model_data['fit'] = self.results.fittedvalues
self.model_data['fit'] = self.test_model.fittedvalues
# The worst errors over-index heavily for flats
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]
self.fit_df = pd.DataFrame(
{
"fit": self.results.fittedvalues,
"fit": train_predictions,
"actual": self.train_y,
"idx": train_idx
}
@ -398,9 +423,36 @@ class SapModel:
self.diagnosis = {
"fit_success": fit_success,
"predict_success": predict_success,
"summary": self.results.summary()
"summary": self.test_model.summary()
}
# We're now ready to fit the final model
# For the moment, the pre-processing at the top of this function merely removes columns, so we
# just need to remove the columns that were removed from the training data from the final model
x = sm.add_constant(x)
y = x[self.RESPONSE]
x = x[self.train_x.columns]
idx = x["idx"].copy()
x = x.drop(columns=["idx"])
final_model = sm.OLS(y, x)
# fit model and print results
self.final_model = final_model.fit()
final_predictions = self.final_model.fittedvalues
self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
y_true=y, y_pred=final_predictions
)
self.final_fit_df = pd.DataFrame(
{
"fit": final_predictions,
"actual": y,
"idx": idx
}
).sort_values("actual", ascending=True)
@staticmethod
def check_successes(experiment_error, best_error):
"""