mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added final model predictions
This commit is contained in:
parent
710f446ebc
commit
308eb99afb
1 changed file with 74 additions and 22 deletions
|
|
@ -103,6 +103,12 @@ class SapModel:
|
|||
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
|
||||
}
|
||||
|
||||
BEST_FINAL = {
|
||||
'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
|
||||
'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
|
||||
'Median Absolute Error': 1.9487883489495985
|
||||
}
|
||||
|
||||
BUCKET_VARIABLES = [
|
||||
"number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
|
||||
]
|
||||
|
|
@ -118,14 +124,25 @@ class SapModel:
|
|||
self.train_y = None
|
||||
self.test_x = None
|
||||
self.test_y = None
|
||||
self.results = None
|
||||
self.model_data = None
|
||||
|
||||
self.test_model = None
|
||||
self.final_model = None
|
||||
|
||||
self.fit_error = None
|
||||
self.predict_error = None
|
||||
self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
|
||||
self.final_error = None
|
||||
self.worst = {
|
||||
"fit_errors": pd.DataFrame(),
|
||||
"prediction_errors": pd.DataFrame(),
|
||||
"fit_x": pd.DataFrame(),
|
||||
"prediction_x": pd.DataFrame(),
|
||||
"final_errors": pd.DataFrame(),
|
||||
"final_x": pd.DataFrame(),
|
||||
}
|
||||
|
||||
self.fit_df = None
|
||||
self.predict_df = None
|
||||
self.final_fit_df = None
|
||||
self.diagnosis = {}
|
||||
|
||||
def run(self, plot=False):
|
||||
|
|
@ -307,18 +324,31 @@ class SapModel:
|
|||
random_state=self.random_state
|
||||
)
|
||||
|
||||
def remove_zero_std_cols(self, threshold=1e-3):
|
||||
@staticmethod
|
||||
def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
|
||||
"""
|
||||
Utility function to remove columns that have zero standard deviation from both test and train sets
|
||||
:param train_x: Training data to remove columns from
|
||||
:param test_x: If provided, remove the same columns from the test data
|
||||
:param threshold: float value, if the standard deviation is below this threshold, the column is considered
|
||||
to have zero standard deviation
|
||||
:return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
|
||||
"""
|
||||
# Compute standard deviations
|
||||
std_devs = self.train_x.std()
|
||||
std_devs = train_x.std()
|
||||
|
||||
# Find columns with zero or near-zero standard deviation
|
||||
zero_std_cols = std_devs[std_devs <= threshold].index
|
||||
|
||||
# Drop these columns from the training data
|
||||
self.train_x = self.train_x.drop(zero_std_cols, axis=1)
|
||||
train_x = train_x.drop(zero_std_cols, axis=1)
|
||||
|
||||
# Ensure the test data has the same columns
|
||||
self.test_x = self.test_x[self.train_x.columns]
|
||||
if test_x is not None:
|
||||
# Ensure the test data has the same columns
|
||||
test_x = test_x[train_x.columns]
|
||||
return train_x, test_x
|
||||
|
||||
return train_x, None
|
||||
|
||||
def fit_model(self):
|
||||
"""
|
||||
|
|
@ -338,9 +368,7 @@ class SapModel:
|
|||
|
||||
# Create the training and test sets for each run
|
||||
self.make_training_test(x)
|
||||
|
||||
self.remove_zero_std_cols()
|
||||
|
||||
self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
|
||||
self.detect_multi_collinearity()
|
||||
|
||||
# Add a constant to the independent value
|
||||
|
|
@ -354,14 +382,10 @@ class SapModel:
|
|||
# make regression model
|
||||
model = sm.OLS(self.train_y, train_x)
|
||||
# fit model and print results
|
||||
self.results = model.fit()
|
||||
self.test_model = model.fit()
|
||||
|
||||
train_predictions = self.results.fittedvalues
|
||||
test_predictions = self.results.predict(test_x)
|
||||
|
||||
diagnose = self.test_x.copy()
|
||||
diagnose["predictions"] = test_predictions
|
||||
diagnose["actual"] = self.test_y.values
|
||||
train_predictions = self.test_model.fittedvalues
|
||||
test_predictions = self.test_model.predict(test_x)
|
||||
|
||||
self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
|
|
@ -375,13 +399,14 @@ class SapModel:
|
|||
fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
|
||||
predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
|
||||
|
||||
self.model_data['fit'] = self.results.fittedvalues
|
||||
self.model_data['fit'] = self.test_model.fittedvalues
|
||||
# The worst errors over-index heavily for flats
|
||||
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
||||
self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
|
||||
self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]
|
||||
|
||||
self.fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": self.results.fittedvalues,
|
||||
"fit": train_predictions,
|
||||
"actual": self.train_y,
|
||||
"idx": train_idx
|
||||
}
|
||||
|
|
@ -398,9 +423,36 @@ class SapModel:
|
|||
self.diagnosis = {
|
||||
"fit_success": fit_success,
|
||||
"predict_success": predict_success,
|
||||
"summary": self.results.summary()
|
||||
"summary": self.test_model.summary()
|
||||
}
|
||||
|
||||
# We're now ready to fit the final model
|
||||
# For the moment, the pre-processing at the top of this function merely removes columns, so we
|
||||
# just need to remove the columns that were removed from the training data from the final model
|
||||
|
||||
x = sm.add_constant(x)
|
||||
y = x[self.RESPONSE]
|
||||
x = x[self.train_x.columns]
|
||||
idx = x["idx"].copy()
|
||||
x = x.drop(columns=["idx"])
|
||||
|
||||
final_model = sm.OLS(y, x)
|
||||
# fit model and print results
|
||||
self.final_model = final_model.fit()
|
||||
final_predictions = self.final_model.fittedvalues
|
||||
|
||||
self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
|
||||
y_true=y, y_pred=final_predictions
|
||||
)
|
||||
|
||||
self.final_fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": final_predictions,
|
||||
"actual": y,
|
||||
"idx": idx
|
||||
}
|
||||
).sort_values("actual", ascending=True)
|
||||
|
||||
@staticmethod
|
||||
def check_successes(experiment_error, best_error):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue