diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index 8b0013c3..f99cf583 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -55,7 +55,6 @@ class SapModel: "number-habitable-rooms", "constituency", "number-heated-rooms", - "lighting-description", "mainheat-description", "hotwater-description", "main-fuel", @@ -67,6 +66,8 @@ class SapModel: "glazed-type", "glazed-area", "construction-age-band", + # Testing + "lighting-description" ] def __init__(self, data, cleaner, test_size=0.2, random_state=None): @@ -83,7 +84,8 @@ class SapModel: self.results = None self.model_data = None self.fit_error = None - self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()} + self.predict_error = None + self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()} self.fit_df = None def run(self, plot=False): @@ -173,7 +175,9 @@ class SapModel: ~pd.isnull(model_data["roof_u_value"]) ] - exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"] + exclude_features = [ + "walls-description", "floor-description", "roof-description", "transaction-type" + ] features = [ x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [ @@ -200,6 +204,19 @@ class SapModel: random_state=self.random_state ) + def remove_zero_std_cols(self, threshold=1e-3): + # Compute standard deviations + std_devs = self.train_x.std() + + # Find columns with zero or near-zero standard deviation + zero_std_cols = std_devs[std_devs <= threshold].index + + # Drop these columns from the training data + self.train_x = self.train_x.drop(zero_std_cols, axis=1) + + # Ensure the test data has the same columns + self.test_x = self.test_x[self.train_x.columns] + def fit_model(self): # Dummy out the categorical variables @@ -216,6 +233,8 @@ class SapModel: # Create the training and test sets for each run self.make_training_test(x) + self.remove_zero_std_cols() + # Add a constant to the independent value train_x = sm.add_constant(self.train_x) @@ -225,10 +244,52 @@ class SapModel: # fit model and print results self.results = model.fit() - self.fit_error, self.worst["errors"] = self.calculate_regression_metrics( + self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics( y_true=self.train_y, y_pred=self.results.fittedvalues ) + # Predict on new data + predictions = self.results.predict(sm.add_constant(self.test_x)) + self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics( + y_true=self.test_y, y_pred=predictions + ) + + # temp hardcoded values + best_fit = {'MAPE': 0.04138090547359925, 'Mean Squared Error': 20.14558392249143, + 'Mean Absolute Error': 3.2071693100226386, 'R2 Score': 0.8070222206305815, + 'Explained Variance Score': 0.8070222206305815, 'Median Absolute Error': 2.418797962633903} + + best_predict = {'MAPE': 0.04477710915141379, 'Mean Squared Error': 24.121330207821273, + 'Mean Absolute Error': 3.443075571126256, 'R2 Score': 0.7346655266247644, + 'Explained Variance Score': 0.7346701958813864, 'Median Absolute Error': 2.5234727208706076} + + def check_successes(experiment_error, best_error): + + successes = [] + for k in experiment_error: + if k == "Explained Variance Score": + # We want to maximise this so we want experiment error to be higher + successes.append( + { + "measure": k, + "success": experiment_error[k] >= best_error[k], + "difference": abs(experiment_error[k] - best_error[k]) + } + ) + continue + successes.append( + { + "measure": k, + "success": experiment_error[k] <= best_error[k], + "difference": abs(experiment_error[k] - best_error[k]) + } + ) + + return pd.DataFrame(successes) + + check_successes(self.fit_error, best_fit) + check_successes(self.predict_error, best_predict) + self.model_data['fit'] = self.results.fittedvalues # The worst errors over index heavily for flats self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)] @@ -240,6 +301,17 @@ class SapModel: } ).sort_values("actual", ascending=True) + def detect_multi_collinearity(self): + from statsmodels.stats.outliers_influence import variance_inflation_factor + from tqdm import tqdm + # Get the VIFs for each variable + vifs = pd.DataFrame() + vifs["features"] = self.train_x.columns + vifs["vif"] = [variance_inflation_factor(self.train_x.values, i) for i in tqdm(range(self.train_x.shape[1]))] + + # Get the features with the highest VIF + vifs = vifs.sort_values("vif", ascending=False) + @staticmethod def plot_regression(df): # Extract the "fit" and "actual" columns from the dataframe @@ -284,8 +356,6 @@ class SapModel: metrics['R2 Score'] = r2_score(y_true, y_pred) metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred) metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred) - metrics['Mean True Value'] = y_true.mean() - metrics['Mean Predicted Value'] = y_pred.mean() errors = pd.DataFrame() errors['Fit'] = y_true