From 710f446ebc3f829b4059d57c169823ad19007824 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 5 Jul 2023 09:34:14 +0100 Subject: [PATCH] Cleaning up modelling code --- model_data/analysis/SapModel.py | 460 +++++++++++--------------------- 1 file changed, 149 insertions(+), 311 deletions(-) diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index 3f3d07eb..287b8495 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -7,9 +7,8 @@ from typing import Any, Dict, Tuple from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ median_absolute_error, mean_absolute_percentage_error -from sklearn.linear_model import Lasso -from sklearn.preprocessing import StandardScaler -import xgboost as xgb +from sklearn.ensemble import RandomForestRegressor +from sklearn.inspection import permutation_importance from statsmodels.stats.outliers_influence import variance_inflation_factor from tqdm import tqdm @@ -91,6 +90,23 @@ class SapModel: "extension-count", ] + # For the moment, we store records of the best performing models as a benchmark for future imporvements + BEST_FIT = { + 'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763, + 'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118, + 'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197 + } + + BEST_PREDICT = { + 'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514, + 'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312, + 'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828 + } + + BUCKET_VARIABLES = [ + "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion' + ] + def __init__(self, data, cleaner, test_size=0.2, random_state=None): self.df = pd.DataFrame(data) self.cleaner = cleaner @@ -107,7 +123,10 @@ class SapModel: self.fit_error = None self.predict_error = None self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()} + self.fit_df = None + self.predict_df = None + self.diagnosis = {} def run(self, plot=False): """ @@ -163,83 +182,49 @@ class SapModel: model_data = model_data.drop(columns=["transaction-type"]) return model_data + @staticmethod + def bucket_and_fill(df, column_name, n_bins=10): + """ + Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD" + :param df: Dataframe of features to be binned + :param column_name: Name of the column to be binned + :param n_bins: Number of bins to use + :return: Dataframe with binned column + """ + # Check if the column is numerical + if np.issubdtype(df[column_name].dtype, np.number): + # Create a new categorical column from numerical one by binning the data + df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str) + # Replace missing data with "NO_RECORD" + df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD") + df[column_name + "_bucket"] = np.where( + df[column_name + "_bucket"] == "nan", + "NO_RECORD", + df[column_name + "_bucket"] + ) + return df + def _clean_numericals(self, model_data): # Try binning numericals - def bucket_and_fill(df, column_name, n_bins=10): - # Check if the column is numerical - if np.issubdtype(df[column_name].dtype, np.number): - # Create a new categorical column from numerical one by binning the data - df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str) - # Replace missing data with "NO_RECORD" - df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD") - df[column_name + "_bucket"] = np.where( - df[column_name + "_bucket"] == "nan", - "NO_RECORD", - df[column_name + "_bucket"] - ) - return df + remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES] - bucket_variables = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', - 'multi-glaze-proportion'] - remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables] - - for col in bucket_variables: + for col in self.BUCKET_VARIABLES: model_data[col] = pd.to_numeric(model_data[col], errors='coerce') - model_data = bucket_and_fill(model_data, col) + model_data = self.bucket_and_fill(model_data, col) # Replace the data with the binned version - model_data = model_data.drop(columns=bucket_variables) + model_data = model_data.drop(columns=self.BUCKET_VARIABLES) model_data = model_data.rename( - columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables)) + columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES)) ) + # Basic fill the rest of the columns with 0 - currenrtly this provided the best performance for col in remaining_numericals: model_data[col] = np.where( model_data[col] == "", "0", model_data[col] ).astype(float) - # Basic fill the rest of the columns with 0 - - # grouping_variables = ["constituency", "built-form", "property-type"] - # secondary_grouping_variables = ["constituency", "property-type"] - # - # def append_missings(model_data, col, grouping_variables, secondary=False): - # if not secondary: - # numerical_data = model_data[(model_data[col] != "")].copy() - # else: - # numerical_data = model_data[~pd.isnull(model_data[col])].copy() - # - # numerical_data[col] = numerical_data[col].astype(float) - # numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index() - # numerical_data = numerical_data.rename(columns={col: f"{col}_avg"}) - # model_data = model_data.merge(numerical_data, how="left", on=grouping_variables) - # - # if not secondary: - # model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col]) - # else: - # model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col]) - # - # model_data = model_data.drop(columns=[f"{col}_avg"]) - # model_data[col] = model_data[col].astype(float) - # - # return model_data - # - # for col in numerical_cols: - # model_data[col] = pd.to_numeric(model_data[col], errors='coerce') - # model_data[col] = model_data[col].fillna(model_data[col].mean()) - # model_data = append_missings(model_data, col, grouping_variables) - # - # if pd.isnull(model_data[col]).sum(): - # model_data = append_missings(model_data, col, secondary_grouping_variables, True) - # - # if pd.isnull(model_data[col]).sum(): - # raise Exception("Cleaning failed, missing values remain") - # - # model_data[col] = np.where( - # model_data[col] == "", "0", model_data[col] - # ).astype(float) - return model_data @staticmethod @@ -336,11 +321,12 @@ class SapModel: self.test_x = self.test_x[self.train_x.columns] def fit_model(self): + """ + Main function to fit the model and produce accuracy metrics + :return: + """ - # Dummy out the categorical variables - binned = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'] - - x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True) + x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True) # Convert booleans to integer for col in x.columns: @@ -361,36 +347,10 @@ class SapModel: train_x = sm.add_constant(self.train_x) test_x = sm.add_constant(self.test_x) train_idx = train_x["idx"].copy() - test_ids = self.test_x["idx"].copy() + test_idx = self.test_x["idx"].copy() train_x = train_x.drop(columns=["idx"]) test_x = test_x.drop(columns=["idx"]) - from sklearn.ensemble import RandomForestRegressor - from sklearn.inspection import permutation_importance - - rf = RandomForestRegressor(random_state=self.random_state) - rf.fit(train_x, self.train_y) - - # Print the name and importance of each feature - importance_df = [] - for feature, importance in zip(train_x.columns, rf.feature_importances_): - importance_df.append( - { - "Feature": feature, - "rf_importance": importance - } - ) - importance_df = pd.DataFrame(importance_df) - importance_df = importance_df.sort_values(by="rf_importance", ascending=False) - - perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error') - perm_importance_df = pd.DataFrame( - { - "Feature": test_x.columns, - "perm_importance": perm_importance.importances_mean - } - ).sort_values(by="perm_importance", ascending=False) - # make regression model model = sm.OLS(self.train_y, train_x) # fit model and print results @@ -412,157 +372,116 @@ class SapModel: y_true=self.test_y, y_pred=test_predictions ) - fit_df = pd.DataFrame( - { - "fit": self.results.fittedvalues, - "actual": self.train_y, - "idx": train_idx - } - ).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx") + fit_success = self.check_successes(self.fit_error, self.BEST_FIT) + predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT) - # temp hardcoded values - best_fit = {'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763, - 'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118, - 'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197} - - best_predict = {'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514, - 'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312, - 'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828} - - def check_successes(experiment_error, best_error): - - successes = [] - for k in experiment_error: - if k in ["Explained Variance Score", "R2 Score"]: - # We want to maximise this so we want experiment error to be higher - successes.append( - { - "measure": k, - "success": experiment_error[k] >= best_error[k], - "difference": abs(experiment_error[k] - best_error[k]) - } - ) - continue - successes.append( - { - "measure": k, - "success": experiment_error[k] <= best_error[k], - "difference": abs(experiment_error[k] - best_error[k]) - } - ) - - return pd.DataFrame(successes) - - fit_success = check_successes(self.fit_error, best_fit) - predict_success = check_successes(self.predict_error, best_predict) - - print(self.results.summary()) self.model_data['fit'] = self.results.fittedvalues - # The worst errors over index heavily for flats self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)] self.fit_df = pd.DataFrame( { "fit": self.results.fittedvalues, - "actual": self.train_y + "actual": self.train_y, + "idx": train_idx } ).sort_values("actual", ascending=True) - # TODO: Testing - - # Create a StandardScaler instance - scaler = StandardScaler() - - # Fit the scaler to the training data and transform it - train_x_scaled = scaler.fit_transform(train_x) - - # Transform the test data - test_x_scaled = scaler.transform(test_x) - - # Define the model - lasso_reg = Lasso( - alpha=0.1) # you can change the alpha parameter to adjust the strength of the regularization. - - # Fit the model - lasso_reg.fit(train_x_scaled, self.train_y) - - # Make predictions on the training set - train_predictions = lasso_reg.predict(train_x_scaled) - - # Make predictions on the test set - test_predictions = lasso_reg.predict(test_x_scaled) - - # Calculate metrics based on these predictions. - lasso_fit_error, _ = self.calculate_regression_metrics( - y_true=self.train_y, y_pred=train_predictions - ) - - # Predict on new data - lasso_predict_error, _ = self.calculate_regression_metrics( - y_true=self.test_y, y_pred=test_predictions - ) - - lasso_fit_success = check_successes(lasso_fit_error, best_fit) - lasso_predict_success = check_successes(lasso_predict_error, best_predict) - - # TODO: TESTING 2 - - # Create a StandardScaler instance - scaler = StandardScaler() - - # Fit the scaler to the training data and transform it - train_x_scaled = scaler.fit_transform(train_x) - - # Transform the test data - test_x_scaled = scaler.transform(test_x) - - # Define the model - alphas = np.logspace(-4, 2, 100) # Range of alpha values to search - lasso_reg = LassoCV(cv=10, alphas=alphas) - - # Fit the model - lasso_reg.fit(train_x_scaled, self.train_y) - - # Make predictions on the training set - train_predictions = lasso_reg.predict(train_x_scaled) - - # Make predictions on the test set - test_predictions = lasso_reg.predict(test_x_scaled) - - # Calculate metrics based on these predictions. - lasso_fit_error, lasso_worst_fit_errors = self.calculate_regression_metrics( - y_true=self.train_y, y_pred=train_predictions - ) - - # Predict on new data - lasso_predict_error, lasso_worst_predict_errors = self.calculate_regression_metrics( - y_true=self.test_y, y_pred=test_predictions - ) - - lasso_fit_success = check_successes(lasso_fit_error, best_fit) - lasso_predict_success = check_successes(lasso_predict_error, best_predict) - - fit_df = pd.DataFrame( + self.predict_df = pd.DataFrame( { - "fit": train_predictions, - "actual": self.train_y, - "residual": abs(self.train_y - train_predictions), - "idx": train_idx.values + "predictions": test_predictions, + "actual": self.test_y, + "idx": test_idx } ) - fit_df = fit_df.sort_values("residual", ascending=False) - fit_df = fit_df.merge(self.model_data, on="idx") - zz = fit_df[fit_df["lighting-description"] == "Low energy lighting in all fixed outlets"] + self.diagnosis = { + "fit_success": fit_success, + "predict_success": predict_success, + "summary": self.results.summary() + } - z = fit_df.head(100).groupby("lighting-description", observed=True)["residual"].agg( - ['mean', 'count']).reset_index() - z = z.sort_values("mean", ascending=False) + @staticmethod + def check_successes(experiment_error, best_error): + """ + Simple function to check if the experiment error is better than the best error + :param experiment_error: output of calculate_regression_metrics() on the experiment + :param best_error: Current benchmark best error + :return: + """ - worst_x = self.model_data[self.model_data.index.isin(lasso_worst_fit_errors.index)] - worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True) - worst_x = worst_x.sort_values("Absolute Residual", ascending=False) + successes = [] + for k in experiment_error: + if k in ["Explained Variance Score", "R2 Score"]: + # We want to maximise this so we want experiment error to be higher + successes.append( + { + "measure": k, + "success": experiment_error[k] >= best_error[k], + "difference": abs(experiment_error[k] - best_error[k]) + } + ) + continue + successes.append( + { + "measure": k, + "success": experiment_error[k] <= best_error[k], + "difference": abs(experiment_error[k] - best_error[k]) + } + ) + + return pd.DataFrame(successes) + + def rf_importance(self, train_x, train_y, test_x, test_y): + """ + Utility function to estimate feature importance using a random forest + This is useful to get a sense of some of the key features which are driving model + performance + + :param train_x: Training data covariates to build the importance model on + :param train_y: Training data response to build the importance model on + :param test_x: Test data covariates to build the permutation importance model on + :param test_y: Test data response to build the permutation importance model on + :return: Pandas dataframe of feature importances, ranked by most important to least + """ + + rf = RandomForestRegressor(random_state=self.random_state) + rf.fit(train_x, train_y) + + # Print the name and importance of each feature + rf_importance_df = [] + for feature, importance in zip(train_x.columns, rf.feature_importances_): + rf_importance_df.append( + { + "Feature": feature, + "rf_importance": importance + } + ) + rf_importance_df = pd.DataFrame(rf_importance_df) + rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False) + + perm_importance = self.permuation_importance(rf, test_x, test_y) + + return rf_importance_df, perm_importance + + @staticmethod + def permuation_importance(rf, test_x, test_y): + """ + Simple utility function to produce permutation importance for a given model\ + :param rf: Random forest model to calculate permutation importance for + :param test_x: Test covariates to be used for permutation importance + :param test_y: Test response to be used for permutation importance + :return: + """ + perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error') + perm_importance_df = pd.DataFrame( + { + "Feature": test_x.columns, + "perm_importance": perm_importance.importances_mean + } + ).sort_values(by="perm_importance", ascending=False) + + return perm_importance_df def detect_multi_collinearity(self): # Get the VIFs for each variable @@ -589,45 +508,6 @@ class SapModel: self.train_x = self.train_x.drop(columns=drop_vifs["features"].values) self.test_x = self.test_x[self.train_x.columns] - def test_multi_collinearity(self, test_variable): - from statsmodels.regression.linear_model import OLS - # drop target variable - x_temp = self.train_x.drop(columns=[test_variable]) - - # define target variable - y_temp = self.train_x[test_variable] - - # add a constant to the predictors - x_temp = sm.add_constant(x_temp) - - # fit the model - model_temp = OLS(y_temp, x_temp).fit() - print(model_temp.summary()) - - smry = model_temp.summary() - smry_coefs = pd.DataFrame(smry.tables[1].data[1:], columns=smry.tables[1].data[0]) - smry_coefs = smry_coefs.sort_values("P>|t|", ascending=True) - - pd.set_option('display.max_rows', 500) - pd.set_option('display.max_columns', 500) - pd.set_option('display.width', 1000) - - print(smry_coefs[smry_coefs["P>|t|"].astype(float) < 0.0001]) - - import seaborn as sns - - # Select columns - selected_columns = ["main-fuel_Gas: mains gas", "mainheat-description_Community scheme"] - - # Subset dataframe - subset_df = self.train_x[selected_columns] - - # Plot pairplot - sns.pairplot(subset_df) - - crosstab1 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[1]]) - crosstab2 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[2]]) - @staticmethod def plot_regression(df): # Extract the "fit" and "actual" columns from the dataframe @@ -688,45 +568,3 @@ self = SapModel( data=all_data["data"], cleaner=all_data["cleaner"] ) - -from sklearn.ensemble import RandomForestRegressor - -rf = RandomForestRegressor(random_state=self.random_state) -X = self.df.drop(columns=self.RESPONSE) -X = X.drop( - columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating", - "environment-impact-potential", "heating-cost-current", "address3", "local-authority-label", - "hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2", - "posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn", - "energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key", - "potential-energy-efficiency", - "energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1", - "constituency-label", - "building-reference-number", "environment-impact-current", "co2-emissions-current", - ]) -# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length, -# fixed-lighting-outlets-count -for col in [ - "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height", - "low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area" -]: - X[col] = np.where( - X[col] == "", "0", X[col] - ).astype(float) - -Y = self.df[self.RESPONSE] -X = pd.get_dummies(X) -rf.fit(X, Y) - -# Print the name and importance of each feature -importance_df = [] -for feature, importance in zip(X.columns, rf.feature_importances_): - importance_df.append( - { - "Feature": feature, - "rf_importance": importance - } - ) - -importance_df = pd.DataFrame(importance_df) -importance_df = importance_df.sort_values("rf_importance", ascending=False)