diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index 45ffd530..190a4c36 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -11,13 +11,16 @@ from sklearn.linear_model import Lasso from sklearn.preprocessing import StandardScaler import xgboost as xgb +from statsmodels.stats.outliers_influence import variance_inflation_factor +from tqdm import tqdm + with open("all_data.pkl", "rb") as f: all_data = pickle.load(f) class SapModel: # We want to estimate for making improvements on different property components - RESPONSE = "environment-impact-current" + RESPONSE = "current-energy-efficiency" # We could potentially build models by constituency to avoid having too many # features in the model BASE_FEATURES = [ @@ -50,6 +53,12 @@ class SapModel: # "lighting-description" # Might not need to use this "low-energy-lighting", "number-open-fireplaces", + "mainheatcont-description", + "fixed-lighting-outlets-count", + "floor-height", + "floor-level", + "total-floor-area", + "extension-count", ] CATEGORICAL_COLS = [ @@ -69,8 +78,17 @@ class SapModel: "glazed-type", "glazed-area", "construction-age-band", - # Testing - "lighting-description" + "lighting-description", + "mainheatcont-description", + "floor-level", + ] + + NUMERICAL_COLUMNS = [ + "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", + "fixed-lighting-outlets-count", + "floor-height", + "total-floor-area", + "extension-count", ] def __init__(self, data, cleaner, test_size=0.2, random_state=None): @@ -145,14 +163,82 @@ class SapModel: model_data = model_data.drop(columns=["transaction-type"]) return model_data - @staticmethod - def _clean_numericals(model_data): + def _clean_numericals(self, model_data): - for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]: + # Try binning numericals + def bucket_and_fill(df, column_name, n_bins=10): + # Check if the column is numerical + if np.issubdtype(df[column_name].dtype, np.number): + # Create a new categorical column from numerical one by binning the data + df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str) + # Replace missing data with "NO_RECORD" + df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD") + df[column_name + "_bucket"] = np.where( + df[column_name + "_bucket"] == "nan", + "NO_RECORD", + df[column_name + "_bucket"] + ) + return df + + bucket_variables = [] + remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables] + + for col in bucket_variables: + model_data[col] = pd.to_numeric(model_data[col], errors='coerce') + model_data = bucket_and_fill(model_data, col) + + # Replace the data with the binned version + model_data = model_data.drop(columns=bucket_variables) + model_data = model_data.rename( + columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables)) + ) + + for col in remaining_numericals: model_data[col] = np.where( model_data[col] == "", "0", model_data[col] ).astype(float) + # Basic fill the rest of the columns with 0 + + # grouping_variables = ["constituency", "built-form", "property-type"] + # secondary_grouping_variables = ["constituency", "property-type"] + # + # def append_missings(model_data, col, grouping_variables, secondary=False): + # if not secondary: + # numerical_data = model_data[(model_data[col] != "")].copy() + # else: + # numerical_data = model_data[~pd.isnull(model_data[col])].copy() + # + # numerical_data[col] = numerical_data[col].astype(float) + # numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index() + # numerical_data = numerical_data.rename(columns={col: f"{col}_avg"}) + # model_data = model_data.merge(numerical_data, how="left", on=grouping_variables) + # + # if not secondary: + # model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col]) + # else: + # model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col]) + # + # model_data = model_data.drop(columns=[f"{col}_avg"]) + # model_data[col] = model_data[col].astype(float) + # + # return model_data + # + # for col in numerical_cols: + # model_data[col] = pd.to_numeric(model_data[col], errors='coerce') + # model_data[col] = model_data[col].fillna(model_data[col].mean()) + # model_data = append_missings(model_data, col, grouping_variables) + # + # if pd.isnull(model_data[col]).sum(): + # model_data = append_missings(model_data, col, secondary_grouping_variables, True) + # + # if pd.isnull(model_data[col]).sum(): + # raise Exception("Cleaning failed, missing values remain") + # + # model_data[col] = np.where( + # model_data[col] == "", "0", model_data[col] + # ).astype(float) + return model_data @staticmethod @@ -212,7 +298,7 @@ class SapModel: features = [ x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [ - "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx" + "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap" ] if x not in exclude_features ] @@ -251,7 +337,9 @@ class SapModel: def fit_model(self): # Dummy out the categorical variables - x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS, drop_first=True) + binned = [] + + x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True) # Convert booleans to integer for col in x.columns: @@ -276,16 +364,6 @@ class SapModel: train_x = train_x.drop(columns=["idx"]) test_x = test_x.drop(columns=["idx"]) - # importance_df = self.make_importance(train_x) - # Test dropping the least important features - # to_drop = importance_df.tail(2)["Feature"].values - # Dropping this is a good idea - to_drop = [ - "hotwater-description_Electric immersion, off-peak", - ] - train_x = train_x.drop(columns=to_drop) - test_x = test_x[train_x.columns] - from sklearn.ensemble import RandomForestRegressor from sklearn.inspection import permutation_importance @@ -342,13 +420,13 @@ class SapModel: ).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx") # temp hardcoded values - best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226, - 'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128, - 'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914} + best_fit = {'MAPE': 0.04617542805587113, 'Mean Squared Error': 18.62306128026334, + 'Mean Absolute Error': 2.865262003625814, 'R2 Score': 0.8008316762496143, + 'Explained Variance Score': 0.8008316762496143, 'Median Absolute Error': 1.911197425417548} - best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142, - 'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001, - 'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855} + best_predict = {'MAPE': 0.04358926901734807, 'Mean Squared Error': 21.197491698961528, + 'Mean Absolute Error': 3.046853690257838, 'R2 Score': 0.7215087343364782, + 'Explained Variance Score': 0.7215726927575035, 'Median Absolute Error': 1.921094388694634} def check_successes(experiment_error, best_error): @@ -485,38 +563,7 @@ class SapModel: worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True) worst_x = worst_x.sort_values("Absolute Residual", ascending=False) - def make_importance(self, train_x): - - # Create a DMatrix from your training data - dtrain = xgb.DMatrix(train_x, label=self.train_y) - - # Set the parameters for the XGBoost model - params = { - 'objective': 'reg:squarederror', - 'eval_metric': 'rmse' - } - - # Train the XGBoost model - model = xgb.train(params, dtrain) - - # Get feature importance scores - importance_scores = model.get_score(importance_type='gain') - - # Create a dataframe with feature names and importance scores - importance_df = pd.DataFrame({ - 'Feature': importance_scores.keys(), - 'Importance': importance_scores.values() - }) - - # Sort the dataframe by importance score in descending order - importance_df = importance_df.sort_values(by='Importance', ascending=False) - - # Print the feature importances - return importance_df - def detect_multi_collinearity(self): - from statsmodels.stats.outliers_influence import variance_inflation_factor - from tqdm import tqdm # Get the VIFs for each variable vifs = pd.DataFrame() vifs["features"] = self.train_x.columns @@ -616,14 +663,14 @@ class SapModel: Returns: dict: Dictionary containing the calculated metrics. """ - metrics = {} - - metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) - metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred) - metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred) - metrics['R2 Score'] = r2_score(y_true, y_pred) - metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred) - metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred) + metrics = { + 'MAPE': mean_absolute_percentage_error(y_true, y_pred), + 'Mean Squared Error': mean_squared_error(y_true, y_pred), + 'Mean Absolute Error': mean_absolute_error(y_true, y_pred), + 'R2 Score': r2_score(y_true, y_pred), + 'Explained Variance Score': explained_variance_score(y_true, y_pred), + 'Median Absolute Error': median_absolute_error(y_true, y_pred) + } errors = pd.DataFrame() errors['Fit'] = y_true @@ -645,20 +692,40 @@ from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=self.random_state) X = self.df.drop(columns=self.RESPONSE) -for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]: +X = X.drop( + columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating", + "environment-impact-potential", "heating-cost-current", "address3", "local-authority-label", + "hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2", + "posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn", + "energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key", + "potential-energy-efficiency", + "energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1", + "constituency-label", + "building-reference-number", "environment-impact-current", "co2-emissions-current", + ]) +# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length, +# fixed-lighting-outlets-count +for col in [ + "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height", + "low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area" +]: X[col] = np.where( X[col] == "", "0", X[col] ).astype(float) Y = self.df[self.RESPONSE] +X = pd.get_dummies(X) rf.fit(X, Y) # Print the name and importance of each feature importance_df = [] -for feature, importance in zip(train_x.columns, rf.feature_importances_): +for feature, importance in zip(X.columns, rf.feature_importances_): importance_df.append( { "Feature": feature, "rf_importance": importance } ) + +importance_df = pd.DataFrame(importance_df) +importance_df = importance_df.sort_values("rf_importance", ascending=False)