From c698f49d58256c3b8fb57ae22fb219700fc09b93 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Tue, 4 Jul 2023 12:55:59 +0100
Subject: [PATCH] categorical cleaning in progress

---
 model_data/analysis/SapModel.py | 200 +++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 5 deletions(-)

diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py
index f99cf583..e9a9dc64 100644
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -160,6 +160,35 @@ class SapModel:
         # Append on u-values
         model_data = self._append_cleaned_data(model_data)
 
+        def clean_missings(model_data):
+            """Fill empty-string categoricals in place: "N" for the solar flag,
+            "NO DATA!" for ventilation, glazed-type and glazed-area."""
+
+            model_data["mechanical-ventilation"] = np.where(
+                model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
+            )
+
+            # REVIEW THIS
+            # model_data["energy-tariff"] = np.where(
+            #     model_data["energy-tariff"] == "", "Unknown", model_data["mechanical-ventilation"]
+            # )
+            #
+            model_data["solar-water-heating-flag"] = np.where(
+                model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
+            )
+
+            model_data["glazed-type"] = np.where(
+                model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
+            )
+
+            model_data["glazed-area"] = np.where(
+                model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-area"]
+            )
+
+            return model_data
+
+        model_data = clean_missings(model_data)
+
         # Convert transaction_type
         model_data = self._convert_transaction_type(model_data)
 
@@ -181,7 +210,7 @@ class SapModel:
 
         features = [
             x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
-                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE
+                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx"
             ] if x not in exclude_features
         ]
 
@@ -235,8 +264,15 @@ class SapModel:
 
         self.remove_zero_std_cols()
 
+        # 
self.detect_multi_collinearity() + # Add a constant to the independent value train_x = sm.add_constant(self.train_x) + test_x = sm.add_constant(self.test_x) + train_idx = train_x["idx"].copy() + test_ids = self.test_x["idx"].copy() + train_x = train_x.drop(columns=["idx"]) + test_x = test_x.drop(columns=["idx"]) # make regression model model = sm.OLS(self.train_y, train_x) @@ -249,7 +285,7 @@ class SapModel: ) # Predict on new data - predictions = self.results.predict(sm.add_constant(self.test_x)) + predictions = self.results.predict(test_x) self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics( y_true=self.test_y, y_pred=predictions ) @@ -267,7 +303,7 @@ class SapModel: successes = [] for k in experiment_error: - if k == "Explained Variance Score": + if k in ["Explained Variance Score", "R2 Score"]: # We want to maximise this so we want experiment error to be higher successes.append( { @@ -287,8 +323,8 @@ class SapModel: return pd.DataFrame(successes) - check_successes(self.fit_error, best_fit) - check_successes(self.predict_error, best_predict) + fit_success = check_successes(self.fit_error, best_fit) + predict_success = check_successes(self.predict_error, best_predict) self.model_data['fit'] = self.results.fittedvalues # The worst errors over index heavily for flats @@ -301,6 +337,105 @@ class SapModel: } ).sort_values("actual", ascending=True) + # TODO: Testing + from sklearn.linear_model import Lasso + from sklearn.preprocessing import StandardScaler + + # Create a StandardScaler instance + scaler = StandardScaler() + + # Fit the scaler to the training data and transform it + train_x_scaled = scaler.fit_transform(train_x) + + # Transform the test data + test_x_scaled = scaler.transform(test_x) + + # Define the model + lasso_reg = Lasso( + alpha=0.1) # you can change the alpha parameter to adjust the strength of the regularization. 
+ + # Fit the model + lasso_reg.fit(train_x_scaled, self.train_y) + + # Make predictions on the training set + train_predictions = lasso_reg.predict(train_x_scaled) + + # Make predictions on the test set + test_predictions = lasso_reg.predict(test_x_scaled) + + # Calculate metrics based on these predictions. + lasso_fit_error, _ = self.calculate_regression_metrics( + y_true=self.train_y, y_pred=train_predictions + ) + + # Predict on new data + lasso_predict_error, _ = self.calculate_regression_metrics( + y_true=self.test_y, y_pred=test_predictions + ) + + lasso_fit_success = check_successes(lasso_fit_error, best_fit) + lasso_predict_success = check_successes(lasso_predict_error, best_predict) + + # TODO: TESTING 2 + from sklearn.linear_model import LassoCV + from sklearn.preprocessing import StandardScaler + + # Create a StandardScaler instance + scaler = StandardScaler() + + # Fit the scaler to the training data and transform it + train_x_scaled = scaler.fit_transform(train_x) + + # Transform the test data + test_x_scaled = scaler.transform(test_x) + + # Define the model + alphas = np.logspace(-4, 2, 100) # Range of alpha values to search + lasso_reg = LassoCV(cv=10, alphas=alphas) + + # Fit the model + lasso_reg.fit(train_x_scaled, self.train_y) + + # Make predictions on the training set + train_predictions = lasso_reg.predict(train_x_scaled) + + # Make predictions on the test set + test_predictions = lasso_reg.predict(test_x_scaled) + + # Calculate metrics based on these predictions. 
+ lasso_fit_error, lasso_worst_fit_errors = self.calculate_regression_metrics( + y_true=self.train_y, y_pred=train_predictions + ) + + # Predict on new data + lasso_predict_error, lasso_worst_predict_errors = self.calculate_regression_metrics( + y_true=self.test_y, y_pred=test_predictions + ) + + lasso_fit_success = check_successes(lasso_fit_error, best_fit) + lasso_predict_success = check_successes(lasso_predict_error, best_predict) + + fit_df = pd.DataFrame( + { + "fit": train_predictions, + "actual": self.train_y, + "residual": abs(self.train_y - train_predictions), + "idx": train_idx.values + } + ) + fit_df = fit_df.sort_values("residual", ascending=False) + fit_df = fit_df.merge(self.model_data, on="idx") + + zz = fit_df[fit_df["lighting-description"] == "Low energy lighting in all fixed outlets"] + + z = fit_df.head(100).groupby("lighting-description", observed=True)["residual"].agg( + ['mean', 'count']).reset_index() + z = z.sort_values("mean", ascending=False) + + worst_x = self.model_data[self.model_data.index.isin(lasso_worst_fit_errors.index)] + worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True) + worst_x = worst_x.sort_values("Absolute Residual", ascending=False) + def detect_multi_collinearity(self): from statsmodels.stats.outliers_influence import variance_inflation_factor from tqdm import tqdm @@ -312,6 +447,61 @@ class SapModel: # Get the features with the highest VIF vifs = vifs.sort_values("vif", ascending=False) + # There are some features, we do not want to remove + required_features = [ + "walls_u_value", "floor_u_value", "roof_u_value" + ] + + vifs = vifs[~vifs["features"].isin(required_features)] + drop_vifs = vifs[vifs["vif"] > 100] + + # Acceptable drop variables: + # main-fuel_Gas: mains gas + # glazed-type_NO DATA! + # glazed-area_NO DATA! 
+
+        self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
+        self.test_x = self.test_x[self.train_x.columns]
+
+    def test_multi_collinearity(self, test_variable):
+        """Regress test_variable on the remaining train_x columns (OLS) and print the fit diagnostics."""
+        from statsmodels.regression.linear_model import OLS
+        # drop target variable
+        x_temp = self.train_x.drop(columns=[test_variable])
+
+        # define target variable
+        y_temp = self.train_x[test_variable]
+
+        # add a constant to the predictors
+        x_temp = sm.add_constant(x_temp)
+
+        # fit the model
+        model_temp = OLS(y_temp, x_temp).fit()
+        print(model_temp.summary())
+
+        smry = model_temp.summary()
+        smry_coefs = pd.DataFrame(smry.tables[1].data[1:], columns=smry.tables[1].data[0])
+        smry_coefs = smry_coefs.sort_values("P>|t|", ascending=True)
+
+        pd.set_option('display.max_rows', 500)
+        pd.set_option('display.max_columns', 500)
+        pd.set_option('display.width', 1000)
+
+        print(smry_coefs[smry_coefs["P>|t|"].astype(float) < 0.0001])
+
+        import seaborn as sns
+
+        # Select columns
+        selected_columns = ["main-fuel_Gas: mains gas", "mainheat-description_Community scheme"]
+
+        # Subset dataframe
+        subset_df = self.train_x[selected_columns]
+
+        # Plot pairplot
+        sns.pairplot(subset_df)
+
+        crosstabs = [pd.crosstab(self.train_x[selected_columns[0]], self.train_x[c]) for c in selected_columns[1:]]
+
     @staticmethod
     def plot_regression(df):
         # Extract the "fit" and "actual" columns from the dataframe