From ccfdb7cc8c4a566c6151c8848b1dbf6e4bb2bfb3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 4 Jul 2023 15:45:34 +0100 Subject: [PATCH] Testing dropping features - new best accuracy --- model_data/analysis/SapModel.py | 88 ++++++++++++++++++++++++++------- model_data/app.py | 8 +++ model_data/requirements.txt | 3 +- 3 files changed, 80 insertions(+), 19 deletions(-) diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index e9a9dc64..66382f87 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -7,6 +7,10 @@ from typing import Any, Dict, Tuple from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ median_absolute_error, mean_absolute_percentage_error +from sklearn.linear_model import Lasso +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LinearRegression +import xgboost as xgb with open("all_data.pkl", "rb") as f: all_data = pickle.load(f) @@ -162,7 +166,7 @@ class SapModel: def clean_missings(model_data): CLEANING_COLS = ["mechanical-ventilation", "energy-tariff", "solar-water-heating-flag", "glazed-type", ""] - model_data["glazed-area"].value_counts() + model_data["construction-age-band"].value_counts() model_data["mechanical-ventilation"] = np.where( model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"] @@ -185,6 +189,10 @@ class SapModel: model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"] ) + # model_data["construction-age-band"] = np.where( + # model_data["construction-age-band"] == "", "NO DATA!", model_data["construction-age-band"] + # ) + return model_data model_data = clean_missings(model_data) @@ -264,7 +272,7 @@ class SapModel: self.remove_zero_std_cols() - # self.detect_multi_collinearity() + self.detect_multi_collinearity() # Add a constant to the independent value train_x = sm.add_constant(self.train_x) @@ -274,30 +282,49 @@ class SapModel: train_x = train_x.drop(columns=["idx"]) test_x = test_x.drop(columns=["idx"]) + importance_df = self.make_importance(train_x) + # Test dropping the least important features + to_drop = importance_df.tail(1)["Feature"].values + train_x = train_x.drop(columns=to_drop) + test_x = test_x[train_x.columns] + # make regression model model = sm.OLS(self.train_y, train_x) - # fit model and print results self.results = model.fit() + train_predictions = self.results.fittedvalues + test_predictions = self.results.predict(test_x) + + diagnose = self.test_x.copy() + diagnose["predictions"] = test_predictions + diagnose["actual"] = self.test_y.values + self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics( - y_true=self.train_y, y_pred=self.results.fittedvalues + y_true=self.train_y, y_pred=train_predictions ) # Predict on new data - predictions = self.results.predict(test_x) self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics( - y_true=self.test_y, y_pred=predictions + y_true=self.test_y, y_pred=test_predictions ) - # temp hardcoded values - best_fit = {'MAPE': 0.04138090547359925, 'Mean Squared Error': 20.14558392249143, - 'Mean Absolute Error': 3.2071693100226386, 'R2 Score': 0.8070222206305815, - 'Explained Variance Score': 0.8070222206305815, 'Median Absolute Error': 2.418797962633903} + fit_df = pd.DataFrame( + { + "fit": self.results.fittedvalues, + "actual": self.train_y, + "idx": train_idx + } + ).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx") - best_predict = {'MAPE': 0.04477710915141379, 'Mean Squared Error': 24.121330207821273, - 'Mean Absolute Error': 3.443075571126256, 'R2 Score': 0.7346655266247644, - 'Explained Variance Score': 0.7346701958813864, 'Median Absolute Error': 2.5234727208706076} + # temp hardcoded values + best_fit = {'MAPE': 0.042768242654695386, 'Mean Squared Error': 21.606875710236896, + 'Mean Absolute Error': 3.293776606279645, 'R2 Score': 0.7930242722318233, + 'Explained Variance Score': 0.7930242722318233, 'Median Absolute Error': 2.47686604239054} + + best_predict = {'MAPE': 0.04397538047202114, 'Mean Squared Error': 22.582856696398935, + 'Mean Absolute Error': 3.384549163877968, 'R2 Score': 0.7515887251149801, + 'Explained Variance Score': 0.7516508219403573, 'Median Absolute Error': 2.4624472128668344} def check_successes(experiment_error, best_error): @@ -338,8 +365,6 @@ class SapModel: ).sort_values("actual", ascending=True) # TODO: Testing - from sklearn.linear_model import Lasso - from sklearn.preprocessing import StandardScaler # Create a StandardScaler instance scaler = StandardScaler() @@ -377,8 +402,6 @@ class SapModel: lasso_predict_success = check_successes(lasso_predict_error, best_predict) # TODO: TESTING 2 - from sklearn.linear_model import LassoCV - from sklearn.preprocessing import StandardScaler # Create a StandardScaler instance scaler = StandardScaler() @@ -436,6 +459,35 @@ class SapModel: worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True) worst_x = worst_x.sort_values("Absolute Residual", ascending=False) + def make_importance(self, train_x): + + # Create a DMatrix from your training data + dtrain = xgb.DMatrix(train_x, label=self.train_y) + + # Set the parameters for the XGBoost model + params = { + 'objective': 'reg:squarederror', + 'eval_metric': 'rmse' + } + + # Train the XGBoost model + model = xgb.train(params, dtrain) + + # Get feature importance scores + importance_scores = model.get_score(importance_type='gain') + + # Create a dataframe with feature names and importance scores + importance_df = pd.DataFrame({ + 'Feature': importance_scores.keys(), + 'Importance': importance_scores.values() + }) + + # Sort the dataframe by importance score in descending order + importance_df = importance_df.sort_values(by='Importance', ascending=False) + + # Print the feature importances + return importance_df + def detect_multi_collinearity(self): from statsmodels.stats.outliers_influence import variance_inflation_factor from tqdm import tqdm @@ -453,7 +505,7 @@ class SapModel: ] vifs = vifs[~vifs["features"].isin(required_features)] - drop_vifs = vifs[vifs["vif"] > 100] + drop_vifs = vifs[np.isinf(vifs["vif"])] # Acceptable drop variables: # main-fuel_Gas: mains gas diff --git a/model_data/app.py b/model_data/app.py index 586337db..8e340e9e 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -296,3 +296,11 @@ def handler(): # (summary["construction-age-band"] == "England and Wales: 1976-1982") (summary["number-habitable-rooms"] == "4") ] + + from textblob import TextBlob + converter = TextBlob("excelent lighting in this hosehold") + + from model_data.utils import correct_spelling + result = correct_spelling("excelent lighting in this hosehold") + print(result) + 'excellent lighting in this household' diff --git a/model_data/requirements.txt b/model_data/requirements.txt index ff4d3dda..13012d8a 100644 --- a/model_data/requirements.txt +++ b/model_data/requirements.txt @@ -18,4 +18,5 @@ seaborn statsmodels scikit-learn pyspellchecker -textblob \ No newline at end of file +textblob +xgboost \ No newline at end of file