mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Testing dropping features - new best accuracy
This commit is contained in:
parent
c698f49d58
commit
ccfdb7cc8c
3 changed files with 80 additions and 19 deletions
|
|
@ -7,6 +7,10 @@ from typing import Any, Dict, Tuple
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
from sklearn.linear_model import Lasso
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import xgboost as xgb
|
||||
|
||||
with open("all_data.pkl", "rb") as f:
|
||||
all_data = pickle.load(f)
|
||||
|
|
@ -162,7 +166,7 @@ class SapModel:
|
|||
|
||||
def clean_missings(model_data):
|
||||
CLEANING_COLS = ["mechanical-ventilation", "energy-tariff", "solar-water-heating-flag", "glazed-type", ""]
|
||||
model_data["glazed-area"].value_counts()
|
||||
model_data["construction-age-band"].value_counts()
|
||||
|
||||
model_data["mechanical-ventilation"] = np.where(
|
||||
model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
|
||||
|
|
@ -185,6 +189,10 @@ class SapModel:
|
|||
model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
|
||||
)
|
||||
|
||||
# model_data["construction-age-band"] = np.where(
|
||||
# model_data["construction-age-band"] == "", "NO DATA!", model_data["construction-age-band"]
|
||||
# )
|
||||
|
||||
return model_data
|
||||
|
||||
model_data = clean_missings(model_data)
|
||||
|
|
@ -264,7 +272,7 @@ class SapModel:
|
|||
|
||||
self.remove_zero_std_cols()
|
||||
|
||||
# self.detect_multi_collinearity()
|
||||
self.detect_multi_collinearity()
|
||||
|
||||
# Add a constant (intercept) column to the independent variables
|
||||
train_x = sm.add_constant(self.train_x)
|
||||
|
|
@ -274,30 +282,49 @@ class SapModel:
|
|||
train_x = train_x.drop(columns=["idx"])
|
||||
test_x = test_x.drop(columns=["idx"])
|
||||
|
||||
importance_df = self.make_importance(train_x)
|
||||
# Test dropping the single least important feature (last row of the importance ranking)
|
||||
to_drop = importance_df.tail(1)["Feature"].values
|
||||
train_x = train_x.drop(columns=to_drop)
|
||||
test_x = test_x[train_x.columns]
|
||||
|
||||
# make regression model
|
||||
model = sm.OLS(self.train_y, train_x)
|
||||
|
||||
# fit model and print results
|
||||
self.results = model.fit()
|
||||
|
||||
train_predictions = self.results.fittedvalues
|
||||
test_predictions = self.results.predict(test_x)
|
||||
|
||||
diagnose = self.test_x.copy()
|
||||
diagnose["predictions"] = test_predictions
|
||||
diagnose["actual"] = self.test_y.values
|
||||
|
||||
self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=self.results.fittedvalues
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
)
|
||||
|
||||
# Predict on new data
|
||||
predictions = self.results.predict(test_x)
|
||||
self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=predictions
|
||||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
# temp hardcoded values
|
||||
best_fit = {'MAPE': 0.04138090547359925, 'Mean Squared Error': 20.14558392249143,
|
||||
'Mean Absolute Error': 3.2071693100226386, 'R2 Score': 0.8070222206305815,
|
||||
'Explained Variance Score': 0.8070222206305815, 'Median Absolute Error': 2.418797962633903}
|
||||
fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": self.results.fittedvalues,
|
||||
"actual": self.train_y,
|
||||
"idx": train_idx
|
||||
}
|
||||
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
|
||||
|
||||
best_predict = {'MAPE': 0.04477710915141379, 'Mean Squared Error': 24.121330207821273,
|
||||
'Mean Absolute Error': 3.443075571126256, 'R2 Score': 0.7346655266247644,
|
||||
'Explained Variance Score': 0.7346701958813864, 'Median Absolute Error': 2.5234727208706076}
|
||||
# temp hardcoded values
|
||||
best_fit = {'MAPE': 0.042768242654695386, 'Mean Squared Error': 21.606875710236896,
|
||||
'Mean Absolute Error': 3.293776606279645, 'R2 Score': 0.7930242722318233,
|
||||
'Explained Variance Score': 0.7930242722318233, 'Median Absolute Error': 2.47686604239054}
|
||||
|
||||
best_predict = {'MAPE': 0.04397538047202114, 'Mean Squared Error': 22.582856696398935,
|
||||
'Mean Absolute Error': 3.384549163877968, 'R2 Score': 0.7515887251149801,
|
||||
'Explained Variance Score': 0.7516508219403573, 'Median Absolute Error': 2.4624472128668344}
|
||||
|
||||
def check_successes(experiment_error, best_error):
|
||||
|
||||
|
|
@ -338,8 +365,6 @@ class SapModel:
|
|||
).sort_values("actual", ascending=True)
|
||||
|
||||
# TODO: Testing
|
||||
from sklearn.linear_model import Lasso
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
|
@ -377,8 +402,6 @@ class SapModel:
|
|||
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
|
||||
|
||||
# TODO: TESTING 2
|
||||
from sklearn.linear_model import LassoCV
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
|
@ -436,6 +459,35 @@ class SapModel:
|
|||
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
|
||||
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
|
||||
|
||||
def make_importance(self, train_x):
    """Rank the columns of ``train_x`` by XGBoost gain importance.

    An XGBoost regressor (squared-error objective, RMSE eval metric) is
    trained on ``train_x`` against ``self.train_y``, and the per-feature
    "gain" scores reported by the booster are collected into a table.

    Args:
        train_x: Training feature matrix; passed directly to ``xgb.DMatrix``.

    Returns:
        A DataFrame with columns ``Feature`` and ``Importance``, sorted so
        the highest-gain feature comes first and the lowest-gain one last.
    """
    # Fit a booster using the library's default number of boosting rounds.
    booster = xgb.train(
        {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse'
        },
        xgb.DMatrix(train_x, label=self.train_y),
    )

    # NOTE(review): per the xgboost docs, get_score() leaves out features that
    # were never used in a split, so the returned table may contain fewer rows
    # than train_x has columns -- confirm callers relying on tail() expect this.
    gain_by_feature = booster.get_score(importance_type='gain')

    ranking = pd.DataFrame({
        'Feature': gain_by_feature.keys(),
        'Importance': gain_by_feature.values()
    })

    # Most important first; nothing is printed, the ranking is handed back.
    return ranking.sort_values(by='Importance', ascending=False)
|
||||
|
||||
def detect_multi_collinearity(self):
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
|
|
@ -453,7 +505,7 @@ class SapModel:
|
|||
]
|
||||
|
||||
vifs = vifs[~vifs["features"].isin(required_features)]
|
||||
drop_vifs = vifs[vifs["vif"] > 100]
|
||||
drop_vifs = vifs[np.isinf(vifs["vif"])]
|
||||
|
||||
# Acceptable drop variables:
|
||||
# main-fuel_Gas: mains gas
|
||||
|
|
|
|||
|
|
@ -296,3 +296,11 @@ def handler():
|
|||
# (summary["construction-age-band"] == "England and Wales: 1976-1982")
|
||||
(summary["number-habitable-rooms"] == "4")
|
||||
]
|
||||
|
||||
from textblob import TextBlob
|
||||
converter = TextBlob("excelent lighting in this hosehold")
|
||||
|
||||
from model_data.utils import correct_spelling
|
||||
result = correct_spelling("excelent lighting in this hosehold")
|
||||
print(result)
|
||||
'excellent lighting in this household'
|
||||
|
|
|
|||
|
|
@ -18,4 +18,5 @@ seaborn
|
|||
statsmodels
|
||||
scikit-learn
|
||||
pyspellchecker
|
||||
textblob
|
||||
textblob
|
||||
xgboost
|
||||
Loading…
Add table
Reference in a new issue