Cleaning up modelling code

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-05 09:34:14 +01:00
parent 946e6d092e
commit 710f446ebc

View file

@ -7,9 +7,8 @@ from typing import Any, Dict, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
@ -91,6 +90,23 @@ class SapModel:
"extension-count",
]
# For the moment, we store records of the best performing models as a benchmark for future imporvements
BEST_FIT = {
'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
}
BEST_PREDICT = {
'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
}
BUCKET_VARIABLES = [
"number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
]
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
self.df = pd.DataFrame(data)
self.cleaner = cleaner
@ -107,7 +123,10 @@ class SapModel:
self.fit_error = None
self.predict_error = None
self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
self.fit_df = None
self.predict_df = None
self.diagnosis = {}
def run(self, plot=False):
"""
@ -163,83 +182,49 @@ class SapModel:
model_data = model_data.drop(columns=["transaction-type"])
return model_data
@staticmethod
def bucket_and_fill(df, column_name, n_bins=10):
"""
Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
:param df: Dataframe of features to be binned
:param column_name: Name of the column to be binned
:param n_bins: Number of bins to use
:return: Dataframe with binned column
"""
# Check if the column is numerical
if np.issubdtype(df[column_name].dtype, np.number):
# Create a new categorical column from numerical one by binning the data
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
# Replace missing data with "NO_RECORD"
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
df[column_name + "_bucket"] = np.where(
df[column_name + "_bucket"] == "nan",
"NO_RECORD",
df[column_name + "_bucket"]
)
return df
def _clean_numericals(self, model_data):
# Try binning numericals
def bucket_and_fill(df, column_name, n_bins=10):
# Check if the column is numerical
if np.issubdtype(df[column_name].dtype, np.number):
# Create a new categorical column from numerical one by binning the data
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
# Replace missing data with "NO_RECORD"
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
df[column_name + "_bucket"] = np.where(
df[column_name + "_bucket"] == "nan",
"NO_RECORD",
df[column_name + "_bucket"]
)
return df
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
bucket_variables = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count',
'multi-glaze-proportion']
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables]
for col in bucket_variables:
for col in self.BUCKET_VARIABLES:
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
model_data = bucket_and_fill(model_data, col)
model_data = self.bucket_and_fill(model_data, col)
# Replace the data with the binned version
model_data = model_data.drop(columns=bucket_variables)
model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
model_data = model_data.rename(
columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables))
columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
)
# Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
for col in remaining_numericals:
model_data[col] = np.where(
model_data[col] == "", "0", model_data[col]
).astype(float)
# Basic fill the rest of the columns with 0
# grouping_variables = ["constituency", "built-form", "property-type"]
# secondary_grouping_variables = ["constituency", "property-type"]
#
# def append_missings(model_data, col, grouping_variables, secondary=False):
# if not secondary:
# numerical_data = model_data[(model_data[col] != "")].copy()
# else:
# numerical_data = model_data[~pd.isnull(model_data[col])].copy()
#
# numerical_data[col] = numerical_data[col].astype(float)
# numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index()
# numerical_data = numerical_data.rename(columns={col: f"{col}_avg"})
# model_data = model_data.merge(numerical_data, how="left", on=grouping_variables)
#
# if not secondary:
# model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col])
# else:
# model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col])
#
# model_data = model_data.drop(columns=[f"{col}_avg"])
# model_data[col] = model_data[col].astype(float)
#
# return model_data
#
# for col in numerical_cols:
# model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
# model_data[col] = model_data[col].fillna(model_data[col].mean())
# model_data = append_missings(model_data, col, grouping_variables)
#
# if pd.isnull(model_data[col]).sum():
# model_data = append_missings(model_data, col, secondary_grouping_variables, True)
#
# if pd.isnull(model_data[col]).sum():
# raise Exception("Cleaning failed, missing values remain")
#
# model_data[col] = np.where(
# model_data[col] == "", "0", model_data[col]
# ).astype(float)
return model_data
@staticmethod
@ -336,11 +321,12 @@ class SapModel:
self.test_x = self.test_x[self.train_x.columns]
def fit_model(self):
"""
Main function to fit the model and produce accuracy metrics
:return:
"""
# Dummy out the categorical variables
binned = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion']
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True)
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
# Convert booleans to integer
for col in x.columns:
@ -361,36 +347,10 @@ class SapModel:
train_x = sm.add_constant(self.train_x)
test_x = sm.add_constant(self.test_x)
train_idx = train_x["idx"].copy()
test_ids = self.test_x["idx"].copy()
test_idx = self.test_x["idx"].copy()
train_x = train_x.drop(columns=["idx"])
test_x = test_x.drop(columns=["idx"])
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
rf = RandomForestRegressor(random_state=self.random_state)
rf.fit(train_x, self.train_y)
# Print the name and importance of each feature
importance_df = []
for feature, importance in zip(train_x.columns, rf.feature_importances_):
importance_df.append(
{
"Feature": feature,
"rf_importance": importance
}
)
importance_df = pd.DataFrame(importance_df)
importance_df = importance_df.sort_values(by="rf_importance", ascending=False)
perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error')
perm_importance_df = pd.DataFrame(
{
"Feature": test_x.columns,
"perm_importance": perm_importance.importances_mean
}
).sort_values(by="perm_importance", ascending=False)
# make regression model
model = sm.OLS(self.train_y, train_x)
# fit model and print results
@ -412,157 +372,116 @@ class SapModel:
y_true=self.test_y, y_pred=test_predictions
)
fit_df = pd.DataFrame(
{
"fit": self.results.fittedvalues,
"actual": self.train_y,
"idx": train_idx
}
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
# temp hardcoded values
best_fit = {'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197}
best_predict = {'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828}
def check_successes(experiment_error, best_error):
successes = []
for k in experiment_error:
if k in ["Explained Variance Score", "R2 Score"]:
# We want to maximise this so we want experiment error to be higher
successes.append(
{
"measure": k,
"success": experiment_error[k] >= best_error[k],
"difference": abs(experiment_error[k] - best_error[k])
}
)
continue
successes.append(
{
"measure": k,
"success": experiment_error[k] <= best_error[k],
"difference": abs(experiment_error[k] - best_error[k])
}
)
return pd.DataFrame(successes)
fit_success = check_successes(self.fit_error, best_fit)
predict_success = check_successes(self.predict_error, best_predict)
print(self.results.summary())
self.model_data['fit'] = self.results.fittedvalues
# The worst errors over index heavily for flats
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
self.fit_df = pd.DataFrame(
{
"fit": self.results.fittedvalues,
"actual": self.train_y
"actual": self.train_y,
"idx": train_idx
}
).sort_values("actual", ascending=True)
# TODO: Testing
# Create a StandardScaler instance
scaler = StandardScaler()
# Fit the scaler to the training data and transform it
train_x_scaled = scaler.fit_transform(train_x)
# Transform the test data
test_x_scaled = scaler.transform(test_x)
# Define the model
lasso_reg = Lasso(
alpha=0.1) # you can change the alpha parameter to adjust the strength of the regularization.
# Fit the model
lasso_reg.fit(train_x_scaled, self.train_y)
# Make predictions on the training set
train_predictions = lasso_reg.predict(train_x_scaled)
# Make predictions on the test set
test_predictions = lasso_reg.predict(test_x_scaled)
# Calculate metrics based on these predictions.
lasso_fit_error, _ = self.calculate_regression_metrics(
y_true=self.train_y, y_pred=train_predictions
)
# Predict on new data
lasso_predict_error, _ = self.calculate_regression_metrics(
y_true=self.test_y, y_pred=test_predictions
)
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
# TODO: TESTING 2
# Create a StandardScaler instance
scaler = StandardScaler()
# Fit the scaler to the training data and transform it
train_x_scaled = scaler.fit_transform(train_x)
# Transform the test data
test_x_scaled = scaler.transform(test_x)
# Define the model
alphas = np.logspace(-4, 2, 100) # Range of alpha values to search
lasso_reg = LassoCV(cv=10, alphas=alphas)
# Fit the model
lasso_reg.fit(train_x_scaled, self.train_y)
# Make predictions on the training set
train_predictions = lasso_reg.predict(train_x_scaled)
# Make predictions on the test set
test_predictions = lasso_reg.predict(test_x_scaled)
# Calculate metrics based on these predictions.
lasso_fit_error, lasso_worst_fit_errors = self.calculate_regression_metrics(
y_true=self.train_y, y_pred=train_predictions
)
# Predict on new data
lasso_predict_error, lasso_worst_predict_errors = self.calculate_regression_metrics(
y_true=self.test_y, y_pred=test_predictions
)
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
fit_df = pd.DataFrame(
self.predict_df = pd.DataFrame(
{
"fit": train_predictions,
"actual": self.train_y,
"residual": abs(self.train_y - train_predictions),
"idx": train_idx.values
"predictions": test_predictions,
"actual": self.test_y,
"idx": test_idx
}
)
fit_df = fit_df.sort_values("residual", ascending=False)
fit_df = fit_df.merge(self.model_data, on="idx")
zz = fit_df[fit_df["lighting-description"] == "Low energy lighting in all fixed outlets"]
self.diagnosis = {
"fit_success": fit_success,
"predict_success": predict_success,
"summary": self.results.summary()
}
z = fit_df.head(100).groupby("lighting-description", observed=True)["residual"].agg(
['mean', 'count']).reset_index()
z = z.sort_values("mean", ascending=False)
@staticmethod
def check_successes(experiment_error, best_error):
"""
Simple function to check if the experiment error is better than the best error
:param experiment_error: output of calculate_regression_metrics() on the experiment
:param best_error: Current benchmark best error
:return:
"""
worst_x = self.model_data[self.model_data.index.isin(lasso_worst_fit_errors.index)]
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
successes = []
for k in experiment_error:
if k in ["Explained Variance Score", "R2 Score"]:
# We want to maximise this so we want experiment error to be higher
successes.append(
{
"measure": k,
"success": experiment_error[k] >= best_error[k],
"difference": abs(experiment_error[k] - best_error[k])
}
)
continue
successes.append(
{
"measure": k,
"success": experiment_error[k] <= best_error[k],
"difference": abs(experiment_error[k] - best_error[k])
}
)
return pd.DataFrame(successes)
def rf_importance(self, train_x, train_y, test_x, test_y):
"""
Utility function to estimate feature importance using a random forest
This is useful to get a sense of some of the key features which are driving model
performance
:param train_x: Training data covariates to build the importance model on
:param train_y: Training data response to build the importance model on
:param test_x: Test data covariates to build the permutation importance model on
:param test_y: Test data response to build the permutation importance model on
:return: Pandas dataframe of feature importances, ranked by most important to least
"""
rf = RandomForestRegressor(random_state=self.random_state)
rf.fit(train_x, train_y)
# Print the name and importance of each feature
rf_importance_df = []
for feature, importance in zip(train_x.columns, rf.feature_importances_):
rf_importance_df.append(
{
"Feature": feature,
"rf_importance": importance
}
)
rf_importance_df = pd.DataFrame(rf_importance_df)
rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False)
perm_importance = self.permuation_importance(rf, test_x, test_y)
return rf_importance_df, perm_importance
@staticmethod
def permuation_importance(rf, test_x, test_y):
"""
Simple utility function to produce permutation importance for a given model\
:param rf: Random forest model to calculate permutation importance for
:param test_x: Test covariates to be used for permutation importance
:param test_y: Test response to be used for permutation importance
:return:
"""
perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')
perm_importance_df = pd.DataFrame(
{
"Feature": test_x.columns,
"perm_importance": perm_importance.importances_mean
}
).sort_values(by="perm_importance", ascending=False)
return perm_importance_df
def detect_multi_collinearity(self):
# Get the VIFs for each variable
@ -589,45 +508,6 @@ class SapModel:
self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
self.test_x = self.test_x[self.train_x.columns]
def test_multi_collinearity(self, test_variable):
from statsmodels.regression.linear_model import OLS
# drop target variable
x_temp = self.train_x.drop(columns=[test_variable])
# define target variable
y_temp = self.train_x[test_variable]
# add a constant to the predictors
x_temp = sm.add_constant(x_temp)
# fit the model
model_temp = OLS(y_temp, x_temp).fit()
print(model_temp.summary())
smry = model_temp.summary()
smry_coefs = pd.DataFrame(smry.tables[1].data[1:], columns=smry.tables[1].data[0])
smry_coefs = smry_coefs.sort_values("P>|t|", ascending=True)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
print(smry_coefs[smry_coefs["P>|t|"].astype(float) < 0.0001])
import seaborn as sns
# Select columns
selected_columns = ["main-fuel_Gas: mains gas", "mainheat-description_Community scheme"]
# Subset dataframe
subset_df = self.train_x[selected_columns]
# Plot pairplot
sns.pairplot(subset_df)
crosstab1 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[1]])
crosstab2 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[2]])
@staticmethod
def plot_regression(df):
# Extract the "fit" and "actual" columns from the dataframe
@ -688,45 +568,3 @@ self = SapModel(
data=all_data["data"],
cleaner=all_data["cleaner"]
)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=self.random_state)
X = self.df.drop(columns=self.RESPONSE)
X = X.drop(
columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating",
"environment-impact-potential", "heating-cost-current", "address3", "local-authority-label",
"hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2",
"posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn",
"energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key",
"potential-energy-efficiency",
"energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1",
"constituency-label",
"building-reference-number", "environment-impact-current", "co2-emissions-current",
])
# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length,
# fixed-lighting-outlets-count
for col in [
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height",
"low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area"
]:
X[col] = np.where(
X[col] == "", "0", X[col]
).astype(float)
Y = self.df[self.RESPONSE]
X = pd.get_dummies(X)
rf.fit(X, Y)
# Print the name and importance of each feature
importance_df = []
for feature, importance in zip(X.columns, rf.feature_importances_):
importance_df.append(
{
"Feature": feature,
"rf_importance": importance
}
)
importance_df = pd.DataFrame(importance_df)
importance_df = importance_df.sort_values("rf_importance", ascending=False)