mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
removed zero variance features
This commit is contained in:
parent
ff84635cb8
commit
d04be241bd
1 changed files with 76 additions and 6 deletions
|
|
@ -55,7 +55,6 @@ class SapModel:
|
||||||
"number-habitable-rooms",
|
"number-habitable-rooms",
|
||||||
"constituency",
|
"constituency",
|
||||||
"number-heated-rooms",
|
"number-heated-rooms",
|
||||||
"lighting-description",
|
|
||||||
"mainheat-description",
|
"mainheat-description",
|
||||||
"hotwater-description",
|
"hotwater-description",
|
||||||
"main-fuel",
|
"main-fuel",
|
||||||
|
|
@ -67,6 +66,8 @@ class SapModel:
|
||||||
"glazed-type",
|
"glazed-type",
|
||||||
"glazed-area",
|
"glazed-area",
|
||||||
"construction-age-band",
|
"construction-age-band",
|
||||||
|
# Testing
|
||||||
|
"lighting-description"
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
|
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
|
||||||
|
|
@ -83,7 +84,8 @@ class SapModel:
|
||||||
self.results = None
|
self.results = None
|
||||||
self.model_data = None
|
self.model_data = None
|
||||||
self.fit_error = None
|
self.fit_error = None
|
||||||
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
self.predict_error = None
|
||||||
|
self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
|
||||||
self.fit_df = None
|
self.fit_df = None
|
||||||
|
|
||||||
def run(self, plot=False):
|
def run(self, plot=False):
|
||||||
|
|
@ -173,7 +175,9 @@ class SapModel:
|
||||||
~pd.isnull(model_data["roof_u_value"])
|
~pd.isnull(model_data["roof_u_value"])
|
||||||
]
|
]
|
||||||
|
|
||||||
exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
|
exclude_features = [
|
||||||
|
"walls-description", "floor-description", "roof-description", "transaction-type"
|
||||||
|
]
|
||||||
|
|
||||||
features = [
|
features = [
|
||||||
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
|
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
|
||||||
|
|
@ -200,6 +204,19 @@ class SapModel:
|
||||||
random_state=self.random_state
|
random_state=self.random_state
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def remove_zero_std_cols(self, threshold=1e-3):
    """Drop (near-)constant feature columns from the train/test matrices.

    Every column of ``self.train_x`` whose standard deviation is less than
    or equal to ``threshold`` is removed. ``self.test_x`` is then re-selected
    so that it carries exactly the surviving training columns, in the
    training column order.

    Args:
        threshold: Largest standard deviation that still counts as
            "zero variance". Defaults to ``1e-3``.

    Both ``self.train_x`` and ``self.test_x`` are reassigned in place on the
    instance; nothing is returned.
    """
    # Per-column spread of the training features
    column_spread = self.train_x.std()

    # Labels of the columns that barely vary across the training rows
    flat_columns = column_spread.index[column_spread.le(threshold)]

    # Remove them from the training matrix ...
    self.train_x = self.train_x.drop(columns=flat_columns)

    # ... and align the test matrix to whatever columns are left
    surviving_columns = self.train_x.columns
    self.test_x = self.test_x[surviving_columns]
|
||||||
|
|
||||||
def fit_model(self):
|
def fit_model(self):
|
||||||
|
|
||||||
# Dummy out the categorical variables
|
# Dummy out the categorical variables
|
||||||
|
|
@ -216,6 +233,8 @@ class SapModel:
|
||||||
# Create the training and test sets for each run
|
# Create the training and test sets for each run
|
||||||
self.make_training_test(x)
|
self.make_training_test(x)
|
||||||
|
|
||||||
|
self.remove_zero_std_cols()
|
||||||
|
|
||||||
# Add a constant to the independent value
|
# Add a constant to the independent value
|
||||||
train_x = sm.add_constant(self.train_x)
|
train_x = sm.add_constant(self.train_x)
|
||||||
|
|
||||||
|
|
@ -225,10 +244,52 @@ class SapModel:
|
||||||
# fit model and print results
|
# fit model and print results
|
||||||
self.results = model.fit()
|
self.results = model.fit()
|
||||||
|
|
||||||
self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
|
self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
|
||||||
y_true=self.train_y, y_pred=self.results.fittedvalues
|
y_true=self.train_y, y_pred=self.results.fittedvalues
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Predict on new data
|
||||||
|
predictions = self.results.predict(sm.add_constant(self.test_x))
|
||||||
|
self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
|
||||||
|
y_true=self.test_y, y_pred=predictions
|
||||||
|
)
|
||||||
|
|
||||||
|
# temp hardcoded values
|
||||||
|
best_fit = {'MAPE': 0.04138090547359925, 'Mean Squared Error': 20.14558392249143,
|
||||||
|
'Mean Absolute Error': 3.2071693100226386, 'R2 Score': 0.8070222206305815,
|
||||||
|
'Explained Variance Score': 0.8070222206305815, 'Median Absolute Error': 2.418797962633903}
|
||||||
|
|
||||||
|
best_predict = {'MAPE': 0.04477710915141379, 'Mean Squared Error': 24.121330207821273,
|
||||||
|
'Mean Absolute Error': 3.443075571126256, 'R2 Score': 0.7346655266247644,
|
||||||
|
'Explained Variance Score': 0.7346701958813864, 'Median Absolute Error': 2.5234727208706076}
|
||||||
|
|
||||||
|
def check_successes(experiment_error, best_error,
                    higher_is_better=("R2 Score", "Explained Variance Score")):
    """Compare one set of regression metrics against a reference set.

    Args:
        experiment_error: Mapping of metric name -> value for the current run.
        best_error: Mapping of metric name -> reference ("best so far") value.
            Must contain every key present in ``experiment_error``.
        higher_is_better: Names of the metrics that should be maximised.
            All other metrics are treated as errors to be minimised.
            "R2 Score" is included by default: previously only
            "Explained Variance Score" was maximised, so a better R2 was
            reported as a failure.

    Returns:
        pandas.DataFrame with one row per metric and the columns
        ``measure``, ``success`` (True when the run is at least as good as
        the reference) and ``difference`` (absolute gap to the reference).

    Raises:
        KeyError: If a metric in ``experiment_error`` is missing from
            ``best_error``.
    """
    maximise = set(higher_is_better)

    rows = []
    for measure in experiment_error:
        value = experiment_error[measure]
        reference = best_error[measure]

        # Scores must not drop below the reference; errors must not exceed it
        if measure in maximise:
            succeeded = value >= reference
        else:
            succeeded = value <= reference

        rows.append(
            {
                "measure": measure,
                "success": succeeded,
                "difference": abs(value - reference),
            }
        )

    # Explicit columns keep the frame's shape stable even with no metrics
    return pd.DataFrame(rows, columns=["measure", "success", "difference"])
|
||||||
|
|
||||||
|
check_successes(self.fit_error, best_fit)
|
||||||
|
check_successes(self.predict_error, best_predict)
|
||||||
|
|
||||||
self.model_data['fit'] = self.results.fittedvalues
|
self.model_data['fit'] = self.results.fittedvalues
|
||||||
# The worst errors over index heavily for flats
|
# The worst errors over index heavily for flats
|
||||||
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
||||||
|
|
@ -240,6 +301,17 @@ class SapModel:
|
||||||
}
|
}
|
||||||
).sort_values("actual", ascending=True)
|
).sort_values("actual", ascending=True)
|
||||||
|
|
||||||
|
def detect_multi_collinearity(self):
    """Compute a variance inflation factor (VIF) for each training feature.

    Reads ``self.train_x`` and, for every column, calls statsmodels'
    ``variance_inflation_factor`` on the full feature matrix. A tqdm
    progress bar is shown while the factors are computed.

    Returns:
        pandas.DataFrame with the columns ``features`` (column label) and
        ``vif``, sorted so the highest VIF comes first. The table used to be
        built and then discarded, leaving the method returning ``None``.
    """
    # Imported locally so these packages are only needed when the
    # diagnostic is actually run
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from tqdm import tqdm

    # Materialise the matrix once instead of once per column.
    # NOTE(review): assumes every column is numeric at this point - confirm.
    design = self.train_x.values

    # Get the VIFs for each variable
    vifs = pd.DataFrame()
    vifs["features"] = self.train_x.columns
    vifs["vif"] = [
        variance_inflation_factor(design, i) for i in tqdm(range(design.shape[1]))
    ]

    # Features with the highest VIF first; hand the table back to the caller
    return vifs.sort_values("vif", ascending=False)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def plot_regression(df):
|
def plot_regression(df):
|
||||||
# Extract the "fit" and "actual" columns from the dataframe
|
# Extract the "fit" and "actual" columns from the dataframe
|
||||||
|
|
@ -284,8 +356,6 @@ class SapModel:
|
||||||
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
||||||
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
||||||
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
||||||
metrics['Mean True Value'] = y_true.mean()
|
|
||||||
metrics['Mean Predicted Value'] = y_pred.mean()
|
|
||||||
|
|
||||||
errors = pd.DataFrame()
|
errors = pd.DataFrame()
|
||||||
errors['Fit'] = y_true
|
errors['Fit'] = y_true
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue