mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
categorical cleaning in progress
This commit is contained in:
parent
d04be241bd
commit
c698f49d58
1 changed files with 195 additions and 5 deletions
|
|
@ -160,6 +160,35 @@ class SapModel:
|
|||
# Append on u-values
|
||||
model_data = self._append_cleaned_data(model_data)
|
||||
|
||||
def clean_missings(model_data):
    """Replace empty-string entries in selected columns with explicit labels.

    For each column listed below, cells equal to ``""`` are replaced by the
    column's fill value; every other cell keeps its existing value.

    Args:
        model_data: DataFrame containing the columns
            ``mechanical-ventilation``, ``solar-water-heating-flag``,
            ``glazed-type`` and ``glazed-area``.

    Returns:
        The same ``model_data`` object, with the columns overwritten in place.

    Raises:
        KeyError: if any of the listed columns is missing.
    """
    # Column -> label written wherever the cell is the empty string.
    fill_values = {
        "mechanical-ventilation": "NO DATA!",
        "solar-water-heating-flag": "N",
        "glazed-type": "NO DATA!",
        "glazed-area": "NO DATA!",
    }

    # TODO(review): "energy-tariff" is deliberately not cleaned yet. The
    # earlier draft filled it with "Unknown" but took its fallback values
    # from "mechanical-ventilation"; settle the rule before adding it here.

    for column, fill_value in fill_values.items():
        # Each column falls back to its OWN values. The previous version
        # filled "glazed-area" from "glazed-type", which overwrote every
        # non-empty glazed-area entry with the glazed-type value.
        model_data[column] = np.where(
            model_data[column] == "", fill_value, model_data[column]
        )

    return model_data
|
||||
|
||||
model_data = clean_missings(model_data)
|
||||
|
||||
# Convert transaction_type
|
||||
model_data = self._convert_transaction_type(model_data)
|
||||
|
||||
|
|
@ -181,7 +210,7 @@ class SapModel:
|
|||
|
||||
features = [
|
||||
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
|
||||
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE
|
||||
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx"
|
||||
] if x not in exclude_features
|
||||
]
|
||||
|
||||
|
|
@ -235,8 +264,15 @@ class SapModel:
|
|||
|
||||
self.remove_zero_std_cols()
|
||||
|
||||
# self.detect_multi_collinearity()
|
||||
|
||||
# Add a constant to the independent value
|
||||
train_x = sm.add_constant(self.train_x)
|
||||
test_x = sm.add_constant(self.test_x)
|
||||
train_idx = train_x["idx"].copy()
|
||||
test_ids = self.test_x["idx"].copy()
|
||||
train_x = train_x.drop(columns=["idx"])
|
||||
test_x = test_x.drop(columns=["idx"])
|
||||
|
||||
# make regression model
|
||||
model = sm.OLS(self.train_y, train_x)
|
||||
|
|
@ -249,7 +285,7 @@ class SapModel:
|
|||
)
|
||||
|
||||
# Predict on new data
|
||||
predictions = self.results.predict(sm.add_constant(self.test_x))
|
||||
predictions = self.results.predict(test_x)
|
||||
self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=predictions
|
||||
)
|
||||
|
|
@ -267,7 +303,7 @@ class SapModel:
|
|||
|
||||
successes = []
|
||||
for k in experiment_error:
|
||||
if k == "Explained Variance Score":
|
||||
if k in ["Explained Variance Score", "R2 Score"]:
|
||||
# We want to maximise this so we want experiment error to be higher
|
||||
successes.append(
|
||||
{
|
||||
|
|
@ -287,8 +323,8 @@ class SapModel:
|
|||
|
||||
return pd.DataFrame(successes)
|
||||
|
||||
check_successes(self.fit_error, best_fit)
|
||||
check_successes(self.predict_error, best_predict)
|
||||
fit_success = check_successes(self.fit_error, best_fit)
|
||||
predict_success = check_successes(self.predict_error, best_predict)
|
||||
|
||||
self.model_data['fit'] = self.results.fittedvalues
|
||||
# The worst errors over index heavily for flats
|
||||
|
|
@ -301,6 +337,105 @@ class SapModel:
|
|||
}
|
||||
).sort_values("actual", ascending=True)
|
||||
|
||||
# TODO: Testing
|
||||
from sklearn.linear_model import Lasso
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler to the training data and transform it
|
||||
train_x_scaled = scaler.fit_transform(train_x)
|
||||
|
||||
# Transform the test data
|
||||
test_x_scaled = scaler.transform(test_x)
|
||||
|
||||
# Define the model
|
||||
lasso_reg = Lasso(
|
||||
alpha=0.1) # you can change the alpha parameter to adjust the strength of the regularization.
|
||||
|
||||
# Fit the model
|
||||
lasso_reg.fit(train_x_scaled, self.train_y)
|
||||
|
||||
# Make predictions on the training set
|
||||
train_predictions = lasso_reg.predict(train_x_scaled)
|
||||
|
||||
# Make predictions on the test set
|
||||
test_predictions = lasso_reg.predict(test_x_scaled)
|
||||
|
||||
# Calculate metrics based on these predictions.
|
||||
lasso_fit_error, _ = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
)
|
||||
|
||||
# Predict on new data
|
||||
lasso_predict_error, _ = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
|
||||
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
|
||||
|
||||
# TODO: TESTING 2
|
||||
from sklearn.linear_model import LassoCV
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler to the training data and transform it
|
||||
train_x_scaled = scaler.fit_transform(train_x)
|
||||
|
||||
# Transform the test data
|
||||
test_x_scaled = scaler.transform(test_x)
|
||||
|
||||
# Define the model
|
||||
alphas = np.logspace(-4, 2, 100) # Range of alpha values to search
|
||||
lasso_reg = LassoCV(cv=10, alphas=alphas)
|
||||
|
||||
# Fit the model
|
||||
lasso_reg.fit(train_x_scaled, self.train_y)
|
||||
|
||||
# Make predictions on the training set
|
||||
train_predictions = lasso_reg.predict(train_x_scaled)
|
||||
|
||||
# Make predictions on the test set
|
||||
test_predictions = lasso_reg.predict(test_x_scaled)
|
||||
|
||||
# Calculate metrics based on these predictions.
|
||||
lasso_fit_error, lasso_worst_fit_errors = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
)
|
||||
|
||||
# Predict on new data
|
||||
lasso_predict_error, lasso_worst_predict_errors = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
|
||||
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
|
||||
|
||||
fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": train_predictions,
|
||||
"actual": self.train_y,
|
||||
"residual": abs(self.train_y - train_predictions),
|
||||
"idx": train_idx.values
|
||||
}
|
||||
)
|
||||
fit_df = fit_df.sort_values("residual", ascending=False)
|
||||
fit_df = fit_df.merge(self.model_data, on="idx")
|
||||
|
||||
zz = fit_df[fit_df["lighting-description"] == "Low energy lighting in all fixed outlets"]
|
||||
|
||||
z = fit_df.head(100).groupby("lighting-description", observed=True)["residual"].agg(
|
||||
['mean', 'count']).reset_index()
|
||||
z = z.sort_values("mean", ascending=False)
|
||||
|
||||
worst_x = self.model_data[self.model_data.index.isin(lasso_worst_fit_errors.index)]
|
||||
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
|
||||
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
|
||||
|
||||
def detect_multi_collinearity(self):
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
|
|
@ -312,6 +447,61 @@ class SapModel:
|
|||
# Get the features with the highest VIF
|
||||
vifs = vifs.sort_values("vif", ascending=False)
|
||||
|
||||
# There are some features, we do not want to remove
|
||||
required_features = [
|
||||
"walls_u_value", "floor_u_value", "roof_u_value"
|
||||
]
|
||||
|
||||
vifs = vifs[~vifs["features"].isin(required_features)]
|
||||
drop_vifs = vifs[vifs["vif"] > 100]
|
||||
|
||||
# Acceptable drop variables:
|
||||
# main-fuel_Gas: mains gas
|
||||
# glazed-type_NO DATA!
|
||||
# glazed-area_NO DATA!
|
||||
|
||||
self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
|
||||
self.test_x = self.test_x[self.train_x.columns]
|
||||
|
||||
def test_multi_collinearity(self, test_variable):
    """Regress one training feature on all the others and print diagnostics.

    Fits an OLS model with ``test_variable`` as the response and the
    remaining columns of ``self.train_x`` (plus a constant) as predictors,
    prints the model summary, then prints the coefficient rows whose
    reported p-value is below 0.0001. Finally draws a pair plot and prints a
    cross-tabulation for a fixed pair of dummy columns.

    Relies on ``sm`` and ``pd`` being available at module level (their
    imports are outside the visible part of the file).

    Args:
        test_variable: name of the column in ``self.train_x`` to use as the
            regression target.

    Returns:
        None. Output is printed/plotted only.

    Raises:
        KeyError: if ``test_variable`` or either of the hard-coded
            ``selected_columns`` is not a column of ``self.train_x``.
    """
    from statsmodels.regression.linear_model import OLS

    # Predictors are every training column except the one under test;
    # the column under test becomes the response.
    x_temp = self.train_x.drop(columns=[test_variable])
    y_temp = self.train_x[test_variable]

    # add a constant to the predictors
    x_temp = sm.add_constant(x_temp)

    # fit the model and build the summary once, reusing it below
    model_temp = OLS(y_temp, x_temp).fit()
    smry = model_temp.summary()
    print(smry)

    # tables[1] is the coefficient table: first row is the header,
    # remaining rows are one per predictor.
    coef_table = smry.tables[1].data
    smry_coefs = pd.DataFrame(coef_table[1:], columns=coef_table[0])
    smry_coefs = smry_coefs.sort_values("P>|t|", ascending=True)

    # NOTE: these change pandas display options globally for the process.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    print(smry_coefs[smry_coefs["P>|t|"].astype(float) < 0.0001])

    import seaborn as sns

    # Select columns
    selected_columns = ["main-fuel_Gas: mains gas", "mainheat-description_Community scheme"]

    # Subset dataframe
    subset_df = self.train_x[selected_columns]

    # Plot pairplot
    sns.pairplot(subset_df)

    # Only two columns are selected, so there is exactly one pair to
    # cross-tabulate. The previous second crosstab indexed
    # selected_columns[2] and always raised IndexError.
    crosstab = pd.crosstab(
        self.train_x[selected_columns[0]], self.train_x[selected_columns[1]]
    )
    print(crosstab)
|
||||
|
||||
@staticmethod
|
||||
def plot_regression(df):
|
||||
# Extract the "fit" and "actual" columns from the dataframe
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue