mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Cleaning up modelling code
This commit is contained in:
parent
946e6d092e
commit
710f446ebc
1 changed files with 149 additions and 311 deletions
|
|
@ -7,9 +7,8 @@ from typing import Any, Dict, Tuple
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
from sklearn.linear_model import Lasso
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import xgboost as xgb
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.inspection import permutation_importance
|
||||
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
|
|
@ -91,6 +90,23 @@ class SapModel:
|
|||
"extension-count",
|
||||
]
|
||||
|
||||
# For the moment, we store records of the best performing models as a benchmark for future imporvements
|
||||
BEST_FIT = {
|
||||
'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
|
||||
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
|
||||
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
|
||||
}
|
||||
|
||||
BEST_PREDICT = {
|
||||
'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
|
||||
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
|
||||
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
|
||||
}
|
||||
|
||||
BUCKET_VARIABLES = [
|
||||
"number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
|
||||
]
|
||||
|
||||
def __init__(self, data, cleaner, test_size=0.2, random_state=None):
|
||||
self.df = pd.DataFrame(data)
|
||||
self.cleaner = cleaner
|
||||
|
|
@ -107,7 +123,10 @@ class SapModel:
|
|||
self.fit_error = None
|
||||
self.predict_error = None
|
||||
self.worst = {"fit_errors": pd.DataFrame(), "x": pd.DataFrame(), "prediction_errors": pd.DataFrame()}
|
||||
|
||||
self.fit_df = None
|
||||
self.predict_df = None
|
||||
self.diagnosis = {}
|
||||
|
||||
def run(self, plot=False):
|
||||
"""
|
||||
|
|
@ -163,83 +182,49 @@ class SapModel:
|
|||
model_data = model_data.drop(columns=["transaction-type"])
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
def bucket_and_fill(df, column_name, n_bins=10):
|
||||
"""
|
||||
Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
|
||||
:param df: Dataframe of features to be binned
|
||||
:param column_name: Name of the column to be binned
|
||||
:param n_bins: Number of bins to use
|
||||
:return: Dataframe with binned column
|
||||
"""
|
||||
# Check if the column is numerical
|
||||
if np.issubdtype(df[column_name].dtype, np.number):
|
||||
# Create a new categorical column from numerical one by binning the data
|
||||
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
|
||||
# Replace missing data with "NO_RECORD"
|
||||
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
|
||||
df[column_name + "_bucket"] = np.where(
|
||||
df[column_name + "_bucket"] == "nan",
|
||||
"NO_RECORD",
|
||||
df[column_name + "_bucket"]
|
||||
)
|
||||
return df
|
||||
|
||||
def _clean_numericals(self, model_data):
|
||||
|
||||
# Try binning numericals
|
||||
def bucket_and_fill(df, column_name, n_bins=10):
|
||||
# Check if the column is numerical
|
||||
if np.issubdtype(df[column_name].dtype, np.number):
|
||||
# Create a new categorical column from numerical one by binning the data
|
||||
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
|
||||
# Replace missing data with "NO_RECORD"
|
||||
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
|
||||
df[column_name + "_bucket"] = np.where(
|
||||
df[column_name + "_bucket"] == "nan",
|
||||
"NO_RECORD",
|
||||
df[column_name + "_bucket"]
|
||||
)
|
||||
return df
|
||||
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
|
||||
|
||||
bucket_variables = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count',
|
||||
'multi-glaze-proportion']
|
||||
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in bucket_variables]
|
||||
|
||||
for col in bucket_variables:
|
||||
for col in self.BUCKET_VARIABLES:
|
||||
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
||||
model_data = bucket_and_fill(model_data, col)
|
||||
model_data = self.bucket_and_fill(model_data, col)
|
||||
|
||||
# Replace the data with the binned version
|
||||
model_data = model_data.drop(columns=bucket_variables)
|
||||
model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
|
||||
model_data = model_data.rename(
|
||||
columns=dict(zip([c + "_bucket" for c in bucket_variables], bucket_variables))
|
||||
columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
|
||||
)
|
||||
|
||||
# Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
|
||||
for col in remaining_numericals:
|
||||
model_data[col] = np.where(
|
||||
model_data[col] == "", "0", model_data[col]
|
||||
).astype(float)
|
||||
|
||||
# Basic fill the rest of the columns with 0
|
||||
|
||||
# grouping_variables = ["constituency", "built-form", "property-type"]
|
||||
# secondary_grouping_variables = ["constituency", "property-type"]
|
||||
#
|
||||
# def append_missings(model_data, col, grouping_variables, secondary=False):
|
||||
# if not secondary:
|
||||
# numerical_data = model_data[(model_data[col] != "")].copy()
|
||||
# else:
|
||||
# numerical_data = model_data[~pd.isnull(model_data[col])].copy()
|
||||
#
|
||||
# numerical_data[col] = numerical_data[col].astype(float)
|
||||
# numerical_data = numerical_data.groupby(grouping_variables, observed=True)[col].mean().reset_index()
|
||||
# numerical_data = numerical_data.rename(columns={col: f"{col}_avg"})
|
||||
# model_data = model_data.merge(numerical_data, how="left", on=grouping_variables)
|
||||
#
|
||||
# if not secondary:
|
||||
# model_data[col] = np.where(model_data[col] == "", model_data[f"{col}_avg"], model_data[col])
|
||||
# else:
|
||||
# model_data[col] = np.where(pd.isnull(model_data[col]), model_data[f"{col}_avg"], model_data[col])
|
||||
#
|
||||
# model_data = model_data.drop(columns=[f"{col}_avg"])
|
||||
# model_data[col] = model_data[col].astype(float)
|
||||
#
|
||||
# return model_data
|
||||
#
|
||||
# for col in numerical_cols:
|
||||
# model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
||||
# model_data[col] = model_data[col].fillna(model_data[col].mean())
|
||||
# model_data = append_missings(model_data, col, grouping_variables)
|
||||
#
|
||||
# if pd.isnull(model_data[col]).sum():
|
||||
# model_data = append_missings(model_data, col, secondary_grouping_variables, True)
|
||||
#
|
||||
# if pd.isnull(model_data[col]).sum():
|
||||
# raise Exception("Cleaning failed, missing values remain")
|
||||
#
|
||||
# model_data[col] = np.where(
|
||||
# model_data[col] == "", "0", model_data[col]
|
||||
# ).astype(float)
|
||||
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -336,11 +321,12 @@ class SapModel:
|
|||
self.test_x = self.test_x[self.train_x.columns]
|
||||
|
||||
def fit_model(self):
|
||||
"""
|
||||
Main function to fit the model and produce accuracy metrics
|
||||
:return:
|
||||
"""
|
||||
|
||||
# Dummy out the categorical variables
|
||||
binned = ["number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion']
|
||||
|
||||
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + binned, drop_first=True)
|
||||
x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)
|
||||
|
||||
# Convert booleans to integer
|
||||
for col in x.columns:
|
||||
|
|
@ -361,36 +347,10 @@ class SapModel:
|
|||
train_x = sm.add_constant(self.train_x)
|
||||
test_x = sm.add_constant(self.test_x)
|
||||
train_idx = train_x["idx"].copy()
|
||||
test_ids = self.test_x["idx"].copy()
|
||||
test_idx = self.test_x["idx"].copy()
|
||||
train_x = train_x.drop(columns=["idx"])
|
||||
test_x = test_x.drop(columns=["idx"])
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.inspection import permutation_importance
|
||||
|
||||
rf = RandomForestRegressor(random_state=self.random_state)
|
||||
rf.fit(train_x, self.train_y)
|
||||
|
||||
# Print the name and importance of each feature
|
||||
importance_df = []
|
||||
for feature, importance in zip(train_x.columns, rf.feature_importances_):
|
||||
importance_df.append(
|
||||
{
|
||||
"Feature": feature,
|
||||
"rf_importance": importance
|
||||
}
|
||||
)
|
||||
importance_df = pd.DataFrame(importance_df)
|
||||
importance_df = importance_df.sort_values(by="rf_importance", ascending=False)
|
||||
|
||||
perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error')
|
||||
perm_importance_df = pd.DataFrame(
|
||||
{
|
||||
"Feature": test_x.columns,
|
||||
"perm_importance": perm_importance.importances_mean
|
||||
}
|
||||
).sort_values(by="perm_importance", ascending=False)
|
||||
|
||||
# make regression model
|
||||
model = sm.OLS(self.train_y, train_x)
|
||||
# fit model and print results
|
||||
|
|
@ -412,157 +372,116 @@ class SapModel:
|
|||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": self.results.fittedvalues,
|
||||
"actual": self.train_y,
|
||||
"idx": train_idx
|
||||
}
|
||||
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
|
||||
fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
|
||||
predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)
|
||||
|
||||
# temp hardcoded values
|
||||
best_fit = {'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
|
||||
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
|
||||
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197}
|
||||
|
||||
best_predict = {'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
|
||||
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
|
||||
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828}
|
||||
|
||||
def check_successes(experiment_error, best_error):
|
||||
|
||||
successes = []
|
||||
for k in experiment_error:
|
||||
if k in ["Explained Variance Score", "R2 Score"]:
|
||||
# We want to maximise this so we want experiment error to be higher
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] >= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
continue
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] <= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
|
||||
return pd.DataFrame(successes)
|
||||
|
||||
fit_success = check_successes(self.fit_error, best_fit)
|
||||
predict_success = check_successes(self.predict_error, best_predict)
|
||||
|
||||
print(self.results.summary())
|
||||
self.model_data['fit'] = self.results.fittedvalues
|
||||
|
||||
# The worst errors over index heavily for flats
|
||||
self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]
|
||||
|
||||
self.fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": self.results.fittedvalues,
|
||||
"actual": self.train_y
|
||||
"actual": self.train_y,
|
||||
"idx": train_idx
|
||||
}
|
||||
).sort_values("actual", ascending=True)
|
||||
|
||||
# TODO: Testing
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler to the training data and transform it
|
||||
train_x_scaled = scaler.fit_transform(train_x)
|
||||
|
||||
# Transform the test data
|
||||
test_x_scaled = scaler.transform(test_x)
|
||||
|
||||
# Define the model
|
||||
lasso_reg = Lasso(
|
||||
alpha=0.1) # you can change the alpha parameter to adjust the strength of the regularization.
|
||||
|
||||
# Fit the model
|
||||
lasso_reg.fit(train_x_scaled, self.train_y)
|
||||
|
||||
# Make predictions on the training set
|
||||
train_predictions = lasso_reg.predict(train_x_scaled)
|
||||
|
||||
# Make predictions on the test set
|
||||
test_predictions = lasso_reg.predict(test_x_scaled)
|
||||
|
||||
# Calculate metrics based on these predictions.
|
||||
lasso_fit_error, _ = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
)
|
||||
|
||||
# Predict on new data
|
||||
lasso_predict_error, _ = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
|
||||
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
|
||||
|
||||
# TODO: TESTING 2
|
||||
|
||||
# Create a StandardScaler instance
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler to the training data and transform it
|
||||
train_x_scaled = scaler.fit_transform(train_x)
|
||||
|
||||
# Transform the test data
|
||||
test_x_scaled = scaler.transform(test_x)
|
||||
|
||||
# Define the model
|
||||
alphas = np.logspace(-4, 2, 100) # Range of alpha values to search
|
||||
lasso_reg = LassoCV(cv=10, alphas=alphas)
|
||||
|
||||
# Fit the model
|
||||
lasso_reg.fit(train_x_scaled, self.train_y)
|
||||
|
||||
# Make predictions on the training set
|
||||
train_predictions = lasso_reg.predict(train_x_scaled)
|
||||
|
||||
# Make predictions on the test set
|
||||
test_predictions = lasso_reg.predict(test_x_scaled)
|
||||
|
||||
# Calculate metrics based on these predictions.
|
||||
lasso_fit_error, lasso_worst_fit_errors = self.calculate_regression_metrics(
|
||||
y_true=self.train_y, y_pred=train_predictions
|
||||
)
|
||||
|
||||
# Predict on new data
|
||||
lasso_predict_error, lasso_worst_predict_errors = self.calculate_regression_metrics(
|
||||
y_true=self.test_y, y_pred=test_predictions
|
||||
)
|
||||
|
||||
lasso_fit_success = check_successes(lasso_fit_error, best_fit)
|
||||
lasso_predict_success = check_successes(lasso_predict_error, best_predict)
|
||||
|
||||
fit_df = pd.DataFrame(
|
||||
self.predict_df = pd.DataFrame(
|
||||
{
|
||||
"fit": train_predictions,
|
||||
"actual": self.train_y,
|
||||
"residual": abs(self.train_y - train_predictions),
|
||||
"idx": train_idx.values
|
||||
"predictions": test_predictions,
|
||||
"actual": self.test_y,
|
||||
"idx": test_idx
|
||||
}
|
||||
)
|
||||
fit_df = fit_df.sort_values("residual", ascending=False)
|
||||
fit_df = fit_df.merge(self.model_data, on="idx")
|
||||
|
||||
zz = fit_df[fit_df["lighting-description"] == "Low energy lighting in all fixed outlets"]
|
||||
self.diagnosis = {
|
||||
"fit_success": fit_success,
|
||||
"predict_success": predict_success,
|
||||
"summary": self.results.summary()
|
||||
}
|
||||
|
||||
z = fit_df.head(100).groupby("lighting-description", observed=True)["residual"].agg(
|
||||
['mean', 'count']).reset_index()
|
||||
z = z.sort_values("mean", ascending=False)
|
||||
@staticmethod
|
||||
def check_successes(experiment_error, best_error):
|
||||
"""
|
||||
Simple function to check if the experiment error is better than the best error
|
||||
:param experiment_error: output of calculate_regression_metrics() on the experiment
|
||||
:param best_error: Current benchmark best error
|
||||
:return:
|
||||
"""
|
||||
|
||||
worst_x = self.model_data[self.model_data.index.isin(lasso_worst_fit_errors.index)]
|
||||
worst_x = worst_x.merge(lasso_worst_fit_errors, left_index=True, right_index=True)
|
||||
worst_x = worst_x.sort_values("Absolute Residual", ascending=False)
|
||||
successes = []
|
||||
for k in experiment_error:
|
||||
if k in ["Explained Variance Score", "R2 Score"]:
|
||||
# We want to maximise this so we want experiment error to be higher
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] >= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
continue
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] <= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
|
||||
return pd.DataFrame(successes)
|
||||
|
||||
def rf_importance(self, train_x, train_y, test_x, test_y):
|
||||
"""
|
||||
Utility function to estimate feature importance using a random forest
|
||||
This is useful to get a sense of some of the key features which are driving model
|
||||
performance
|
||||
|
||||
:param train_x: Training data covariates to build the importance model on
|
||||
:param train_y: Training data response to build the importance model on
|
||||
:param test_x: Test data covariates to build the permutation importance model on
|
||||
:param test_y: Test data response to build the permutation importance model on
|
||||
:return: Pandas dataframe of feature importances, ranked by most important to least
|
||||
"""
|
||||
|
||||
rf = RandomForestRegressor(random_state=self.random_state)
|
||||
rf.fit(train_x, train_y)
|
||||
|
||||
# Print the name and importance of each feature
|
||||
rf_importance_df = []
|
||||
for feature, importance in zip(train_x.columns, rf.feature_importances_):
|
||||
rf_importance_df.append(
|
||||
{
|
||||
"Feature": feature,
|
||||
"rf_importance": importance
|
||||
}
|
||||
)
|
||||
rf_importance_df = pd.DataFrame(rf_importance_df)
|
||||
rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False)
|
||||
|
||||
perm_importance = self.permuation_importance(rf, test_x, test_y)
|
||||
|
||||
return rf_importance_df, perm_importance
|
||||
|
||||
@staticmethod
|
||||
def permuation_importance(rf, test_x, test_y):
|
||||
"""
|
||||
Simple utility function to produce permutation importance for a given model\
|
||||
:param rf: Random forest model to calculate permutation importance for
|
||||
:param test_x: Test covariates to be used for permutation importance
|
||||
:param test_y: Test response to be used for permutation importance
|
||||
:return:
|
||||
"""
|
||||
perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')
|
||||
perm_importance_df = pd.DataFrame(
|
||||
{
|
||||
"Feature": test_x.columns,
|
||||
"perm_importance": perm_importance.importances_mean
|
||||
}
|
||||
).sort_values(by="perm_importance", ascending=False)
|
||||
|
||||
return perm_importance_df
|
||||
|
||||
def detect_multi_collinearity(self):
|
||||
# Get the VIFs for each variable
|
||||
|
|
@ -589,45 +508,6 @@ class SapModel:
|
|||
self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
|
||||
self.test_x = self.test_x[self.train_x.columns]
|
||||
|
||||
def test_multi_collinearity(self, test_variable):
|
||||
from statsmodels.regression.linear_model import OLS
|
||||
# drop target variable
|
||||
x_temp = self.train_x.drop(columns=[test_variable])
|
||||
|
||||
# define target variable
|
||||
y_temp = self.train_x[test_variable]
|
||||
|
||||
# add a constant to the predictors
|
||||
x_temp = sm.add_constant(x_temp)
|
||||
|
||||
# fit the model
|
||||
model_temp = OLS(y_temp, x_temp).fit()
|
||||
print(model_temp.summary())
|
||||
|
||||
smry = model_temp.summary()
|
||||
smry_coefs = pd.DataFrame(smry.tables[1].data[1:], columns=smry.tables[1].data[0])
|
||||
smry_coefs = smry_coefs.sort_values("P>|t|", ascending=True)
|
||||
|
||||
pd.set_option('display.max_rows', 500)
|
||||
pd.set_option('display.max_columns', 500)
|
||||
pd.set_option('display.width', 1000)
|
||||
|
||||
print(smry_coefs[smry_coefs["P>|t|"].astype(float) < 0.0001])
|
||||
|
||||
import seaborn as sns
|
||||
|
||||
# Select columns
|
||||
selected_columns = ["main-fuel_Gas: mains gas", "mainheat-description_Community scheme"]
|
||||
|
||||
# Subset dataframe
|
||||
subset_df = self.train_x[selected_columns]
|
||||
|
||||
# Plot pairplot
|
||||
sns.pairplot(subset_df)
|
||||
|
||||
crosstab1 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[1]])
|
||||
crosstab2 = pd.crosstab(self.train_x[selected_columns[0]], self.train_x[selected_columns[2]])
|
||||
|
||||
@staticmethod
|
||||
def plot_regression(df):
|
||||
# Extract the "fit" and "actual" columns from the dataframe
|
||||
|
|
@ -688,45 +568,3 @@ self = SapModel(
|
|||
data=all_data["data"],
|
||||
cleaner=all_data["cleaner"]
|
||||
)
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
|
||||
rf = RandomForestRegressor(random_state=self.random_state)
|
||||
X = self.df.drop(columns=self.RESPONSE)
|
||||
X = X.drop(
|
||||
columns=["address", "uprn-source", "heating-cost-potential", "hot-water-cost-potential", "potential-energy-rating",
|
||||
"environment-impact-potential", "heating-cost-current", "address3", "local-authority-label",
|
||||
"hot-water-cost-current", "county", "postcode", "constituency", "co2-emissions-potential", "address2",
|
||||
"posttown", "lighting-cost-potential", "lodgement-datetime", "current-energy-rating", "uprn",
|
||||
"energy-consumption-current", "lighting-cost-current", "lodgement-date", "lmk-key",
|
||||
"potential-energy-efficiency",
|
||||
"energy-consumption-potential", "inspection-date", "co2-emiss-curr-per-floor-area", "address1",
|
||||
"constituency-label",
|
||||
"building-reference-number", "environment-impact-current", "co2-emissions-current",
|
||||
])
|
||||
# Note: this probably not the best way to treat low-energy-fixed-light-count, unheated-corridor-length,
|
||||
# fixed-lighting-outlets-count
|
||||
for col in [
|
||||
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", "floor-height",
|
||||
"low-energy-fixed-light-count", "unheated-corridor-length", "fixed-lighting-outlets-count", "total-floor-area"
|
||||
]:
|
||||
X[col] = np.where(
|
||||
X[col] == "", "0", X[col]
|
||||
).astype(float)
|
||||
|
||||
Y = self.df[self.RESPONSE]
|
||||
X = pd.get_dummies(X)
|
||||
rf.fit(X, Y)
|
||||
|
||||
# Print the name and importance of each feature
|
||||
importance_df = []
|
||||
for feature, importance in zip(X.columns, rf.feature_importances_):
|
||||
importance_df.append(
|
||||
{
|
||||
"Feature": feature,
|
||||
"rf_importance": importance
|
||||
}
|
||||
)
|
||||
|
||||
importance_df = pd.DataFrame(importance_df)
|
||||
importance_df = importance_df.sort_values("rf_importance", ascending=False)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue