testing rf importance and permutation importance

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-04 16:28:58 +01:00
parent 58edd9a255
commit c59aff412c

View file

@ -156,6 +156,32 @@ class SapModel:
return model_data
@staticmethod
def clean_missings(model_data):
    """Replace empty-string entries in selected columns with explicit placeholders.

    The columns are rewritten on ``model_data`` itself, and the same object is
    returned so the call can be used in an assignment chain.

    :param model_data: DataFrame containing the columns "mechanical-ventilation",
        "solar-water-heating-flag", "glazed-type" and "glazed-area".
    :return: ``model_data`` with "" replaced by the per-column placeholder.
    :raises KeyError: if one of the cleaned columns is absent.
    """
    # NOTE: energy-tariff and construction-age-band are deliberately NOT cleaned.
    # Cleaning them hurt prediction performance, indicating there is potentially a
    # notable difference between a "" missing and a "NO DATA!" missing that is
    # worth keeping distinct for those columns.
    placeholders = {
        "mechanical-ventilation": "NO DATA!",
        "solar-water-heating-flag": "N",
        "glazed-type": "NO DATA!",
        "glazed-area": "NO DATA!",
    }
    for column, placeholder in placeholders.items():
        # Each column falls back to its OWN values; previously "glazed-area" was
        # overwritten with the values of "glazed-type" wherever it was non-empty.
        model_data[column] = np.where(
            model_data[column] == "", placeholder, model_data[column]
        )
    return model_data
def create_dataset(self):
model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
model_data = model_data.reset_index(drop=True)
@ -164,38 +190,7 @@ class SapModel:
# Append on u-values
model_data = self._append_cleaned_data(model_data)
def clean_missings(model_data):
    """Replace empty-string entries in selected columns with explicit placeholders.

    The columns are rewritten on ``model_data`` itself and the same object is
    returned.

    :param model_data: DataFrame containing the columns "mechanical-ventilation",
        "solar-water-heating-flag", "glazed-type" and "glazed-area".
    :return: ``model_data`` with "" replaced by the per-column placeholder.
    :raises KeyError: if one of the cleaned columns is absent.
    """
    # NOTE(review): "energy-tariff" and "construction-age-band" are left untouched;
    # their replacements were disabled pending review and must not be re-enabled
    # without checking the effect on model performance.
    placeholders = {
        "mechanical-ventilation": "NO DATA!",
        "solar-water-heating-flag": "N",
        "glazed-type": "NO DATA!",
        "glazed-area": "NO DATA!",
    }
    for column, placeholder in placeholders.items():
        # Each column falls back to its OWN values; previously "glazed-area" was
        # overwritten with the values of "glazed-type" wherever it was non-empty.
        model_data[column] = np.where(
            model_data[column] == "", placeholder, model_data[column]
        )
    return model_data
model_data = clean_missings(model_data)
model_data = self.clean_missings(model_data)
# Convert transaction_type
model_data = self._convert_transaction_type(model_data)
@ -292,6 +287,32 @@ class SapModel:
train_x = train_x.drop(columns=to_drop)
test_x = test_x[train_x.columns]
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
rf = RandomForestRegressor(random_state=self.random_state)
rf.fit(train_x, self.train_y)
# Print the name and importance of each feature
importance_df = []
for feature, importance in zip(train_x.columns, rf.feature_importances_):
importance_df.append(
{
"Feature": feature,
"rf_importance": importance
}
)
importance_df = pd.DataFrame(importance_df)
importance_df = importance_df.sort_values(by="rf_importance", ascending=False)
perm_importance = permutation_importance(rf, test_x, self.test_y, scoring='neg_mean_squared_error')
perm_importance_df = pd.DataFrame(
{
"Feature": test_x.columns,
"perm_importance": perm_importance.importances_mean
}
).sort_values(by="perm_importance", ascending=False)
# make regression model
model = sm.OLS(self.train_y, train_x)
# fit model and print results