add feature importance in model analysis script

This commit is contained in:
Michael Duong 2023-09-21 21:16:48 +00:00
parent 8cfa9a6eb1
commit 6513e4feb9
14 changed files with 446 additions and 48 deletions

View file

@ -1,4 +1,5 @@
.dev_env/
.dev_env_pipeline/
__pycache__/
.DS_Store
.vscode/

View file

@ -68,13 +68,13 @@ def build_model(
data=train_data, target=target, model_hyperparameters=model_hyperparameters
)
logger.info("------------------------------")
logger.info("--- Generating predictions ---")
logger.info("------------------------------")
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
prediction_data = train_data.drop(columns=target)
predictions = model.predict(data=prediction_data)
fit_predictions = model.predict(data=prediction_data)
logger.info("------------------------------")
logger.info("--- Generating fit metrics ---")
@ -82,7 +82,7 @@ def build_model(
metrics_output = metrics.generate_metrics(
target=train_data[target],
predictions=pd.Series(predictions),
predictions=pd.Series(fit_predictions),
)
logger.info("--------------------")

View file

@ -1,5 +1,5 @@
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/model.joblib
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression: null
@ -12,5 +12,5 @@ AutogluonAutoML:
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 400
presets: high_quality
presets: good_quality
excluded_model_types: ['KNN']

View file

@ -2,7 +2,59 @@ feature_processor_type: dataframe
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: RDSAP_CHANGE
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"]
retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
# retain_features: null
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
# 'NUMBER_HEATED_ROOMS',
# 'FIXED_LIGHTING_OUTLETS_COUNT',
# 'CONSTRUCTION_AGE_BAND',
# 'TRANSACTION_TYPE_STARTING',
# 'LIGHTING_DESCRIPTION_STARTING',
# 'MAINHEAT_DESCRIPTION_STARTING',
# 'HOTWATER_DESCRIPTION_STARTING',
# 'MAIN_FUEL_STARTING',
# 'MECHANICAL_VENTILATION_STARTING',
# 'SECONDHEAT_DESCRIPTION_STARTING',
# 'ENERGY_TARIFF_STARTING',
# 'SOLAR_WATER_HEATING_FLAG_STARTING',
# 'PHOTO_SUPPLY_STARTING',
# 'WINDOWS_DESCRIPTION_STARTING',
# 'GLAZED_TYPE_STARTING',
# 'MULTI_GLAZE_PROPORTION_STARTING',
# 'LOW_ENERGY_LIGHTING_STARTING',
# 'NUMBER_OPEN_FIREPLACES_STARTING',
# 'MAINHEATCONT_DESCRIPTION_STARTING',
# 'EXTENSION_COUNT_STARTING',
# 'TOTAL_FLOOR_AREA_STARTING',
# 'FLOOR_HEIGHT_STARTING',
# 'DAYS_TO_STARTING',
# 'WALLS_DESCRIPTION_STARTING',
# 'FLOOR_DESCRIPTION_STARTING']
retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
'NUMBER_HEATED_ROOMS',
'FIXED_LIGHTING_OUTLETS_COUNT',
'CONSTRUCTION_AGE_BAND',
'TRANSACTION_TYPE_ENDING',
'LIGHTING_DESCRIPTION_ENDING',
'MAINHEAT_DESCRIPTION_ENDING',
'HOTWATER_DESCRIPTION_ENDING',
'MAIN_FUEL_ENDING',
'MECHANICAL_VENTILATION_ENDING',
'SECONDHEAT_DESCRIPTION_ENDING',
'ENERGY_TARIFF_ENDING',
'SOLAR_WATER_HEATING_FLAG_ENDING',
'PHOTO_SUPPLY_ENDING',
'WINDOWS_DESCRIPTION_ENDING',
'GLAZED_TYPE_ENDING',
'MULTI_GLAZE_PROPORTION_ENDING',
'LOW_ENERGY_LIGHTING_ENDING',
'NUMBER_OPEN_FIREPLACES_ENDING',
'MAINHEATCONT_DESCRIPTION_ENDING',
'EXTENSION_COUNT_ENDING',
'TOTAL_FLOOR_AREA_ENDING',
'FLOOR_HEIGHT_ENDING',
'DAYS_TO_ENDING',
'WALLS_DESCRIPTION_ENDING',
'FLOOR_DESCRIPTION_ENDING']

View file

@ -10,4 +10,10 @@ business_logic = {}
"""
New features dict + function
"""
new_feature_funcs = {}
def SAP_ENDING(df):
    """
    Build the SAP_ENDING feature.

    Returns the element-wise sum of the "SAP_STARTING" and "RDSAP_CHANGE"
    columns of ``df``.
    """
    starting_sap = df["SAP_STARTING"]
    sap_change = df["RDSAP_CHANGE"]
    return starting_sap + sap_change
new_feature_funcs = {"SAP_ENDING": SAP_ENDING}

View file

@ -0,0 +1,8 @@
dataclient_type: local
feature_importance_filepath: ./analysis/feature_importance.parquet
permutation_subsample_amount: 1000
loss_fns: "mean_absolute_percentage_error"
feature_importance_column: importance
n_repeats: 5
figwidth: 7
figheight: 6

View file

@ -1,6 +1,5 @@
input_dataclient_type: aws-s3
output_dataclient_type: local
datahandler_type: parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet

View file

@ -134,6 +134,8 @@ class DataFrameFeatureProcessor:
subsample_amount=feature_processor_config["subsample_amount"],
subsample_seed=feature_processor_config["subsample_seed"],
)
df = self.apply_business_logic(df, business_logic=business_logic)
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
df = self.drop_unused_columns(
df, drop_columns=feature_processor_config["drop_columns"]
)
@ -142,6 +144,4 @@ class DataFrameFeatureProcessor:
retain_features=feature_processor_config["retain_features"],
target=feature_processor_config["target"],
)
df = self.apply_business_logic(df, business_logic=business_logic)
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
return df

View file

@ -5,8 +5,8 @@ stages:
deps:
- path: prepare_data.py
hash: md5
md5: 7531a931a405650dc4e8b5d8c1fd3c66
size: 4959
md5: 934d774e67f38e440b621ce71152f5f6
size: 5031
params:
configs/prepare_data.yaml:
output_test_filepath: ./data/prepared_data/test.parquet
@ -15,20 +15,20 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
size: 13429347
nfiles: 2
build_model:
cmd: python build_model.py
deps:
- path: build_model.py
hash: md5
md5: c07ce0b8fdaf337ddfb7115684932157
size: 5048
md5: f9fa2a66d908b42ae196ce6f0f782258
size: 5134
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
size: 13429347
nfiles: 2
params:
configs/build_model.yaml:
@ -37,42 +37,42 @@ stages:
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 400
presets: high_quality
presets: good_quality
excluded_model_types:
- KNN
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
fit_metrics_filepath: ./metrics/fit_metrics.json
model_save_filepath: ./data/model/model.joblib
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/autogluonmodel/
model_type: AutogluonAutoML
outs:
- path: data/model/
hash: md5
md5: 2ace0835c28543512982b69d383b3c49.dir
size: 1832
nfiles: 1
md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
size: 118580145
nfiles: 71
- path: metrics/fit_metrics.json
hash: md5
md5: c8c5a40863e2ced7f5f5a844ba203d80
size: 180
md5: d4afc981e1e0783b79b02b0ba54638c4
size: 185
generate_predictions:
cmd: python generate_predictions.py
deps:
- path: data/model
hash: md5
md5: 2ace0835c28543512982b69d383b3c49.dir
size: 1832
nfiles: 1
md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
size: 118580145
nfiles: 71
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
size: 13429347
nfiles: 2
- path: generate_predictions.py
hash: md5
md5: ab603e9a526a73f2fe17603e6fe6c0a4
size: 4261
md5: a25c4611ff467cdc1c921918112a30fe
size: 4311
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
@ -83,26 +83,26 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
size: 643838
md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
size: 537020
nfiles: 1
generate_metrics:
cmd: python generate_metrics.py
deps:
- path: data/predictions
hash: md5
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
size: 643838
md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
size: 537020
nfiles: 1
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
size: 13429347
nfiles: 2
- path: generate_metrics.py
hash: md5
md5: 78a9b9b25d0a7deaf44277f9afad5f98
size: 4139
md5: 8ce0b6b55e1688fca816985e0cf37f28
size: 4220
params:
configs/generate_metrics.yaml:
dataclient_type: local
@ -113,8 +113,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: f494881710a057f90f82c0bd3a40a41d
size: 183
md5: f75356e08ceabb102d5b23508e140f0a
size: 182
startup_cleanup:
cmd: python startup_cleanup.py
deps:

View file

@ -0,0 +1,177 @@
"""
Exploratory data analysis (EDA) on the prepared training dataset
"""
# Look at response variable
# NOTE: this section is written notebook-style - bare expressions such as `fig`
# or `train_df.dtypes` only display something in an interactive session.
from matplotlib import pyplot as plt
import pandas as pd
train_df = pd.read_parquet("./data/prepared_data/train.parquet")
target = "SAP_ENDING"
# Only the first 10,000 rows are used for the exploration below
train_df = train_df.head(10000)
# train_df[target].plot(kind='hist')
# Plot the target variable: histogram with bin edges at every integer from the
# minimum up to (but excluding) the maximum target value
# NOTE(review): range() requires integers, so this assumes the target column is
# integer-typed - confirm
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(train_df[target], bins=range(min(train_df[target]), max(train_df[target])))
fig
# Inspect the column dtypes, then compute the pairwise correlation matrix
train_df.dtypes
# All numerical (author's observation at the time)
# NOTE(review): re-check this if categorical features are retained in the
# prepared data - confirm how corr() treats them in the pinned pandas version
train_df_corr = train_df.corr()
train_df_corr.style.background_gradient(cmap="coolwarm")
train_df_corr["EXTENSION_COUNT_ENDING"]
# Check out some correlation plots between variables
# sap starting - negative correlation
train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
# heat demand - light positive correlation
# NOTE(review): the axes are swapped relative to the plot above (the target is
# on the x axis here) - confirm this is intended
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
    x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")


def _load_yaml(path):
    """Parse the YAML file at ``path``, closing the file handle afterwards."""
    # A context manager is used so the handle is closed deterministically,
    # instead of relying on garbage collection as a bare open() call would.
    with open(path) as config_file:
        return yaml.safe_load(config_file)


# All configs are read from the "configs" directory next to this script.
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = _load_yaml(client_path)
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = _load_yaml(prepare_data_path)
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = _load_yaml(build_model_path)
generate_predictions_path = (
    Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = _load_yaml(generate_predictions_path)
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = _load_yaml(feature_process_path)

# Build the model wrapper named in build_model.yaml and load the saved model.
model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]
model.load_model(model_filepath)

# Feature importance computed on the training sample loaded earlier in this script.
# NOTE(review): `model.model` is presumably the underlying AutoGluon predictor
# (feature_importance is not part of the wrapper interface seen here) - confirm.
fi = model.model.feature_importance(train_df.reset_index(drop=True))
# Join the saved predictions onto the test set.
# NOTE(review): the assignment aligns on the index, so this assumes the
# predictions file and the test file share the same index - confirm.
pred = pd.read_parquet("./data/predictions/predictions.parquet")
test_df = pd.read_parquet("./data/prepared_data/test.parquet")
# test_df = test_df.head(1000)
test_df["predictions"] = pred["predictions"]

# Mean absolute error per property type
test_df.groupby("PROPERTY_TYPE").apply(
    lambda x: (x.SAP_ENDING - x.predictions).abs().mean()
)
test_df.head()

# Take an explicit copy of the "Flat" rows: adding a column to a filtered
# slice triggers pandas' SettingWithCopyWarning and may not behave as expected.
flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"].copy()
flat_df["residual"] = abs(flat_df["predictions"] - flat_df[target])

generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
# Context manager so the config file handle is closed after parsing.
with open(generate_metrics_path) as generate_metrics_file:
    generate_metrics_params = yaml.safe_load(generate_metrics_file)
from core.MLMetrics import metrics_factory

# Run the configured metrics on the "Flat" subset only.
metrics = metrics_factory(generate_metrics_params["metrics_type"])
metrics_output = metrics.generate_metrics(
    target=flat_df[target],
    predictions=pd.Series(flat_df["predictions"]),
)
# Use alibi to run permutation importance
from alibi.explainers import PermutationImportance, plot_permutation_importance
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import pandas as pd
# Reload the test set and keep only the first 1,000 rows for the permutation run
test_df = pd.read_parquet("./data/prepared_data/test.parquet")
test_df = test_df.head(1000)
target = "SAP_ENDING"
# Every column except the target is treated as a feature
feature_names = test_df.columns.to_list()
feature_names.remove(target)
# The explainer is given plain NumPy arrays
x = test_df[feature_names].to_numpy()
y = test_df[target].to_numpy()
def predict_fn(X: np.ndarray) -> np.ndarray:
    # Re-wrap the array in a DataFrame with the original column names before
    # calling the model loaded earlier in this script.
    # NOTE(review): the NumPy round trip drops the per-column dtypes; presumably
    # the model re-casts its inputs - confirm.
    return model.predict(pd.DataFrame(X, columns=feature_names))
# Permutation feature importance scored with sklearn's mean_absolute_percentage_error
pfi = PermutationImportance(
    predictor=predict_fn,
    loss_fns=mean_absolute_percentage_error,
    feature_names=feature_names,
    verbose=True,
)
exp = pfi.explain(x, y)
plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# Scratch list of the *_STARTING feature names. It is a bare expression that is
# not assigned to anything, so it has no effect when the file runs as a script.
[
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",
    "TRANSACTION_TYPE_STARTING",
    "LIGHTING_DESCRIPTION_STARTING",
    "MAINHEAT_DESCRIPTION_STARTING",
    "HOTWATER_DESCRIPTION_STARTING",
    "MAIN_FUEL_STARTING",
    "MECHANICAL_VENTILATION_STARTING",
    "SECONDHEAT_DESCRIPTION_STARTING",
    "ENERGY_TARIFF_STARTING",
    "SOLAR_WATER_HEATING_FLAG_STARTING",
    "PHOTO_SUPPLY_STARTING",
    "WINDOWS_DESCRIPTION_STARTING",
    "GLAZED_TYPE_STARTING",
    "MULTI_GLAZE_PROPORTION_STARTING",
    "LOW_ENERGY_LIGHTING_STARTING",
    "NUMBER_OPEN_FIREPLACES_STARTING",
    "MAINHEATCONT_DESCRIPTION_STARTING",
    "EXTENSION_COUNT_STARTING",
    "TOTAL_FLOOR_AREA_STARTING",
    "FLOOR_HEIGHT_STARTING",
    "DAYS_TO_STARTING",
    "WALLS_DESCRIPTION_STARTING",
    "FLOOR_DESCRIPTION_STARTING",
]
# TODO: use the shap package to explain why row 9158 gets a prediction of 35
# when its SAP ending is 96
#
#

View file

@ -0,0 +1,150 @@
"""
Post Model generation step:
We want to look at feature analysis of the model
"""
import yaml
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
from alibi.explainers import PermutationImportance, plot_permutation_importance
import numpy as np
import pandas as pd
def _load_yaml(path):
    """Parse the YAML file at ``path``, closing the file handle afterwards."""
    # A context manager is used so the handle is closed deterministically,
    # instead of relying on garbage collection as a bare open() call would.
    with open(path) as config_file:
        return yaml.safe_load(config_file)


# All configs are read from the "configs" directory next to this script.
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = _load_yaml(client_path)
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = _load_yaml(prepare_data_path)
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = _load_yaml(feature_process_path)
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = _load_yaml(build_model_path)
model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
model_analysis_params = _load_yaml(model_analysis_path)
generate_predictions_path = (
    Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = _load_yaml(generate_predictions_path)

# Build the model wrapper named in build_model.yaml and load the saved model.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# The data client used for saving outputs; its settings come from client.yaml,
# keyed by the client type chosen in model_analysis.yaml.
dataclient_type = model_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

# Settings specific to the analysis step (model_analysis.yaml).
feature_importance_filepath = model_analysis_params["feature_importance_filepath"]
permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"]
loss_fns = model_analysis_params["loss_fns"]
feature_importance_column = model_analysis_params["feature_importance_column"]
n_repeats = model_analysis_params["n_repeats"]
figwidth = model_analysis_params["figwidth"]
figheight = model_analysis_params["figheight"]

# Values shared with earlier pipeline steps.
target = feature_process_params["feature_processor_config"]["target"]
output_test_filepath = prepare_data_params["output_test_filepath"]
def model_analysis(
    model: MLModel,
    dataclient: DataClient,
    target: str,
    output_test_filepath: str,
    feature_importance_filepath: str,
    permutation_subsample_amount: int = 100,
    loss_fns: str = "mean_absolute_percentage_error",
    feature_importance_column: str = "importance",
    n_repeats: int = 5,
    figwidth: int = 7,
    figheight: int = 6,
) -> pd.DataFrame:
    """
    Compute permutation feature importance for a model and save the result.

    The test set is read from ``output_test_filepath`` and truncated to its
    first ``permutation_subsample_amount`` rows. Every column except ``target``
    is treated as a feature. The mean importance per feature is stored in a
    single-column table, sorted in descending order, and written through
    ``dataclient``. A permutation importance plot is also drawn.

    Args:
        model: Model whose ``predict`` method is explained.
        dataclient: Client used to save the feature importance table.
        target: Name of the target column in the test set.
        output_test_filepath: Parquet file holding the test set.
        feature_importance_filepath: Location the importance table is saved to.
        permutation_subsample_amount: Number of leading test rows to use.
        loss_fns: Loss function name passed to alibi's PermutationImportance.
        feature_importance_column: Column name for the importance values.
        n_repeats: Number of shuffles per feature.
        figwidth: Width of the importance plot.
        figheight: Height of the importance plot.

    Returns:
        The feature importance table that was saved.

    Raises:
        ValueError: If ``target`` is not a column of the test set.
    """
    logger.info("------------------------------------")
    logger.info("--- Generate Feature Importance ---")
    logger.info("------------------------------------")

    test_df = pd.read_parquet(output_test_filepath)
    if target not in test_df.columns:
        # Same exception type list.remove() would raise, with a usable message.
        raise ValueError(
            f"Target column '{target}' not found in {output_test_filepath}"
        )
    test_df = test_df.head(permutation_subsample_amount)

    feature_names = [column for column in test_df.columns if column != target]
    # Remember the per-column dtypes: they are lost once the frame is turned
    # into a single NumPy array for the explainer.
    feature_dtypes = test_df[feature_names].dtypes.to_dict()

    x = test_df[feature_names].to_numpy()
    y = test_df[target].to_numpy()

    def predict_fn(X: np.ndarray) -> np.ndarray:
        # Rebuild a frame with the original column names and dtypes so the
        # model sees the same schema as the test set it normally predicts on.
        features = pd.DataFrame(X, columns=feature_names).astype(feature_dtypes)
        return model.predict(features)

    pfi = PermutationImportance(
        predictor=predict_fn,
        loss_fns=loss_fns,
        feature_names=feature_names,
        verbose=True,
    )

    logger.info(
        f"Permutation feature importance - using {permutation_subsample_amount} samples and {n_repeats} shuffles per feature:"
    )
    exp = pfi.explain(x, y, n_repeats=n_repeats)

    # Index 0 is taken from the importance results; presumably this is the
    # entry for the first (here: only) loss function - confirm against alibi.
    mean_value_feature_importance = [
        element["mean"] for element in exp.data["feature_importance"][0]
    ]
    feature_importance_df = pd.DataFrame(
        mean_value_feature_importance,
        index=exp.data["feature_names"],
        columns=[feature_importance_column],
    ).sort_values(feature_importance_column, ascending=False)

    plot_permutation_importance(
        exp, fig_kw={"figwidth": figwidth, "figheight": figheight}
    )

    logger.info("--------------------------------------")
    logger.info("--- Save Feature Importance table ---")
    logger.info("--------------------------------------")
    dataclient.save_data(feature_importance_df, location=feature_importance_filepath)

    return feature_importance_df
if __name__ == "__main__":
    # Framed start banner
    for banner_line in (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
    ):
        logger.info(banner_line)

    # Every argument is a module-level value read from the YAML configs above.
    analysis_arguments = {
        "model": model,
        "dataclient": dataclient,
        "target": target,
        "output_test_filepath": output_test_filepath,
        "feature_importance_filepath": feature_importance_filepath,
        "permutation_subsample_amount": permutation_subsample_amount,
        "loss_fns": loss_fns,
        "feature_importance_column": feature_importance_column,
        "n_repeats": n_repeats,
        "figwidth": figwidth,
        "figheight": figheight,
    }
    model_analysis(**analysis_arguments)

    # Framed completion banner
    for banner_line in (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    ):
        logger.info(banner_line)

View file

@ -0,0 +1,4 @@
"""
Investigate why the model made a particular prediction.
Manual script, intended to be run by hand.
"""

View file

@ -2,6 +2,7 @@ joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
alibi==0.9.4
pyarrow==13.0.0
pre-commit==3.3.3
sphinx==7.2.5