From 6513e4feb9b64450aca254ed3c806ac57faaf1f8 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Sep 2023 21:16:48 +0000 Subject: [PATCH] add feature importance in model analysis script --- modules/ml-pipeline/.gitignore | 1 + .../analysis/feature_importance.parquet | Bin 0 -> 3117 bytes .../ml-pipeline/src/pipeline/build_model.py | 10 +- .../src/pipeline/configs/build_model.yaml | 6 +- .../pipeline/configs/feature_processor.yaml | 58 +++++- .../configs/feature_processor_logic.py | 8 +- .../src/pipeline/configs/model_analysis.yaml | 8 + .../src/pipeline/configs/prepare_data.yaml | 1 - .../src/pipeline/core/FeatureProcessor.py | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 66 +++---- modules/ml-pipeline/src/pipeline/eda.py | 177 ++++++++++++++++++ .../src/pipeline/model_analysis.py | 150 +++++++++++++++ .../src/pipeline/prediction_analysis.py | 4 + .../training/requirements-dev.txt | 1 + 14 files changed, 446 insertions(+), 48 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet create mode 100644 modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml create mode 100644 modules/ml-pipeline/src/pipeline/eda.py create mode 100644 modules/ml-pipeline/src/pipeline/model_analysis.py create mode 100644 modules/ml-pipeline/src/pipeline/prediction_analysis.py diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index 63900db..664bc8d 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -1,4 +1,5 @@ .dev_env/ +.dev_env_pipeline/ __pycache__/ .DS_Store .vscode/ diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b0c328faa639b3a3a210a3f03c23f429954c4719 GIT binary patch literal 3117 zcmcgveP|o!89zCeD;sNd8lNI3&SGn|YC6T1CCjr?v&*Z~>2&JpPClJJ!#H%($(DPP zRau`&-R7TlY4<@(O8*FqZH$d{4lFBW9c)WsO9Cqt3Wc$3>&PElX=z8#Z0-I@*FAT# z<2X(?!k{nsJ-yHKyXW~mU$5jC5@zsT{Kh`~-F+;+fMX9}m}b&Gyj=K||B1Ny!~@sA zdgU$lW8&#^uZgb@enk9k^zHY2-}(np`_uB7pS-h49C_z``dW39c=(~|mwxEqBz`bK z-aO~sB&x4!*K5<8#M#+vf!F@25zp3cUUa{!5$nfKnzgqzqEGqb%U6G|5hq{qeX{S0 zM*QsJLpLqo(1`E+?U}0+OB(U#pFa583vrE54_|9?A&sz{yL95C1&y$MFYEfHUn4&G z(V6k_IgR-H2KL1((;A^PUwFX^&!?tNecJxAMx1}|%^$z&)QH&wXBVGzXhiJ8{3UO{ zMl`O!ML2sk;_~!`qkq{4xWAr#DqzwG9K+}F1s7ys2=BxDM(vjE4A?E(7#lWu7b%8e zqiPTA3np{_*jeT0c?tK=2D z$F^X5*=)OPc0SkZwA_o!5f{`9!+S8;hTwr=aK1Q*f8VlQ!x#H*41ze0Vp3d0g2-^R z9nLAlvT-R$#YusQi44z46c+^DVbDbp!-Xj%N};0|cJ#| zM@=8N02mh;F+p);#`M8ph~;_io7jl~00t5a3lw-Eiep&+af^^3&F3c(O-TX71&1(f z;=qs;N>Hq|jU6zxCxIC+#S^g@o3S2z5V{wmS(K%M=5JeVray>WA{w9s37m6Mh!Ln5 zi%2SN9iJE;;zh(t{TqWwppfV~6oU_ODY5zuVm@<9W z>N6*Y60FEb6J|^rKi|s>N0QjE>G}SRVJH(53R(SQqu?jRq^aP@Z^rn9$WmflB27Ox zRgVQ+dV#2*B=CGRZo0Am;3x_(A_}ln{~>f?IL@<3keIWkxp^q66k-v*Rp2}Odk&jC z)TMW~cCZiZlmW>6kN$po|0q7V)!+LisZ=Q_ZAn#{iYmD!$!PO`TWufrJKwzD`P_Y@ z_u%?&eLk|_!|Z))`DLY?!+SA&V4~;tslxC<5bUKffw6jCZ3`o#Bd*V7B%PmUUTl{R z*cO}?+bj1EW7vqx-dD|43b{4=*LzPpx&_Ivs*Q4G&2fC}q+?H!9A9&dIl72${S*Uu z+EK}s6?mo6P*p$_ONv^McFVesqWk9j?FvP*LTS5X5rYmJas#}*xaGJgH=&o&DG~-_p-W8aT_f? z+;tbQ+a2m|hkKI@Z`m;18o}3;Tz$1>BuFjEwOp;E=cn4q)oQCPJ&LAMTPv+r zbm@$D%01*4z3R6}aSvonJ_Ha!( zoUF^0Xo)WcWJdKDb1C0r+4OQzrjwp*s$GnR{6$7}uQ6m~MfS93RmNO6WEkg;aZuB~d9`R#ll6igGzr&&B~q`(|XirQ2y`z(*#{6$@dtW$4Jrj9ga1 zzMwLt))Ls8k<$TR+AH|-v~CxbWKZPrY>IPt^)qUL4uK6}nORv~QaTY;E_2N+4RPt5 z$Vf6tW$HW`@fVZ3xS!JTDeqe87B4bV1z*j4MOX$KTmG|Ei2bp&XFD!b%B5ynvNz%e z?km9gv*IYrw12q}%xv-88%MU>Zf4Sp9*#6@)UyynmJYcyw{q0>v!q_5ZQuVTr(~p< z_H5U(trp73we+b#G2Qu0e*agVy&#Y0)s2FWT$=McDF>15Z59MSh?@^nwZj)r%>`I~$?L_=~ z=fYed6b?X*gyp>6=K2{X1(wnOofr(1E?-O-lK{h{AB np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + +pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=mean_absolute_percentage_error, + feature_names=feature_names, + verbose=True, +) + +exp = pfi.explain(x, y) +plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) + +[ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTITUENCY", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "CONSTRUCTION_AGE_BAND", + "TRANSACTION_TYPE_STARTING", + "LIGHTING_DESCRIPTION_STARTING", + "MAINHEAT_DESCRIPTION_STARTING", + "HOTWATER_DESCRIPTION_STARTING", + "MAIN_FUEL_STARTING", + "MECHANICAL_VENTILATION_STARTING", + "SECONDHEAT_DESCRIPTION_STARTING", + "ENERGY_TARIFF_STARTING", + "SOLAR_WATER_HEATING_FLAG_STARTING", + "PHOTO_SUPPLY_STARTING", + "WINDOWS_DESCRIPTION_STARTING", + "GLAZED_TYPE_STARTING", + "MULTI_GLAZE_PROPORTION_STARTING", + "LOW_ENERGY_LIGHTING_STARTING", + "NUMBER_OPEN_FIREPLACES_STARTING", + "MAINHEATCONT_DESCRIPTION_STARTING", + "EXTENSION_COUNT_STARTING", + "TOTAL_FLOOR_AREA_STARTING", + "FLOOR_HEIGHT_STARTING", + "DAYS_TO_STARTING", + "WALLS_DESCRIPTION_STARTING", + "FLOOR_DESCRIPTION_STARTING", +] + +# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96 +# +# diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py new file mode 100644 index 0000000..206eb9a --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -0,0 +1,150 @@ +""" +Post Model generation step: +We want to look at feature analysis of the model +""" + +import yaml +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.MLModels import model_factory +from core.DataClient import dataclient_factory +from alibi.explainers import PermutationImportance, plot_permutation_importance +import numpy as np +import pandas as pd + + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" +model_analysis_params = yaml.safe_load(open(model_analysis_path)) + +generate_predictions_path = ( + Path(__file__).parent / "configs" / "generate_predictions.yaml" +) +generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) + +model = model_factory(build_model_params["model_type"]) +model.load_model(build_model_params["model_save_filepath"]) + +dataclient_type = model_analysis_params["dataclient_type"] +dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], +) + + +feature_importance_filepath = model_analysis_params["feature_importance_filepath"] +permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"] +loss_fns = model_analysis_params["loss_fns"] +feature_importance_column = model_analysis_params["feature_importance_column"] +n_repeats = model_analysis_params["n_repeats"] +figwidth = model_analysis_params["figwidth"] +figheight = model_analysis_params["figheight"] +target = feature_process_params["feature_processor_config"]["target"] +output_test_filepath = prepare_data_params["output_test_filepath"] + + +def model_analysis( + model: MLModel, + dataclient: DataClient, + target: str, + output_test_filepath: str, + feature_importance_filepath: str, + permutation_subsample_amount: int = 100, + loss_fns: str = "mean_absolute_percentage_error", + feature_importance_column: str = "importance", + n_repeats: int = 5, + figwidth: int = 7, + figheight: int = 6, +): + """ + Key task is to take in a model and generate: + - feature importance + and save these outputs + """ + + logger.info("------------------------------------") + logger.info(f"--- Generate Feature Importance ---") + logger.info("------------------------------------") + + test_df = pd.read_parquet(output_test_filepath) + + test_df = test_df.head(permutation_subsample_amount) + + feature_names = test_df.columns.to_list() + feature_names.remove(target) + + x = test_df[feature_names].to_numpy() + y = test_df[target].to_numpy() + + def predict_fn(X: np.ndarray) -> np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=loss_fns, + feature_names=feature_names, + verbose=True, + ) + + logger.info( + f"Permutation feature importance - using {permutation_subsample_amount} samples and {n_repeats} shuffles per feature:" + ) + + exp = pfi.explain(x, y, n_repeats=n_repeats) + + mean_value_feature_importance = [ + element["mean"] for element in exp.data["feature_importance"][0] + ] + feature_importance_df = pd.DataFrame( + mean_value_feature_importance, + index=exp.data["feature_names"], + columns=[feature_importance_column], + ).sort_values(feature_importance_column, ascending=False) + + plot_permutation_importance( + exp, fig_kw={"figwidth": figwidth, "figheight": figheight} + ) + + logger.info("--------------------------------------") + logger.info(f"--- Save Feature Importance table ---") + logger.info("--------------------------------------") + + dataclient.save_data(feature_importance_df, location=feature_importance_filepath) + + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + model_analysis( + model=model, + dataclient=dataclient, + target=target, + output_test_filepath=output_test_filepath, + feature_importance_filepath=feature_importance_filepath, + permutation_subsample_amount=permutation_subsample_amount, + loss_fns=loss_fns, + feature_importance_column=feature_importance_column, + n_repeats=n_repeats, + figwidth=figwidth, + figheight=figheight, + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py new file mode 100644 index 0000000..428bf0b --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -0,0 +1,4 @@ +""" +Look at why the model made such a prediction +Manual script to run +""" diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index b4679d0..e34d5af 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -2,6 +2,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==1.5.3 autogluon==0.8.2 +alibi==0.9.4 pyarrow==13.0.0 pre-commit==3.3.3 sphinx==7.2.5