From 6513e4feb9b64450aca254ed3c806ac57faaf1f8 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Sep 2023 21:16:48 +0000 Subject: [PATCH 1/3] add feature importance in model analysis script --- modules/ml-pipeline/.gitignore | 1 + .../analysis/feature_importance.parquet | Bin 0 -> 3117 bytes .../ml-pipeline/src/pipeline/build_model.py | 10 +- .../src/pipeline/configs/build_model.yaml | 6 +- .../pipeline/configs/feature_processor.yaml | 58 +++++- .../configs/feature_processor_logic.py | 8 +- .../src/pipeline/configs/model_analysis.yaml | 8 + .../src/pipeline/configs/prepare_data.yaml | 1 - .../src/pipeline/core/FeatureProcessor.py | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 66 +++---- modules/ml-pipeline/src/pipeline/eda.py | 177 ++++++++++++++++++ .../src/pipeline/model_analysis.py | 150 +++++++++++++++ .../src/pipeline/prediction_analysis.py | 4 + .../training/requirements-dev.txt | 1 + 14 files changed, 446 insertions(+), 48 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet create mode 100644 modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml create mode 100644 modules/ml-pipeline/src/pipeline/eda.py create mode 100644 modules/ml-pipeline/src/pipeline/model_analysis.py create mode 100644 modules/ml-pipeline/src/pipeline/prediction_analysis.py diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index 63900db..664bc8d 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -1,4 +1,5 @@ .dev_env/ +.dev_env_pipeline/ __pycache__/ .DS_Store .vscode/ diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b0c328faa639b3a3a210a3f03c23f429954c4719 GIT binary patch literal 3117 zcmcgveP|o!89zCeD;sNd8lNI3&SGn|YC6T1CCjr?v&*Z~>2&JpPClJJ!#H%($(DPP zRau`&-R7TlY4<@(O8*FqZH$d{4lFBW9c)WsO9Cqt3Wc$3>&PElX=z8#Z0-I@*FAT# z<2X(?!k{nsJ-yHKyXW~mU$5jC5@zsT{Kh`~-F+;+fMX9}m}b&Gyj=K||B1Ny!~@sA zdgU$lW8&#^uZgb@enk9k^zHY2-}(np`_uB7pS-h49C_z``dW39c=(~|mwxEqBz`bK z-aO~sB&x4!*K5<8#M#+vf!F@25zp3cUUa{!5$nfKnzgqzqEGqb%U6G|5hq{qeX{S0 zM*QsJLpLqo(1`E+?U}0+OB(U#pFa583vrE54_|9?A&sz{yL95C1&y$MFYEfHUn4&G z(V6k_IgR-H2KL1((;A^PUwFX^&!?tNecJxAMx1}|%^$z&)QH&wXBVGzXhiJ8{3UO{ zMl`O!ML2sk;_~!`qkq{4xWAr#DqzwG9K+}F1s7ys2=BxDM(vjE4A?E(7#lWu7b%8e zqiPTA3np{_*jeT0c?tK=2D z$F^X5*=)OPc0SkZwA_o!5f{`9!+S8;hTwr=aK1Q*f8VlQ!x#H*41ze0Vp3d0g2-^R z9nLAlvT-R$#YusQi44z46c+^DVbDbp!-Xj%N};0|cJ#| zM@=8N02mh;F+p);#`M8ph~;_io7jl~00t5a3lw-Eiep&+af^^3&F3c(O-TX71&1(f z;=qs;N>Hq|jU6zxCxIC+#S^g@o3S2z5V{wmS(K%M=5JeVray>WA{w9s37m6Mh!Ln5 zi%2SN9iJE;;zh(t{TqWwppfV~6oU_ODY5zuVm@<9W z>N6*Y60FEb6J|^rKi|s>N0QjE>G}SRVJH(53R(SQqu?jRq^aP@Z^rn9$WmflB27Ox zRgVQ+dV#2*B=CGRZo0Am;3x_(A_}ln{~>f?IL@<3keIWkxp^q66k-v*Rp2}Odk&jC z)TMW~cCZiZlmW>6kN$po|0q7V)!+LisZ=Q_ZAn#{iYmD!$!PO`TWufrJKwzD`P_Y@ z_u%?&eLk|_!|Z))`DLY?!+SA&V4~;tslxC<5bUKffw6jCZ3`o#Bd*V7B%PmUUTl{R z*cO}?+bj1EW7vqx-dD|43b{4=*LzPpx&_Ivs*Q4G&2fC}q+?H!9A9&dIl72${S*Uu z+EK}s6?mo6P*p$_ONv^McFVesqWk9j?FvP*LTS5X5rYmJas#}*xaGJgH=&o&DG~-_p-W8aT_f? z+;tbQ+a2m|hkKI@Z`m;18o}3;Tz$1>BuFjEwOp;E=cn4q)oQCPJ&LAMTPv+r zbm@$D%01*4z3R6}aSvonJ_Ha!( zoUF^0Xo)WcWJdKDb1C0r+4OQzrjwp*s$GnR{6$7}uQ6m~MfS93RmNO6WEkg;aZuB~d9`R#ll6igGzr&&B~q`(|XirQ2y`z(*#{6$@dtW$4Jrj9ga1 zzMwLt))Ls8k<$TR+AH|-v~CxbWKZPrY>IPt^)qUL4uK6}nORv~QaTY;E_2N+4RPt5 z$Vf6tW$HW`@fVZ3xS!JTDeqe87B4bV1z*j4MOX$KTmG|Ei2bp&XFD!b%B5ynvNz%e z?km9gv*IYrw12q}%xv-88%MU>Zf4Sp9*#6@)UyynmJYcyw{q0>v!q_5ZQuVTr(~p< z_H5U(trp73we+b#G2Qu0e*agVy&#Y0)s2FWT$=McDF>15Z59MSh?@^nwZj)r%>`I~$?L_=~ z=fYed6b?X*gyp>6=K2{X1(wnOofr(1E?-O-lK{h{AB np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + +pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=mean_absolute_percentage_error, + feature_names=feature_names, + verbose=True, +) + +exp = pfi.explain(x, y) +plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) + +[ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTITUENCY", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "CONSTRUCTION_AGE_BAND", + "TRANSACTION_TYPE_STARTING", + "LIGHTING_DESCRIPTION_STARTING", + "MAINHEAT_DESCRIPTION_STARTING", + "HOTWATER_DESCRIPTION_STARTING", + "MAIN_FUEL_STARTING", + "MECHANICAL_VENTILATION_STARTING", + "SECONDHEAT_DESCRIPTION_STARTING", + "ENERGY_TARIFF_STARTING", + "SOLAR_WATER_HEATING_FLAG_STARTING", + "PHOTO_SUPPLY_STARTING", + "WINDOWS_DESCRIPTION_STARTING", + "GLAZED_TYPE_STARTING", + "MULTI_GLAZE_PROPORTION_STARTING", + "LOW_ENERGY_LIGHTING_STARTING", + "NUMBER_OPEN_FIREPLACES_STARTING", + "MAINHEATCONT_DESCRIPTION_STARTING", + "EXTENSION_COUNT_STARTING", + "TOTAL_FLOOR_AREA_STARTING", + "FLOOR_HEIGHT_STARTING", + "DAYS_TO_STARTING", + "WALLS_DESCRIPTION_STARTING", + "FLOOR_DESCRIPTION_STARTING", +] + +# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96 +# +# diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py new file mode 100644 index 0000000..206eb9a --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -0,0 +1,150 @@ +""" +Post Model generation step: +We want to look at feature analysis of the model +""" + +import yaml +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.MLModels import model_factory +from core.DataClient import dataclient_factory +from alibi.explainers import PermutationImportance, plot_permutation_importance +import numpy as np +import pandas as pd + + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" +model_analysis_params = yaml.safe_load(open(model_analysis_path)) + +generate_predictions_path = ( + Path(__file__).parent / "configs" / "generate_predictions.yaml" +) +generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) + +model = model_factory(build_model_params["model_type"]) +model.load_model(build_model_params["model_save_filepath"]) + +dataclient_type = model_analysis_params["dataclient_type"] +dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], +) + + +feature_importance_filepath = model_analysis_params["feature_importance_filepath"] +permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"] +loss_fns = model_analysis_params["loss_fns"] +feature_importance_column = model_analysis_params["feature_importance_column"] +n_repeats = model_analysis_params["n_repeats"] +figwidth = model_analysis_params["figwidth"] +figheight = model_analysis_params["figheight"] +target = feature_process_params["feature_processor_config"]["target"] +output_test_filepath = prepare_data_params["output_test_filepath"] + + +def model_analysis( + model: MLModel, + dataclient: DataClient, + target: str, + output_test_filepath: str, + feature_importance_filepath: str, + permutation_subsample_amount: int = 100, + loss_fns: str = "mean_absolute_percentage_error", + feature_importance_column: str = "importance", + n_repeats: int = 5, + figwidth: int = 7, + figheight: int = 6, +): + """ + Key task is to take in a model and generate: + - feature importance + and save these outputs + """ + + logger.info("------------------------------------") + logger.info(f"--- Generate Feature Importance ---") + logger.info("------------------------------------") + + test_df = pd.read_parquet(output_test_filepath) + + test_df = test_df.head(permutation_subsample_amount) + + feature_names = test_df.columns.to_list() + feature_names.remove(target) + + x = test_df[feature_names].to_numpy() + y = test_df[target].to_numpy() + + def predict_fn(X: np.ndarray) -> np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=loss_fns, + feature_names=feature_names, + verbose=True, + ) + + logger.info( + f"Permutation feature importance - using {permutation_subsample_amount} samples and {n_repeats} shuffles per feature:" + ) + + exp = pfi.explain(x, y, n_repeats=n_repeats) + + mean_value_feature_importance = [ + element["mean"] for element in exp.data["feature_importance"][0] + ] + feature_importance_df = pd.DataFrame( + mean_value_feature_importance, + index=exp.data["feature_names"], + columns=[feature_importance_column], + ).sort_values(feature_importance_column, ascending=False) + + plot_permutation_importance( + exp, fig_kw={"figwidth": figwidth, "figheight": figheight} + ) + + logger.info("--------------------------------------") + logger.info(f"--- Save Feature Importance table ---") + logger.info("--------------------------------------") + + dataclient.save_data(feature_importance_df, location=feature_importance_filepath) + + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + model_analysis( + model=model, + dataclient=dataclient, + target=target, + output_test_filepath=output_test_filepath, + feature_importance_filepath=feature_importance_filepath, + permutation_subsample_amount=permutation_subsample_amount, + loss_fns=loss_fns, + feature_importance_column=feature_importance_column, + n_repeats=n_repeats, + figwidth=figwidth, + figheight=figheight, + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py new file mode 100644 index 0000000..428bf0b --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -0,0 +1,4 @@ +""" +Look at why the model made such a prediction +Manual script to run +""" diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index b4679d0..e34d5af 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -2,6 +2,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==1.5.3 autogluon==0.8.2 +alibi==0.9.4 pyarrow==13.0.0 pre-commit==3.3.3 sphinx==7.2.5 From 4a6b7f3ed770094704f2f58bd0bd5a00b7720f83 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Sep 2023 21:28:14 +0000 Subject: [PATCH 2/3] fixed bug --- .pre-commit-config.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 38 ++++++------ .../src/pipeline/prediction_analysis.py | 61 +++++++++++++++++++ 3 files changed, 81 insertions(+), 20 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d59b9e8..196008f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,6 @@ repos: hooks: - id: dvc-push-experiment name: DVC - Push to experiment to remote location (experiments) - entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' language: system verbose: true diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index b1567bf..501dc10 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -15,8 +15,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir - size: 13429347 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 5134 - path: data/prepared_data hash: md5 - md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir - size: 13429347 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 params: configs/build_model.yaml: @@ -49,25 +49,25 @@ stages: outs: - path: data/model/ hash: md5 - md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir - size: 118580145 + md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir + size: 118227750 nfiles: 71 - path: metrics/fit_metrics.json hash: md5 - md5: d4afc981e1e0783b79b02b0ba54638c4 + md5: e1c9a16617804f48e8ffac7cec6575ca size: 185 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir - size: 118580145 + md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir + size: 118227750 nfiles: 71 - path: data/prepared_data hash: md5 - md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir - size: 13429347 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -83,21 +83,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir - size: 537020 + md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir + size: 536774 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir - size: 537020 + md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir + size: 536774 nfiles: 1 - path: data/prepared_data hash: md5 - md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir - size: 13429347 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -113,8 +113,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f75356e08ceabb102d5b23508e140f0a - size: 182 + md5: 852ef4cf2ca5e7f89d70420a9df7a596 + size: 183 startup_cleanup: cmd: python startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py index 428bf0b..344b602 100644 --- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -1,4 +1,65 @@ """ Look at why the model made such a prediction Manual script to run +Workflow: +- Identify a prediction row/s that you wish to look into + - i.e. a bad prediction/s +- Add these rows to the config +- Run script """ + +import shap + +shap.initjs() + + +import yaml +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.MLModels import model_factory +from core.DataClient import dataclient_factory +import numpy as np +import pandas as pd + + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" +model_analysis_params = yaml.safe_load(open(model_analysis_path)) + +generate_predictions_path = ( + Path(__file__).parent / "configs" / "generate_predictions.yaml" +) +generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) + +model = model_factory(build_model_params["model_type"]) +model.load_model(build_model_params["model_save_filepath"]) + +dataclient_type = model_analysis_params["dataclient_type"] +dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], +) + + +def prediction_analysis(model: MLModel, dataclient: DataClient): + + shap.kmeans() + + ... + + +if __name__ == "__main__": + prediction_analysis() From 1e70a3a582c2ebf3302df261a616aa636ddf3d57 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Sep 2023 21:58:55 +0000 Subject: [PATCH 3/3] nearly there --- .../pipeline/configs/generate_metrics.yaml | 2 - .../src/pipeline/model_analysis.py | 2 +- .../src/pipeline/prediction_analysis.py | 66 ++++++++++++++++--- .../ml-pipeline/src/pipeline/prepare_data.py | 3 + 4 files changed, 60 insertions(+), 13 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml index 84f5897..7ed9819 100644 --- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml @@ -1,5 +1,3 @@ dataclient_type: local -input_datahandler_type: parquet -output_datahandler_type: json metrics_type: Regression metrics_output_filepath: ./metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py index 206eb9a..fb1f23c 100644 --- a/modules/ml-pipeline/src/pipeline/model_analysis.py +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -79,7 +79,7 @@ def model_analysis( logger.info(f"--- Generate Feature Importance ---") logger.info("------------------------------------") - test_df = pd.read_parquet(output_test_filepath) + test_df = dataclient.load_data(output_test_filepath) test_df = test_df.head(permutation_subsample_amount) diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py index 344b602..c65684f 100644 --- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -36,30 +36,76 @@ feature_process_params = yaml.safe_load(open(feature_process_path)) build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" build_model_params = yaml.safe_load(open(build_model_path)) -model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" -model_analysis_params = yaml.safe_load(open(model_analysis_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" +prediction_analysis_path = ( + Path(__file__).parent / "configs" / "prediction_analysis.yaml" ) -generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) +prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path)) model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) -dataclient_type = model_analysis_params["dataclient_type"] +dataclient_type = prediction_analysis_params["dataclient_type"] dataclient = dataclient_factory( dataclient_type=dataclient_type, dataclient_config=client_params[dataclient_type], ) +output_test_filepath = prepare_data_params["output_test_filepath"] -def prediction_analysis(model: MLModel, dataclient: DataClient): - shap.kmeans() +def prediction_analysis( + model: MLModel, dataclient: DataClient, output_test_filepath: str +): + test_df = dataclient.load_data(output_test_filepath) + target = "SAP_ENDING" + test_df_without_target = test_df.drop(columns=[target]) + + # test_df_summary = shap.kmeans(test_df, 10) + # print("Baseline feature-values: \n", test_df_summary) + class AutogluonWrapper: + def __init__(self, predictor, feature_names): + self.ag_model = predictor + self.feature_names = feature_names + + def predict(self, X): + if isinstance(X, pd.Series): + X = X.values.reshape(1, -1) + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X, columns=self.feature_names) + return self.ag_model.predict(X) + + ag_wrapper = AutogluonWrapper( + model.model, feature_names=test_df_without_target.columns + ) + explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target) + + NSHAP_SAMPLES = 100 # how many samples to use to approximate each Shapely value, larger values will be slower + N_VAL = 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower + + ROW_INDEX = 0 # index of an example datapoint + single_datapoint = test_df_without_target.iloc[[ROW_INDEX]] + single_prediction = ag_wrapper.predict(single_datapoint) + + shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES) + shap.force_plot( + explainer.expected_value, + shap_values_single, + test_df_without_target.iloc[ROW_INDEX, :], + ) ... if __name__ == "__main__": - prediction_analysis() + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + prediction_analysis( + model=model, dataclient=dataclient, output_test_filepath=output_test_filepath + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py index 8caa101..f7bdbd1 100644 --- a/modules/ml-pipeline/src/pipeline/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/prepare_data.py @@ -74,6 +74,9 @@ def prepare_data( train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) ) + test = test.reset_index(drop=True) + + train = train.reset_index(drop=True) logger.info("-----------------------") logger.info("--- Outputting data ---")