From 1e70a3a582c2ebf3302df261a616aa636ddf3d57 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Sep 2023 21:58:55 +0000 Subject: [PATCH] nearly there --- .../pipeline/configs/generate_metrics.yaml | 2 - .../src/pipeline/model_analysis.py | 2 +- .../src/pipeline/prediction_analysis.py | 66 ++++++++++++++++--- .../ml-pipeline/src/pipeline/prepare_data.py | 3 + 4 files changed, 60 insertions(+), 13 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml index 84f5897..7ed9819 100644 --- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml @@ -1,5 +1,3 @@ dataclient_type: local -input_datahandler_type: parquet -output_datahandler_type: json metrics_type: Regression metrics_output_filepath: ./metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py index 206eb9a..fb1f23c 100644 --- a/modules/ml-pipeline/src/pipeline/model_analysis.py +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -79,7 +79,7 @@ def model_analysis( logger.info(f"--- Generate Feature Importance ---") logger.info("------------------------------------") - test_df = pd.read_parquet(output_test_filepath) + test_df = dataclient.load_data(output_test_filepath) test_df = test_df.head(permutation_subsample_amount) diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py index 344b602..c65684f 100644 --- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -36,30 +36,76 @@ feature_process_params = yaml.safe_load(open(feature_process_path)) build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" build_model_params = yaml.safe_load(open(build_model_path)) -model_analysis_path = 
Path(__file__).parent / "configs" / "model_analysis.yaml" -model_analysis_params = yaml.safe_load(open(model_analysis_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" +prediction_analysis_path = ( + Path(__file__).parent / "configs" / "prediction_analysis.yaml" ) -generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) +prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path)) model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) -dataclient_type = model_analysis_params["dataclient_type"] +dataclient_type = prediction_analysis_params["dataclient_type"] dataclient = dataclient_factory( dataclient_type=dataclient_type, dataclient_config=client_params[dataclient_type], ) +output_test_filepath = prepare_data_params["output_test_filepath"] -def prediction_analysis(model: MLModel, dataclient: DataClient): - shap.kmeans() +def prediction_analysis( + model: MLModel, dataclient: DataClient, output_test_filepath: str +): + test_df = dataclient.load_data(output_test_filepath) + target = "SAP_ENDING" + test_df_without_target = test_df.drop(columns=[target]) + + # test_df_summary = shap.kmeans(test_df, 10) + # print("Baseline feature-values: \n", test_df_summary) + class AutogluonWrapper: + def __init__(self, predictor, feature_names): + self.ag_model = predictor + self.feature_names = feature_names + + def predict(self, X): + if isinstance(X, pd.Series): + X = X.values.reshape(1, -1) + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X, columns=self.feature_names) + return self.ag_model.predict(X) + + ag_wrapper = AutogluonWrapper( + model.model, feature_names=test_df_without_target.columns + ) + explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target) + + NSHAP_SAMPLES = 100 # how many samples to use to approximate each Shapley value, larger values will be slower + N_VAL = 30 # how many 
datapoints from validation data should we interpret predictions for, larger values will be slower + + ROW_INDEX = 0 # index of an example datapoint + single_datapoint = test_df_without_target.iloc[[ROW_INDEX]] + single_prediction = ag_wrapper.predict(single_datapoint) + + shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES) + shap.force_plot( + explainer.expected_value, + shap_values_single, + test_df_without_target.iloc[ROW_INDEX, :], + ) ... if __name__ == "__main__": - prediction_analysis() + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + prediction_analysis( + model=model, dataclient=dataclient, output_test_filepath=output_test_filepath + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py index 8caa101..f7bdbd1 100644 --- a/modules/ml-pipeline/src/pipeline/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/prepare_data.py @@ -74,6 +74,9 @@ def prepare_data( train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) ) + test = test.reset_index(drop=True) + + train = train.reset_index(drop=True) logger.info("-----------------------") logger.info("--- Outputting data ---")