diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py
index 1260d09..de1ebd6 100644
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@@ -175,3 +175,78 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
 # Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
 #
 #
+
+# Residual analysis: load the configured model, the test data and its stored
+# predictions, rank rows by absolute error, then collect the rows whose
+# features are most cosine-similar to one row under investigation.
+from core.MLModels import model_factory
+from core.DataClient import dataclient_factory
+import pandas as pd
+from config import settings
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.preprocessing import LabelEncoder
+
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+feature_process_params = settings.feature_processor
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions
+prediction_analysis_params = settings.prediction_analysis
+
+# NOTE(review): model is not used further in this hunk; presumably later cells need it.
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+
+dataclient_type = prediction_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+target = feature_process_params["feature_processor_config"]["target"]
+predictions_column_name = generate_predictions_params["predictions_column_name"]
+output_test_filepath = prepare_data_params["output_test_filepath"]
+predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
+
+test_df = dataclient.load_data(output_test_filepath)
+predictions = dataclient.load_data(predictions_output_filepath)
+
+# Column-wise concat aligns on the index; assumes both frames share the same
+# row labels -- TODO confirm against the step that writes the predictions.
+mix_df = pd.concat([test_df.copy(), predictions], axis=1)
+mix_df["residual"] = (mix_df[predictions_column_name] - mix_df[target]).abs()
+mix_df = mix_df.sort_values("residual", ascending=False)
+
+# Keep label, prediction and error columns out of the similarity features.
+# The configured names are excluded in addition to the original literals so
+# nothing leaks in when the config uses different column names.
+excluded_columns = list(
+    {"predictions", "residual", "SAP_ENDING", predictions_column_name, target}
+)
+# .copy() makes the frame independent of mix_df, so the column assignments
+# below do not trigger SettingWithCopyWarning.
+cosine_similarity_df = mix_df[mix_df.columns.difference(excluded_columns)].copy()
+
+row_index = 12624  # index label of the row under investigation
+similarity_threshold = 0.997  # rows scoring above this are kept
+
+# cosine_similarity needs numeric input, so object columns are replaced by
+# integer codes; fit_transform re-fits on every column it is applied to.
+object_columns = cosine_similarity_df.select_dtypes(["object"])
+if not object_columns.columns.empty:
+    cosine_similarity_df[object_columns.columns] = object_columns.apply(
+        LabelEncoder().fit_transform
+    )
+
+feature_vector = cosine_similarity_df.loc[[row_index]]
+
+# The result has one column per row of feature_vector (a single one when the
+# label is unique); ravel() turns it into one score per row.
+cosine_similarity_df["cosine"] = cosine_similarity(
+    cosine_similarity_df, feature_vector
+).ravel()
+similar_index = cosine_similarity_df[
+    cosine_similarity_df["cosine"] > similarity_threshold
+].index
+
+check_df = mix_df.loc[similar_index]