add eda code for now

This commit is contained in:
Michael Duong 2023-10-03 22:03:50 +00:00
parent c0d73d8b9e
commit 0386346c67

View file

@ -175,3 +175,55 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
#
#
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from config import settings
from core.DataClient import dataclient_factory
from core.MLModels import model_factory

# ---------------------------------------------------------------------------
# EDA: rank test rows by absolute prediction error, then collect the rows
# whose feature vectors are most cosine-similar to one chosen row.
# ---------------------------------------------------------------------------

# Pipeline configuration sections read from the project settings object.
client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
prediction_analysis_params = settings.prediction_analysis

# Load the persisted model from the configured save path.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# Build the data client selected for prediction analysis.
dataclient_type = prediction_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]

test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data(predictions_output_filepath)

# Column-wise concat aligns on the index.
# NOTE(review): assumes test_df and predictions share the same index - confirm.
mix_df = pd.concat([test_df, predictions], axis=1)
mix_df["residual"] = (mix_df[predictions_column_name] - mix_df[target]).abs()
# Largest absolute errors first; the original index labels are preserved.
mix_df = mix_df.sort_values("residual", ascending=False)

# Columns that must not take part in the similarity computation. The configured
# names are used so that the prediction/target columns are excluded even when
# the config does not use the literal names; the literals are kept so the
# exclusion set is never smaller than before.
excluded_columns = list(
    dict.fromkeys(
        [predictions_column_name, target, "residual", "predictions", "SAP_ENDING"]
    )
)
# Explicit copy: the frame is modified below, and writing into a selection of
# mix_df would raise pandas' SettingWithCopyWarning.
cosine_similarity_df = mix_df[mix_df.columns.difference(excluded_columns)].copy()

# Index label of the row to find neighbours for (hand-picked for this analysis).
row_index = 12624
# Minimum cosine similarity for a row to count as "similar" (hand-picked).
similarity_threshold = 0.997

# cosine_similarity needs numeric input, so object columns are replaced by
# integer codes (each column is fitted independently).
# NOTE(review): the codes are arbitrary and features are unscaled, so the
# similarity is only a rough heuristic.
object_columns = cosine_similarity_df.select_dtypes(["object"])
if not object_columns.columns.empty:
    cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
        object_columns.columns
    ].apply(LabelEncoder().fit_transform)

# Double brackets keep the reference row as a one-row DataFrame (2-D input).
feature_vector = cosine_similarity_df.loc[[row_index]]
# The result has shape (n_rows, 1); flatten it before storing it as a column.
cosine_similarity_df["cosine"] = cosine_similarity(
    cosine_similarity_df, feature_vector
).ravel()

similar_index = cosine_similarity_df[
    cosine_similarity_df["cosine"] > similarity_threshold
].index
# Full rows (including predictions and residuals) of the similar records.
check_df = mix_df.loc[similar_index]