From bcd2383d8d2238755819097b1bafe34b9972ea3a Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 3 Oct 2023 23:01:17 +0000 Subject: [PATCH] add eda script bits --- modules/ml-pipeline/src/pipeline/eda.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index de1ebd6..2fdd8be 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -211,7 +211,7 @@ cosine_similarity_df = mix_df[ ] from sklearn.metrics.pairwise import cosine_similarity -row_index = 12624 +row_index = 58199 from sklearn.preprocessing import LabelEncoder @@ -224,6 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) -similar_index = cosine_similarity_df[cosine_similarity_df["cosine"] > 0.997].index +similar_index = ( + cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index +) check_df = mix_df.loc[similar_index]