NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Leading whitespace and the position of blank
context lines are a best-effort reconstruction (chosen to satisfy the hunk
line counts) - regenerate with `git diff` before applying.
Changes relative to the recovered patch, all inside the added lines of
feature_processor_logic.py (hunk headers recounted; the abbreviated blob id on
that file's `index` line still names the original post-image):
  * removed the leftover debug print("we in here")
  * keep_flats now resets the index like the other two row filters
  * added docstrings; added a trailing comma to the last commented-out entry

diff --git a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
index 5c6e749..725660b 100644
--- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
@@ -13,4 +13,4 @@ default:
   dataclient_type: local
   nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
   n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
-  row_index: [0, 10, 20] # index of an example datapoint
+  row_index: [20695, 50243, 7653] # index of an example datapoint
diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
index d296e6a..bd684e9 100644
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@@ -13,6 +13,6 @@ default:
   output_filepath: ./data/model/allmodels/
   problem_type: regression
   eval_metric: mean_squared_error #mean_absolute_error
-  time_limit: 4000
+  time_limit: 180
   presets: medium_quality
   excluded_model_types: ['KNN', 'RF']
diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
index c32d2fe..4943f6b 100644
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@@ -18,6 +18,35 @@ def remove_starting_columns(df):
     return df
 
 
+def remove_floor_height_ending(df):
+    """Keep rows whose FLOOR_HEIGHT_ENDING is strictly greater than 1.665.
+
+    1.665 is the bottom 0.5 percentile shown by
+    df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'].
+    The returned frame has its index reset.
+    """
+    return df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True)
+
+
+def remove_minimum_habitable_room_size(df):
+    """Keep rows with more than 6.5 of floor area per habitable room.
+
+    Need minimum of 6.5m per habitable room, measured as
+    TOTAL_FLOOR_AREA_ENDING / NUMBER_HABITABLE_ROOMS (units presumably
+    square metres - TODO confirm). The returned frame has its index reset.
+    """
+    area_per_room = df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"]
+    return df[area_per_room > 6.5].reset_index(drop=True)
+
+
+def keep_flats(df):
+    """Keep only rows whose PROPERTY_TYPE equals "Flat".
+
+    The index is reset, matching the other row filters in this module.
+    """
+    return df[df["PROPERTY_TYPE"] == "Flat"].reset_index(drop=True)
+
+
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@@ -27,6 +56,9 @@ def remove_starting_columns(df):
 #     return df
 
 business_logic = {
+    # "keep_flats": keep_flats,
+    # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
+    # "remove_floor_height_ending": remove_floor_height_ending,
     # "remove_starting_columns": remove_starting_columns
     # "keep_ENDING_COLUMNS": keep_ending_columns
 }
diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py
index 2fdd8be..6c29308 100644
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)
 
 cosine_similarity_df = mix_df[
-    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
+    mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"])
 ]
 from sklearn.metrics.pairwise import cosine_similarity
 
-row_index = 58199
+row_index = 20695
 
 from sklearn.preprocessing import LabelEncoder
 
@@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
 
 feature_vector = cosine_similarity_df.loc[[row_index]]
 cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
-similar_index = (
-    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
-)
+
+similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5)
+similar_index = similar_df.index
 
 check_df = mix_df.loc[similar_index]