add some processing code

2026-06-08 11:17:25 +00:00 · 2023-10-09 15:44:37 +00:00 · 2023-10-09 15:44:37 +00:00 · f9b0b6112c
commit f9b0b6112c
parent ba4d1bcc8b
4 changed files with 32 additions and 7 deletions
--- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
@ -13,4 +13,4 @@ default:
    dataclient_type: local
    nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
    n_val: 30  # how many datapoints from validation data should we interpret predictions for, larger values will be slower
-    row_index: [0, 10, 20] # index of an example datapoint
+    row_index: [20695, 50243, 7653] # index of an example datapoint
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -13,6 +13,6 @@ default:
      output_filepath: ./data/model/allmodels/
      problem_type: regression
      eval_metric: mean_squared_error #mean_absolute_error
-      time_limit: 4000
+      time_limit: 180
      presets: medium_quality
      excluded_model_types: ['KNN', 'RF']
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -18,6 +18,28 @@ def remove_starting_columns(df):
    return df


+def remove_floor_height_ending(df):
+    # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
+    # shows bottom 0.5 percentile is 1.665
+    # So keep anything above this
+    df = df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True)
+    print("we in here")
+    return df
+
+
+def remove_minimum_habitable_room_size(df):
+    # Need minimum of 6.5m per habitable room
+    df = df[
+        df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5
+    ].reset_index(drop=True)
+    return df
+
+
+def keep_flats(df):
+    df = df[df["PROPERTY_TYPE"] == "Flat"]
+    return df
+
+
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +49,9 @@ def remove_starting_columns(df):
 #     return df

 business_logic = {
+    # "keep_flats": keep_flats,
+    # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
+    # "remove_floor_height_ending": remove_floor_height_ending
    # "remove_starting_columns": remove_starting_columns
    # "keep_ENDING_COLUMNS": keep_ending_columns
 }
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)

 cosine_similarity_df = mix_df[
-    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
+    mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"])
 ]
 from sklearn.metrics.pairwise import cosine_similarity

-row_index = 58199
+row_index = 20695

 from sklearn.preprocessing import LabelEncoder

@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
 feature_vector = cosine_similarity_df.loc[[row_index]]

 cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
-similar_index = (
-    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
-)
+
+similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5)
+similar_index = similar_df.index

 check_df = mix_df.loc[similar_index]