From acdac3d8dcfc84ce70093a515cd56c91e4eb8cb4 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 10:28:56 +0000 Subject: [PATCH] test new data --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 10 +-- .../pipeline/configs/post_prediction_logic.py | 4 +- .../src/pipeline/configs/settings.yaml | 8 +-- modules/ml-pipeline/src/pipeline/dvc.lock | 64 +++++++++---------- 5 files changed, 44 insertions(+), 44 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1ebb62d..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 4943f6b..026191c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -9,11 +9,11 @@ Business Logic dict + functions def remove_starting_columns(df): keep_column_index = [ - False if col_name.endswith("_STARTING") else True + False if col_name.endswith("_starting") else True for col_name in list(df.columns) ] keep_columns = df.columns[keep_column_index].to_list() - keep_columns.append("SAP_STARTING") + keep_columns.append("sap_starting") df = df[keep_columns] return df @@ -22,7 +22,7 @@ def remove_floor_height_ending(df): # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] # shows bottom 0.5 percentile is 1.665 # So keep anything above this - df = df[df["FLOOR_HEIGHT_ENDING"] > 
1.665].reset_index(drop=True) + df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True) print("we in here") return df @@ -30,13 +30,13 @@ def remove_floor_height_ending(df): def remove_minimum_habitable_room_size(df): # Need minimum of 6.5m per habitable room df = df[ - df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5 + df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5 ].reset_index(drop=True) return df def keep_flats(df): - df = df[df["PROPERTY_TYPE"] == "Flat"] + df = df[df["property_type"] == "Flat"] return df diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index b85d3a4..c1b8ebd 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -12,9 +12,9 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] + replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value + predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 9333c46..d5ffe8d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # 
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: SAP_ENDING - identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + target: sap_ending + identifier_columns: ["uprn"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 20dd532..5e7bfe5 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,17 +10,17 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - HEAT_DEMAND_CHANGE - - CARBON_CHANGE - - RDSAP_CHANGE - - HEAT_DEMAND_ENDING - - CARBON_ENDING + - heat_demand_change + - carbon_change + - rdsap_change + - heat_demand_change + - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + default.prepare_data.data_filepath: 
s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 7b79f280b8b0d5bc6f07669e7cc37c6a - size: 4150 + md5: b824822475c222521516493e68eef9c5 + size: 4149 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir + size: 334981921 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 9537e7ebc2eb32b421a7cabd2005f00b - size: 223 + md5: 89ba30b943c911e24b13b4370db12d18 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir + size: 334981921 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 
f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir + size: 362994 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir + size: 362994 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 357904cf106279be5a578e8faefa5d80 - size: 224 + md5: fa40071006901c4335b5dbd567c9d9b3 + size: 226 startup_cleanup: cmd: python 0_startup_cleanup.py deps: