From ae5349974227dd15075d3cca8762c168c75963d9 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 09:51:57 +0000 Subject: [PATCH 1/3] add keep only non negative carbon change to carbon model --- .../src/pipeline/configs/feature_processor_logic.py | 6 ++++++ modules/ml-pipeline/src/pipeline/example.py | 0 2 files changed, 6 insertions(+) create mode 100644 modules/ml-pipeline/src/pipeline/example.py diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 78c29a9..bce32b6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -23,6 +23,11 @@ def keep_negative_heat_change(df): return df +def keep_non_negative_carbon_ending(df): + df = df[df["CARBON_ENDING"] > 0] + return df + + def keep_negative_carbon_change(df): df = df[df["CARBON_CHANGE"] < 0] return df @@ -68,6 +73,7 @@ business_logic = { "keep_negative_carbon_change": keep_negative_carbon_change, "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, "remove_top_1_percent_carbon": remove_top_1_percent_carbon, + "keep_non_negative_carbon_ending": keep_non_negative_carbon_ending # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/example.py b/modules/ml-pipeline/src/pipeline/example.py new file mode 100644 index 0000000..e69de29 From 79a55ba8b5ff5b13a7cd9224033dc3e19363a79c Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 17 Jan 2024 23:35:50 +0000 Subject: [PATCH 2/3] train 600 second model on new data --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 16 ++--- .../pipeline/configs/post_prediction_logic.py | 4 +- .../src/pipeline/configs/settings.yaml | 8 +-- modules/ml-pipeline/src/pipeline/dvc.lock | 60 +++++++++---------- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 4c72487..9c97ef0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index bce32b6..94d3c6e 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -9,27 +9,27 @@ Business Logic dict + functions def remove_starting_columns(df): keep_column_index = [ - False if col_name.endswith("_STARTING") else True + False if col_name.endswith("_starting") else True for col_name in list(df.columns) ] keep_columns = df.columns[keep_column_index].to_list() - keep_columns.append("SAP_STARTING") + keep_columns.append("sap_starting") df = df[keep_columns] return df def keep_negative_heat_change(df): - df = df[df["HEAT_DEMAND_CHANGE"] < 0] + df = df[df["heat_demand_change"] < 0] return df def keep_non_negative_carbon_ending(df): - df = df[df["CARBON_ENDING"] > 0] + df = df[df["carbon_ending"] > 0] return df def keep_negative_carbon_change(df): - df = df[df["CARBON_CHANGE"] < 0] + df = df[df["carbon_change"] < 0] return df @@ -39,7 +39,7 @@ def remove_unreasonable_habitable_rooms(df): Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2 """ minimum_room_size_index = ( - df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] >= 6.5 + df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5 ) df = df[minimum_room_size_index] return df @@ -48,14 +48,14 @@ def remove_unreasonable_habitable_rooms(df): def remove_top_1_percent_heat_demand(df): # threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] threshold_value = 860 - df = df[df["HEAT_DEMAND_STARTING"] < threshold_value] + df = df[df["heat_demand_starting"] < threshold_value] return df def remove_top_1_percent_carbon(df): # threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%'] threshold_value = 18 - df = df[df["CARBON_STARTING"] < threshold_value] + df = df[df["carbon_starting"] < threshold_value] return df diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 1ffab90..dec7740 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -13,9 +13,9 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["predictions"] > predictions_df["CARBON_STARTING"] + replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"] predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[ - replace_index, "CARBON_STARTING" + replace_index, "carbon_starting" ] predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 5514406..e4bd13b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: CARBON_ENDING - identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "SAP_ENDING"] + target: carbon_ending + identifier_columns: ["uprn"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index e65dfe8..d8da73d 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,17 +10,17 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - HEAT_DEMAND_CHANGE - - CARBON_CHANGE - - RDSAP_CHANGE - - HEAT_DEMAND_ENDING - - SAP_ENDING + - heat_demand_change + - carbon_change + - rdsap_change + - heat_demand_ending + - sap_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: CARBON_ENDING + default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: ca205aaf77cb9a9414a0c6a1affd8d82.dir - size: 30597800 + md5: 70d79ba4a6f0648439dc55031c944d47.dir + size: 32673907 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: ca205aaf77cb9a9414a0c6a1affd8d82.dir - size: 30597800 + md5: 70d79ba4a6f0648439dc55031c944d47.dir + size: 32673907 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: f3be67a0a80e525d30665f2ffc367d9b.dir - size: 312133166 - nfiles: 24 + md5: 2fc9223da8b72e61d81f06665e75019e.dir + size: 324532985 + nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 36912d423f975802ca3661992103e614 - size: 226 + md5: 7d2f226251ce6f8e92af73d50dadb890 + size: 228 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f3be67a0a80e525d30665f2ffc367d9b.dir - size: 312133166 - nfiles: 24 + md5: 2fc9223da8b72e61d81f06665e75019e.dir + size: 324532985 + nfiles: 27 - path: data/prepared_data hash: md5 - md5: ca205aaf77cb9a9414a0c6a1affd8d82.dir - size: 30597800 + md5: 70d79ba4a6f0648439dc55031c944d47.dir + size: 32673907 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 2ae9ab85ca2551d6b0833337cacbcc3e.dir - size: 389118 + md5: 8bfc33c14aba5abf5ac4bdba32ff3c4c.dir + size: 412880 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3485 - path: data/predictions hash: md5 - md5: 2ae9ab85ca2551d6b0833337cacbcc3e.dir - size: 389118 + md5: 8bfc33c14aba5abf5ac4bdba32ff3c4c.dir + size: 412880 nfiles: 1 - path: data/prepared_data hash: md5 - md5: ca205aaf77cb9a9414a0c6a1affd8d82.dir - size: 30597800 + md5: 70d79ba4a6f0648439dc55031c944d47.dir + size: 32673907 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 6447c7b2b92a4057aecd3d227de1aadf - size: 224 + md5: 9a0b57244dfdbd6dab0392a4fd618123 + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 9b29e838af73f1e8113c2bc3981009cbe59a2575 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 17 Jan 2024 23:45:07 +0000 Subject: [PATCH 3/3] update requirements for dvc --- .../requirements/version_control/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index 91cb005..a2b9531 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 -pyOpenSSL==23.2.0 +dvc==3.36.0 +dvc-s3==3.0.1 +gto==1.6.1 +pyOpenSSL==23.3.0