From f4e91162ec588a730785c42286372e34cfd97f03 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 11 Oct 2023 13:23:54 +0000 Subject: [PATCH] initial model --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 6 +++ .../pipeline/configs/post_prediction_logic.py | 6 ++- .../src/pipeline/configs/settings.yaml | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 50 +++++++++---------- 5 files changed, 38 insertions(+), 30 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..ee7394e 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 60 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c32d2fe..7b131dc 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,6 +18,11 @@ def remove_starting_columns(df): return df +def keep_negative_carbon_change(df): + df = df[df["CARBON_CHANGE"] < 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -27,6 +32,7 @@ def remove_starting_columns(df): # return df business_logic = { + "keep_negative_carbon_change": keep_negative_carbon_change # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index b85d3a4..bb36713 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] + replace_index = ( + predictions_df["predictions"] > predictions_df["CARBON_STARTING"] - 1 + ) predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value + predictions_df.loc[replace_index, "CARBON_STARTING"] - minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ce7ed2c..4f3ebce 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: SAP_ENDING + target: CARBON_ENDING identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "SAP_ENDING"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c499874..d2291d8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -14,11 +14,11 @@ stages: - CARBON_CHANGE - RDSAP_CHANGE - HEAT_DEMAND_ENDING - - CARBON_ENDING + - SAP_ENDING default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_config.target: CARBON_ENDING default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd9a3d5e6208c1fd5de513b4d5c51b5b.dir + size: 30121189 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd9a3d5e6208c1fd5de513b4d5c51b5b.dir + size: 30121189 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 60 presets: medium_quality excluded_model_types: - KNN @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 - nfiles: 27 + md5: 0d43e4ac3985da215dadf5fed8e68200.dir + size: 210841782 + nfiles: 21 - path: metrics/fit_metrics.json hash: md5 - md5: 2bb16ac67de8778fbc08171d562b34d5 - size: 184 + md5: 06f8bb0d004b91c33493dbee9a8763e7 + size: 206 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 - nfiles: 27 + md5: 0d43e4ac3985da215dadf5fed8e68200.dir + size: 210841782 + nfiles: 21 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd9a3d5e6208c1fd5de513b4d5c51b5b.dir + size: 30121189 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: acdcb06ee7574672b1148c10c37a868b.dir + size: 275959 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: acdcb06ee7574672b1148c10c37a868b.dir + size: 275959 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd9a3d5e6208c1fd5de513b4d5c51b5b.dir + size: 30121189 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 2e13ae67759a64261d03224f1c0d4bf4 - size: 185 + md5: e3bdc173023a7d909704f0313aa1609f + size: 219 startup_cleanup: cmd: python 0_startup_cleanup.py deps: