diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 103168d..5b15867 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -45,6 +45,11 @@ def keep_non_zero_rdsap(df): return df +def keep_non_zero_heating(df): + df = df[df["heating_cost_ending"] > 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -54,6 +59,7 @@ def keep_non_zero_rdsap(df): # return df business_logic = { + "keep_non_zero_heating": keep_non_zero_heating, # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 80ad400..867807f 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,9 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-07-15-16-04/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-10-20-28-54/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 3fe8e87..7d0eb01 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -41,7 +41,7 @@ stages: default.feature_processor.feature_processor_config.target: hot_water_cost_ending default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + s3://retrofit-data-dev/sap_change_model/2024-07-10-20-28-54/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -50,8 +50,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 782e258411cf655c0a5c8437c20459d9.dir - size: 49160755 + md5: 44c1c25d24094120253253c8872dd954.dir + size: 54668425 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -62,8 +62,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 782e258411cf655c0a5c8437c20459d9.dir - size: 49160755 + md5: 44c1c25d24094120253253c8872dd954.dir + size: 54668425 nfiles: 2 params: configs/build_model.yaml: @@ -95,18 +95,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: f292cc6ddbe9c8f1efea8107a776ce16.dir - size: 3444061 + md5: e3e06d55135815294afd823385860b44.dir + size: 3443615 nfiles: 1 - path: data/model/ hash: md5 - md5: 48cfeca19c3d7ac956704abb425bb2ab.dir - size: 729294819 + md5: de574e373b222cd00435abcd5a174f83.dir + size: 780954025 nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: 2e253c8b9ffc101aad95fc09fb4586c2 - size: 222 + md5: a4c1c6ca2672cbcae18e5e38ee222bfb + size: 221 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -116,13 +116,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 48cfeca19c3d7ac956704abb425bb2ab.dir - size: 729294819 + md5: de574e373b222cd00435abcd5a174f83.dir + size: 780954025 nfiles: 35 - path: data/prepared_data hash: md5 - md5: 782e258411cf655c0a5c8437c20459d9.dir - size: 49160755 + md5: 44c1c25d24094120253253c8872dd954.dir + size: 54668425 nfiles: 2 params: configs/settings.yaml: @@ -134,8 +134,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: bdb3afe72fa8ad9e56997f7da659778e.dir - size: 480363 + md5: dda695b3bd58ada967a2936faf8e4063.dir + size: 480519 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -146,13 +146,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: bdb3afe72fa8ad9e56997f7da659778e.dir - size: 480363 + md5: dda695b3bd58ada967a2936faf8e4063.dir + size: 480519 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 782e258411cf655c0a5c8437c20459d9.dir - size: 49160755 + md5: 44c1c25d24094120253253c8872dd954.dir + size: 54668425 nfiles: 2 params: configs/settings.yaml: @@ -162,7 +162,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: fb65d79e41782ebc1f616fa6e0e8bec1 + md5: 3f63ac18e8b2976dd34cdb290611c782 size: 220 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py