diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index a876541..c22fc4d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -56,6 +56,10 @@ def keep_non_zero_rdsap(df): df = df[df["rdsap_change"] != 0] return df +def remove_heatingkwh_bottom_percentile(df, percentile=0.0001): + df = df[df["heating_kwh"] > df["heating_kwh"].quantile(percentile)] + return df + def add_features_from_code(df): FEATURES = { @@ -167,7 +171,8 @@ def add_features_from_code(df): # return df business_logic = { - "add_features_from_code": add_features_from_code + "add_features_from_code": add_features_from_code, + "remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 14cd48c..90e74c8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -103,8 +103,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: d1ca07d66c3e28c133d0561423e6d2c8.dir - size: 14503223 + md5: 660630d5c4f0f9a371f5c43221a56e39.dir + size: 14486809 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -115,8 +115,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: d1ca07d66c3e28c133d0561423e6d2c8.dir - size: 14503223 + md5: 660630d5c4f0f9a371f5c43221a56e39.dir + size: 14486809 nfiles: 2 params: configs/build_model.yaml: @@ -148,18 +148,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: e08a232adc7f805d5d97ed7e93d667b3.dir - size: 726970 + md5: 07b5623892769f33837d89bf6fc6702d.dir + size: 726940 nfiles: 1 - path: data/model/ hash: md5 - md5: 3daab783532ba88d40eb905ff65b0f1c.dir - size: 400927883 - nfiles: 37 + md5: 6f281b6a422453ec853b1d13cb1920de.dir + size: 345477655 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 9d6a478739e42b2f5f8500de585e9cf9 - size: 212 + md5: e6fc8ae0f36b52ce3173515ef75ce526 + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -169,13 +169,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 3daab783532ba88d40eb905ff65b0f1c.dir - size: 400927883 - nfiles: 37 + md5: 6f281b6a422453ec853b1d13cb1920de.dir + size: 345477655 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: d1ca07d66c3e28c133d0561423e6d2c8.dir - size: 14503223 + md5: 660630d5c4f0f9a371f5c43221a56e39.dir + size: 14486809 nfiles: 2 params: configs/settings.yaml: @@ -187,8 +187,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 98a3db098cf2ad9bf786fb77b0ce643f.dir - size: 77479 + md5: 19d3ead23af278c2ccdf4836180d4c15.dir + size: 77471 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -199,13 +199,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 98a3db098cf2ad9bf786fb77b0ce643f.dir - size: 77479 + md5: 19d3ead23af278c2ccdf4836180d4c15.dir + size: 77471 nfiles: 1 - path: data/prepared_data hash: md5 - md5: d1ca07d66c3e28c133d0561423e6d2c8.dir - size: 14503223 + md5: 660630d5c4f0f9a371f5c43221a56e39.dir + size: 14486809 nfiles: 2 params: configs/settings.yaml: @@ -215,8 +215,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 41ccaa41fd34009602d0df571e6453e9 - size: 219 + md5: 7b62ecaff5b429ef6c31aba95bce9f39 + size: 218 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: