diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
index 5b15867..f4abfae 100644
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@@ -45,8 +45,46 @@ def keep_non_zero_rdsap(df):
     return df
 
 
-def keep_non_zero_heating(df):
+def keep_non_zero_costs(df):
+    """Keep rows whose six starting/ending cost columns are all positive."""
     df = df[df["heating_cost_ending"] > 0]
+    df = df[df["hot_water_cost_ending"] > 0]
+    df = df[df["lighting_cost_ending"] > 0]
+    df = df[df["heating_cost_starting"] > 0]
+    df = df[df["hot_water_cost_starting"] > 0]
+    df = df[df["lighting_cost_starting"] > 0]
+    return df
+
+
+def clip_bottom_percentile(df, percentile=0.005):
+    """Drop rows at or below a lower quantile of any cost column.
+
+    The threshold for each of the six cost columns is computed once, on the
+    incoming frame; a row is kept only when all six of its cost values are
+    strictly greater than their column's threshold.
+
+    Args:
+        df: DataFrame holding the starting/ending cost columns.
+        percentile: Quantile in [0, 1] used as the exclusive lower bound.
+
+    Returns:
+        The filtered DataFrame.
+    """
+    cost_columns = [
+        "hot_water_cost_starting",
+        "hot_water_cost_ending",
+        "lighting_cost_starting",
+        "lighting_cost_ending",
+        "heating_cost_starting",
+        "heating_cost_ending",
+    ]
+    # quantile() yields the same values describe() reports, without having to
+    # reproduce describe()'s percentile row labels (0.01 is labelled "1%",
+    # whereas f"{percentile*100}%" produces "1.0%").
+    clip_values = df[cost_columns].quantile(percentile)
+    for col in cost_columns:
+        df = df[df[col] > clip_values[col]]
+
     return df
 
 
@@ -59,7 +97,8 @@ def keep_non_zero_heating(df):
 #     return df
 
 business_logic = {
-    "keep_non_zero_heating": keep_non_zero_heating,
+    "keep_non_zero_costs": keep_non_zero_costs,
+    "clip_bottom_percentile": clip_bottom_percentile,
     # "keep_non_zero_rdsap": keep_non_zero_rdsap,
     # "keep_flats": keep_flats,
     # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
index 867807f..0a6e742 100644
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@@ -40,7 +40,7 @@ default: drop_columns: [ "sap_ending", "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "lighting_cost_ending", "heating_cost_ending", - # "days_to_starting", 
"days_to_ending", + "days_to_starting", "days_to_ending", 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', 'number_habitable_rooms', 'number_heated_rooms'] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 7d0eb01..e9b331f 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,6 +29,8 @@ stages: - carbon_ending - lighting_cost_ending - heating_cost_ending + - days_to_starting + - days_to_ending - number_habitable_rooms_starting - number_habitable_rooms_ending - number_heated_rooms_starting @@ -50,8 +52,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 44c1c25d24094120253253c8872dd954.dir - size: 54668425 + md5: b04aeb0ea74da1043ddcc60407f70271.dir + size: 50812259 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -62,8 +64,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 44c1c25d24094120253253c8872dd954.dir - size: 54668425 + md5: b04aeb0ea74da1043ddcc60407f70271.dir + size: 50812259 nfiles: 2 params: configs/build_model.yaml: @@ -95,18 +97,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: e3e06d55135815294afd823385860b44.dir - size: 3443615 + md5: 0ff9b01e2080d5dc04b6111a230db449.dir + size: 3365354 nfiles: 1 - path: data/model/ hash: md5 - md5: de574e373b222cd00435abcd5a174f83.dir - size: 780954025 - nfiles: 35 + md5: 298342a6f5430b4e351a3ceb960609f9.dir + size: 790012901 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: a4c1c6ca2672cbcae18e5e38ee222bfb - size: 221 + md5: aa45e26b3967d8c4bad7db519c51168e + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -116,13 +118,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: de574e373b222cd00435abcd5a174f83.dir - size: 780954025 - nfiles: 35 + md5: 298342a6f5430b4e351a3ceb960609f9.dir + size: 790012901 + 
nfiles: 36 - path: data/prepared_data hash: md5 - md5: 44c1c25d24094120253253c8872dd954.dir - size: 54668425 + md5: b04aeb0ea74da1043ddcc60407f70271.dir + size: 50812259 nfiles: 2 params: configs/settings.yaml: @@ -134,8 +136,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: dda695b3bd58ada967a2936faf8e4063.dir - size: 480519 + md5: 46ad00cec65838825064617bb117d512.dir + size: 467041 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -146,13 +148,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: dda695b3bd58ada967a2936faf8e4063.dir - size: 480519 + md5: 46ad00cec65838825064617bb117d512.dir + size: 467041 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 44c1c25d24094120253253c8872dd954.dir - size: 54668425 + md5: b04aeb0ea74da1043ddcc60407f70271.dir + size: 50812259 nfiles: 2 params: configs/settings.yaml: @@ -162,8 +164,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3f63ac18e8b2976dd34cdb290611c782 - size: 220 + md5: f4fb877b32bad8f49560b219fd85cd20 + size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: