From 97b432bac9d911307dbdfd3336da1ded7762ba26 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 28 Jul 2024 11:30:34 +0100 Subject: [PATCH] try new model --- .../configs/feature_processor_logic.py | 29 ++++++++++- .../src/pipeline/configs/settings.yaml | 11 ++-- modules/ml-pipeline/src/pipeline/dvc.lock | 50 +++++++++---------- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c22fc4d..c55af40 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -170,9 +170,36 @@ def add_features_from_code(df): # df = df[keep_columns] # return df +def enforce_minimum_habitable_room_size(df): + # Need minimum of 6.5m per habitable room + df = df[ + df["total-floor-area"] / df["number-habitable-rooms"].astype(float) > 6.5 + ].reset_index(drop=True) + return df + +def round_to_100s(df): + df['heating_kwh'] = (df['heating_kwh']/100).round()*100 + return df + +def remove_high_ratio_of_area_to_rooms(df): + df['area-to-heated-rooms'] = df['total-floor-area'] / df['number-heated-rooms'].astype(float) + + # Remove na rows + df = df[(df['area-to-heated-rooms'].notna())].reset_index(drop=True) + + # change any infinite values to 0 + df['area-to-heated-rooms'] = df['area-to-heated-rooms'].replace([np.inf], 0) + + # Remove top 0.05% of area-to-heated-rooms + df = df[df['area-to-heated-rooms'] < df['area-to-heated-rooms'].quantile(0.9995)].reset_index(drop=True) + return df + business_logic = { "add_features_from_code": add_features_from_code, - "remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile + "remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile, + "round_to_100s": round_to_100s, + "enforce_minimum_habitable_room_size": enforce_minimum_habitable_room_size, + "remove_high_ratio_of_area_to_rooms": remove_high_ratio_of_area_to_rooms # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 7b0f8e8..380bdb9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -23,7 +23,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet + # data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet + data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -78,13 +79,13 @@ default: 'number-open-fireplaces', 'number-heated-rooms', 'lodgement-date', - 'number-habitable-rooms', +# 'number-habitable-rooms', 'windows-description', 'local-authority', 'photo-supply', 'heat-loss-corridor', 'posttown', - 'address', +# 'address', 'flat-top-storey', 'unheated-corridor-length', 'fixed-lighting-outlets-count', @@ -94,7 +95,7 @@ default: 'constituency-label', 'multi-glaze-proportion', 'solar-water-heating-flag', - 'address2', +# 'address2', 'energy-tariff', 'floor-height', 'constituency', @@ -105,7 +106,7 @@ default: 'lodgement-month', 'lighting-cost-current', 'glazed-area', - 'address1', +# 'address1', 'floor-env-eff', 'main-heating-controls'] # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 90e74c8..829628a 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -59,13 +59,11 @@ stages: - number-open-fireplaces - number-heated-rooms - lodgement-date - - number-habitable-rooms - windows-description - local-authority - photo-supply - heat-loss-corridor - posttown - - address - flat-top-storey - unheated-corridor-length - fixed-lighting-outlets-count @@ -75,7 +73,6 @@ stages: - constituency-label - multi-glaze-proportion - solar-water-heating-flag - - address2 - energy-tariff - floor-height - constituency @@ -86,7 +83,6 @@ stages: - lodgement-month - lighting-cost-current - glazed-area - - address1 - floor-env-eff - main-heating-controls default.feature_processor.feature_processor_config.subsample_amount: @@ -94,7 +90,7 @@ stages: default.feature_processor.feature_processor_config.target: heating_kwh default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet + s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -103,8 +99,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 660630d5c4f0f9a371f5c43221a56e39.dir - size: 14486809 + md5: 8585e7f26fa0008dcc0074996a51a78d.dir + size: 18062621 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -115,8 +111,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 660630d5c4f0f9a371f5c43221a56e39.dir - size: 14486809 + md5: 8585e7f26fa0008dcc0074996a51a78d.dir + size: 18062621 nfiles: 2 params: configs/build_model.yaml: @@ -148,18 +144,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 07b5623892769f33837d89bf6fc6702d.dir - size: 726940 + md5: 0f536790b342ee84fe51f5bf66ca4e3c.dir + size: 1545512 nfiles: 1 - path: data/model/ hash: md5 - md5: 6f281b6a422453ec853b1d13cb1920de.dir - size: 345477655 + md5: 0ce09cc5e2d12876d9315cb18f8b70a9.dir + size: 320950858 nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: e6fc8ae0f36b52ce3173515ef75ce526 - size: 223 + md5: 5c38cf3ad988c55fb9685d76c7da78b3 + size: 216 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -169,13 +165,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 6f281b6a422453ec853b1d13cb1920de.dir - size: 345477655 + md5: 0ce09cc5e2d12876d9315cb18f8b70a9.dir + size: 320950858 nfiles: 36 - path: data/prepared_data hash: md5 - md5: 660630d5c4f0f9a371f5c43221a56e39.dir - size: 14486809 + md5: 8585e7f26fa0008dcc0074996a51a78d.dir + size: 18062621 nfiles: 2 params: configs/settings.yaml: @@ -187,8 +183,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 19d3ead23af278c2ccdf4836180d4c15.dir - size: 77471 + md5: 9f32b5e943df8cd9336077b8daf2975c.dir + size: 163552 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -199,13 +195,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 19d3ead23af278c2ccdf4836180d4c15.dir - size: 77471 + md5: 9f32b5e943df8cd9336077b8daf2975c.dir + size: 163552 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 660630d5c4f0f9a371f5c43221a56e39.dir - size: 14486809 + md5: 8585e7f26fa0008dcc0074996a51a78d.dir + size: 18062621 nfiles: 2 params: configs/settings.yaml: @@ -215,8 +211,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 7b62ecaff5b429ef6c31aba95bce9f39 - size: 218 + md5: 752659c808d2bf0f176a0bf1ad7088a1 + size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: