diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 48a28bf..e93b44b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -160,7 +160,9 @@ def add_features_from_code(df): return df - +def add_estimate_annual_kwh(df): + df['estimate_annual_kwh'] = df['energy-consumption-current'] * df['total-floor-area'] + return df # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] @@ -172,7 +174,8 @@ def add_features_from_code(df): business_logic = { "add_features_from_code": add_features_from_code, - "remove_hotwaterkwh_bottom_percentile": remove_hotwaterkwh_bottom_percentile + "remove_hotwaterkwh_bottom_percentile": remove_hotwaterkwh_bottom_percentile, + "add_estimate_annual_kwh": add_estimate_annual_kwh # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 5fb1d1d..bbf7f23 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -30,75 +30,9 @@ default: subsample_seed: 0 target: hot_water_kwh identifier_columns: ["uprn"] - drop_columns: ["heating_kwh"] - retain_features: [ - 'uprn', - 'heating-cost-current', - 'co2-emissions-current', - 'hot-water-cost-current', - 'total-floor-area', - 'secondheat-description', - 'environment-impact-current', - 'floor-description', - 'mainheat-energy-eff', - 'current-energy-efficiency', - 'mainheat-env-eff', - 'walls-energy-eff', - 'roof-energy-eff', - 'property-type', - 'mainheat-description', - 'hot-water-env-eff', - 'mechanical-ventilation', - 'floor-level', - 'built-form', - 'walls-description', - 'mainheatcont-description', - 'roof-description', - 'energy-consumption-current', - 'construction-age-band', - 'hotwater-description', - # 'lodgement-datetime', - 'main-fuel', - 'hot-water-energy-eff', - 'co2-emiss-curr-per-floor-area', - 'windows-energy-eff', - 'current-energy-rating', - 'lodgement-year', - 'extension-count', - 'number-open-fireplaces', - 'number-heated-rooms', - # 'lodgement-date', - # 'number-habitable-rooms', - 'windows-description', - # 'local-authority', - 'photo-supply', - 'heat-loss-corridor', - # 'posttown', - # 'address', - 'flat-top-storey', - 'unheated-corridor-length', - 'fixed-lighting-outlets-count', - # 'inspection-date', - 'tenure', - # 'county', - # 'constituency-label', - 'multi-glaze-proportion', - 'solar-water-heating-flag', - # 'address2', - 'energy-tariff', - 'floor-height', - 'constituency', - # 'uprn-source', - 'transaction-type', - 'floor-energy-eff', - # 'postcode', - 'lodgement-month', - 'lighting-cost-current', - 'glazed-area', - # 'address1', - 'floor-env-eff', - 'main-heating-controls' - ] + drop_columns: ["heating_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', + "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1', 'mainheat-env-eff', 'environment-impact-current', 'hot-water-env-eff', 'floor-env-eff'] + retain_features: null generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 06f4eb4..a52895f 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,60 +22,24 @@ stages: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - heating_kwh - default.feature_processor.feature_processor_config.retain_features: - - uprn - - heating-cost-current - - co2-emissions-current - - hot-water-cost-current - - total-floor-area - - secondheat-description - - environment-impact-current - - floor-description - - mainheat-energy-eff - - current-energy-efficiency + - lodgement-datetime + - lodgement-date + - number-habitable-rooms + - local-authority + - posttown + - address + - inspection-date + - county + - constituency-label + - address2 + - uprn-source + - postcode + - address1 - mainheat-env-eff - - walls-energy-eff - - roof-energy-eff - - property-type - - mainheat-description + - environment-impact-current - hot-water-env-eff - - mechanical-ventilation - - floor-level - - built-form - - walls-description - - mainheatcont-description - - roof-description - - energy-consumption-current - - construction-age-band - - hotwater-description - - main-fuel - - hot-water-energy-eff - - co2-emiss-curr-per-floor-area - - windows-energy-eff - - current-energy-rating - - lodgement-year - - extension-count - - number-open-fireplaces - - number-heated-rooms - - windows-description - - photo-supply - - heat-loss-corridor - - flat-top-storey - - unheated-corridor-length - - fixed-lighting-outlets-count - - tenure - - multi-glaze-proportion - - solar-water-heating-flag - - energy-tariff - - floor-height - - constituency - - transaction-type - - floor-energy-eff - - lodgement-month - - lighting-cost-current - - glazed-area - floor-env-eff - - main-heating-controls + default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: hot_water_kwh @@ -90,8 +54,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 295ac4fd05a1a3373401a7318b0b5186.dir - size: 13131853 + md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir + size: 35173792 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -102,8 +66,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 295ac4fd05a1a3373401a7318b0b5186.dir - size: 13131853 + md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir + size: 35173792 nfiles: 2 params: configs/build_model.yaml: @@ -135,17 +99,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 3e48cec68f702bc822eed8fcb2c5c603.dir - size: 1787931 + md5: 56598af2325ec699349cdb166b1e807b.dir + size: 1789771 nfiles: 1 - path: data/model/ hash: md5 - md5: 37f7480141e920c68faacd39478a1a68.dir - size: 451364406 - nfiles: 35 + md5: ce995d18c2f40aefe1f5757d621bb4d4.dir + size: 592064916 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: e7a2a5efea57b1ddd1431b713d78bb11 + md5: 4c169dc1d437e5fea43e47616f46dafc size: 219 generate_predictions: cmd: python 3_generate_predictions.py @@ -156,13 +120,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 37f7480141e920c68faacd39478a1a68.dir - size: 451364406 - nfiles: 35 + md5: ce995d18c2f40aefe1f5757d621bb4d4.dir + size: 592064916 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: 295ac4fd05a1a3373401a7318b0b5186.dir - size: 13131853 + md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir + size: 35173792 nfiles: 2 params: configs/settings.yaml: @@ -174,8 +138,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 9267a66c6fae4da5a589faab76fac14c.dir - size: 192482 + md5: 590da9bdeb1e1b442e52bce52f1da0dc.dir + size: 192586 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -186,13 +150,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 9267a66c6fae4da5a589faab76fac14c.dir - size: 192482 + md5: 590da9bdeb1e1b442e52bce52f1da0dc.dir + size: 192586 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 295ac4fd05a1a3373401a7318b0b5186.dir - size: 13131853 + md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir + size: 35173792 nfiles: 2 params: configs/settings.yaml: @@ -202,7 +166,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3a08c29f028f5e3cb50fb8cd3608e2f4 + md5: b80014eacb59a824aff78667352e7c95 size: 221 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py