From bcb505084f7454c5c06c221265ec1d1854cec3ca Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 6 Aug 2024 22:18:27 +0100 Subject: [PATCH] use retain features again with remove env features --- .../src/pipeline/configs/settings.yaml | 56 ++++++++- modules/ml-pipeline/src/pipeline/dvc.lock | 115 +++++++++++------- 2 files changed, 127 insertions(+), 44 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index b484049..aeb5907 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -36,9 +36,59 @@ default: subsample_seed: 0 target: heating_kwh identifier_columns: ["uprn"] - drop_columns: ["hot_water_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', - "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1', 'mainheat-env-eff', 'environment-impact-current', 'hot-water-env-eff', 'floor-env-eff'] - retain_features: null + drop_columns: ["hot_water_kwh"] + retain_features: [ + 'uprn', + 'heating-cost-current', + 'co2-emissions-current', + 'hot-water-cost-current', + 'total-floor-area', + 'secondheat-description', + 'floor-description', + 'mainheat-energy-eff', + 'current-energy-efficiency', + 'walls-energy-eff', + 'roof-energy-eff', + 'property-type', + 'mainheat-description', + 'mechanical-ventilation', + 'floor-level', + 'built-form', + 'walls-description', + 'mainheatcont-description', + 'roof-description', + 'energy-consumption-current', + 'construction-age-band', + 'hotwater-description', + 'main-fuel', + 'hot-water-energy-eff', + 'co2-emiss-curr-per-floor-area', + 'windows-energy-eff', + 'current-energy-rating', + 'lodgement-year', + 'extension-count', + 'number-open-fireplaces', + 'number-heated-rooms', + 'windows-description', + 'photo-supply', + 'heat-loss-corridor', + 'flat-top-storey', + 'unheated-corridor-length', + 'fixed-lighting-outlets-count', + 'tenure', + 'multi-glaze-proportion', + 'solar-water-heating-flag', + 'energy-tariff', + 'floor-height', + 'constituency', + 'transaction-type', + 'floor-energy-eff', + 'lodgement-month', + 'lighting-cost-current', + 'glazed-area', + 'main-heating-controls', + 'estimate_annual_kwh', + ] generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 536a3bc..8fd1507 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,24 +22,57 @@ stages: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - hot_water_kwh - - lodgement-datetime - - lodgement-date - - number-habitable-rooms - - local-authority - - posttown - - address - - inspection-date - - county - - constituency-label - - address2 - - uprn-source - - postcode - - address1 - - mainheat-env-eff - - environment-impact-current - - hot-water-env-eff - - floor-env-eff default.feature_processor.feature_processor_config.retain_features: + - uprn + - heating-cost-current + - co2-emissions-current + - hot-water-cost-current + - total-floor-area + - secondheat-description + - floor-description + - mainheat-energy-eff + - current-energy-efficiency + - walls-energy-eff + - roof-energy-eff + - property-type + - mainheat-description + - mechanical-ventilation + - floor-level + - built-form + - walls-description + - mainheatcont-description + - roof-description + - energy-consumption-current + - construction-age-band + - hotwater-description + - main-fuel + - hot-water-energy-eff + - co2-emiss-curr-per-floor-area + - windows-energy-eff + - current-energy-rating + - lodgement-year + - extension-count + - number-open-fireplaces + - number-heated-rooms + - windows-description + - photo-supply + - heat-loss-corridor + - flat-top-storey + - unheated-corridor-length + - fixed-lighting-outlets-count + - tenure + - multi-glaze-proportion + - solar-water-heating-flag + - energy-tariff + - floor-height + - constituency + - transaction-type + - floor-energy-eff + - lodgement-month + - lighting-cost-current + - glazed-area + - main-heating-controls + - estimate_annual_kwh default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: heating_kwh @@ -54,8 +87,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 382d5d02772d4ead3a31fa9420c03417.dir - size: 29570807 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -66,8 +99,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 382d5d02772d4ead3a31fa9420c03417.dir - size: 29570807 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/build_model.yaml: @@ -99,18 +132,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 3ea5e827470bb96408fa70bc45ed6b58.dir - size: 1545844 + md5: 5c694b2ec23baca2e7242d3802ba09fe.dir + size: 1546012 nfiles: 1 - path: data/model/ hash: md5 - md5: cff0bc3b08dfb48fff929bb4991ea2f5.dir - size: 291336375 - nfiles: 35 + md5: 314c5043b1a421e847af8d3126fba788.dir + size: 278676507 + nfiles: 37 - path: metrics/fit_metrics.json hash: md5 - md5: 63b660c30b855ee0d86b0c1be4ad537e - size: 220 + md5: 2e5ab3102b145f5c909232e66210a261 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -120,13 +153,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: cff0bc3b08dfb48fff929bb4991ea2f5.dir - size: 291336375 - nfiles: 35 + md5: 314c5043b1a421e847af8d3126fba788.dir + size: 278676507 + nfiles: 37 - path: data/prepared_data hash: md5 - md5: 382d5d02772d4ead3a31fa9420c03417.dir - size: 29570807 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/settings.yaml: @@ -138,8 +171,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 88c5ad48fd6035600135a4541f2811a8.dir - size: 163584 + md5: 10b0612c10dfaa78e08ccc673c6f984e.dir + size: 163560 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -150,13 +183,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 88c5ad48fd6035600135a4541f2811a8.dir - size: 163584 + md5: 10b0612c10dfaa78e08ccc673c6f984e.dir + size: 163560 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 382d5d02772d4ead3a31fa9420c03417.dir - size: 29570807 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/settings.yaml: @@ -166,8 +199,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f3fd84bd242e9f806aaedbb560f2ac7e - size: 219 + md5: 22794cfc31f6ffd98eb1db4852677b4f + size: 220 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: