diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index bbf7f23..e24abf5 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -30,9 +30,59 @@ default: subsample_seed: 0 target: hot_water_kwh identifier_columns: ["uprn"] - drop_columns: ["heating_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', - "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1', 'mainheat-env-eff', 'environment-impact-current', 'hot-water-env-eff', 'floor-env-eff'] - retain_features: null + drop_columns: ["heating_kwh"] + retain_features: [ + 'uprn', + 'heating-cost-current', + 'co2-emissions-current', + 'hot-water-cost-current', + 'total-floor-area', + 'secondheat-description', + 'floor-description', + 'mainheat-energy-eff', + 'current-energy-efficiency', + 'walls-energy-eff', + 'roof-energy-eff', + 'property-type', + 'mainheat-description', + 'mechanical-ventilation', + 'floor-level', + 'built-form', + 'walls-description', + 'mainheatcont-description', + 'roof-description', + 'energy-consumption-current', + 'construction-age-band', + 'hotwater-description', + 'main-fuel', + 'hot-water-energy-eff', + 'co2-emiss-curr-per-floor-area', + 'windows-energy-eff', + 'current-energy-rating', + 'lodgement-year', + 'extension-count', + 'number-open-fireplaces', + 'number-heated-rooms', + 'windows-description', + 'photo-supply', + 'heat-loss-corridor', + 'flat-top-storey', + 'unheated-corridor-length', + 'fixed-lighting-outlets-count', + 'tenure', + 'multi-glaze-proportion', + 'solar-water-heating-flag', + 'energy-tariff', + 'floor-height', + 'constituency', + 'transaction-type', + 'floor-energy-eff', + 'lodgement-month', + 'lighting-cost-current', + 'glazed-area', + 'main-heating-controls', + 'estimate_annual_kwh', + ] generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index a52895f..ff64aca 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,24 +22,57 @@ stages: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - heating_kwh - - lodgement-datetime - - lodgement-date - - number-habitable-rooms - - local-authority - - posttown - - address - - inspection-date - - county - - constituency-label - - address2 - - uprn-source - - postcode - - address1 - - mainheat-env-eff - - environment-impact-current - - hot-water-env-eff - - floor-env-eff default.feature_processor.feature_processor_config.retain_features: + - uprn + - heating-cost-current + - co2-emissions-current + - hot-water-cost-current + - total-floor-area + - secondheat-description + - floor-description + - mainheat-energy-eff + - current-energy-efficiency + - walls-energy-eff + - roof-energy-eff + - property-type + - mainheat-description + - mechanical-ventilation + - floor-level + - built-form + - walls-description + - mainheatcont-description + - roof-description + - energy-consumption-current + - construction-age-band + - hotwater-description + - main-fuel + - hot-water-energy-eff + - co2-emiss-curr-per-floor-area + - windows-energy-eff + - current-energy-rating + - lodgement-year + - extension-count + - number-open-fireplaces + - number-heated-rooms + - windows-description + - photo-supply + - heat-loss-corridor + - flat-top-storey + - unheated-corridor-length + - fixed-lighting-outlets-count + - tenure + - multi-glaze-proportion + - solar-water-heating-flag + - energy-tariff + - floor-height + - constituency + - transaction-type + - floor-energy-eff + - lodgement-month + - lighting-cost-current + - glazed-area + - main-heating-controls + - estimate_annual_kwh default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: hot_water_kwh @@ -54,8 +87,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir - size: 35173792 + md5: 19abfc15e24cd8a869a0f3f087e09584.dir + size: 13396685 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -66,8 +99,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir - size: 35173792 + md5: 19abfc15e24cd8a869a0f3f087e09584.dir + size: 13396685 nfiles: 2 params: configs/build_model.yaml: @@ -99,18 +132,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 56598af2325ec699349cdb166b1e807b.dir - size: 1789771 + md5: 0e2f1131b24eafa09ccbab91a36cac6c.dir + size: 1787805 nfiles: 1 - path: data/model/ hash: md5 - md5: ce995d18c2f40aefe1f5757d621bb4d4.dir - size: 592064916 + md5: 77c0900cda64c1beaef6782d04fd712e.dir + size: 518798187 nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 4c169dc1d437e5fea43e47616f46dafc - size: 219 + md5: 0e03bb46cd03e9521095bf141ee92ed7 + size: 220 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -120,13 +153,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: ce995d18c2f40aefe1f5757d621bb4d4.dir - size: 592064916 + md5: 77c0900cda64c1beaef6782d04fd712e.dir + size: 518798187 nfiles: 36 - path: data/prepared_data hash: md5 - md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir - size: 35173792 + md5: 19abfc15e24cd8a869a0f3f087e09584.dir + size: 13396685 nfiles: 2 params: configs/settings.yaml: @@ -138,8 +171,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 590da9bdeb1e1b442e52bce52f1da0dc.dir - size: 192586 + md5: efc7b9dd9d40e42ad93e3e5acbeac92d.dir + size: 192502 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -150,13 +183,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 590da9bdeb1e1b442e52bce52f1da0dc.dir - size: 192586 + md5: efc7b9dd9d40e42ad93e3e5acbeac92d.dir + size: 192502 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 0364b2ef5dd7674f97473fdecf3f3a02.dir - size: 35173792 + md5: 19abfc15e24cd8a869a0f3f087e09584.dir + size: 13396685 nfiles: 2 params: configs/settings.yaml: @@ -166,8 +199,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: b80014eacb59a824aff78667352e7c95 - size: 221 + md5: a8b7699dd2b171da5fead1fb04d954aa + size: 220 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: