diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index ed2e4a3..d29d7ac 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -192,6 +192,7 @@ def remove_high_ratio_of_area_to_rooms(df): # Remove top 0.05% of area-to-heated-rooms df = df[df['area-to-heated-rooms'] < df['area-to-heated-rooms'].quantile(0.9995)].reset_index(drop=True) + df = df.drop(columns=['area-to-heated-rooms']) return df def add_estimate_annual_kwh(df): diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 28b5f2c..aeb5907 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -36,9 +36,59 @@ default: subsample_seed: 0 target: heating_kwh identifier_columns: ["uprn"] - drop_columns: ["hot_water_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', - "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1',] - retain_features: null + drop_columns: ["hot_water_kwh"] + retain_features: [ + 'uprn', + 'heating-cost-current', + 'co2-emissions-current', + 'hot-water-cost-current', + 'total-floor-area', + 'secondheat-description', + 'floor-description', + 'mainheat-energy-eff', + 'current-energy-efficiency', + 'walls-energy-eff', + 'roof-energy-eff', + 'property-type', + 'mainheat-description', + 'mechanical-ventilation', + 'floor-level', + 'built-form', + 'walls-description', + 'mainheatcont-description', + 'roof-description', + 'energy-consumption-current', + 'construction-age-band', + 'hotwater-description', + 'main-fuel', + 'hot-water-energy-eff', + 'co2-emiss-curr-per-floor-area', + 'windows-energy-eff', + 'current-energy-rating', + 'lodgement-year', + 'extension-count', + 'number-open-fireplaces', + 'number-heated-rooms', + 'windows-description', + 'photo-supply', + 'heat-loss-corridor', + 'flat-top-storey', + 'unheated-corridor-length', + 'fixed-lighting-outlets-count', + 'tenure', + 'multi-glaze-proportion', + 'solar-water-heating-flag', + 'energy-tariff', + 'floor-height', + 'constituency', + 'transaction-type', + 'floor-energy-eff', + 'lodgement-month', + 'lighting-cost-current', + 'glazed-area', + 'main-heating-controls', + 'estimate_annual_kwh', + ] generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index e1f65f5..8fd1507 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,20 +22,57 @@ stages: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - hot_water_kwh - - lodgement-datetime - - lodgement-date - - number-habitable-rooms - - local-authority - - posttown - - address - - inspection-date - - county - - constituency-label - - address2 - - uprn-source - - postcode - - address1 default.feature_processor.feature_processor_config.retain_features: + - uprn + - heating-cost-current + - co2-emissions-current + - hot-water-cost-current + - total-floor-area + - secondheat-description + - floor-description + - mainheat-energy-eff + - current-energy-efficiency + - walls-energy-eff + - roof-energy-eff + - property-type + - mainheat-description + - mechanical-ventilation + - floor-level + - built-form + - walls-description + - mainheatcont-description + - roof-description + - energy-consumption-current + - construction-age-band + - hotwater-description + - main-fuel + - hot-water-energy-eff + - co2-emiss-curr-per-floor-area + - windows-energy-eff + - current-energy-rating + - lodgement-year + - extension-count + - number-open-fireplaces + - number-heated-rooms + - windows-description + - photo-supply + - heat-loss-corridor + - flat-top-storey + - unheated-corridor-length + - fixed-lighting-outlets-count + - tenure + - multi-glaze-proportion + - solar-water-heating-flag + - energy-tariff + - floor-height + - constituency + - transaction-type + - floor-energy-eff + - lodgement-month + - lighting-cost-current + - glazed-area + - main-heating-controls + - estimate_annual_kwh default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: heating_kwh @@ -50,8 +87,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -62,8 +99,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/build_model.yaml: @@ -95,18 +132,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 9a2abeada227b8bb4c13d6c745bef581.dir - size: 1547064 + md5: 5c694b2ec23baca2e7242d3802ba09fe.dir + size: 1546012 nfiles: 1 - path: data/model/ hash: md5 - md5: 43b72f9284e92842cbc82bc7cc0950e2.dir - size: 506201607 - nfiles: 36 + md5: 314c5043b1a421e847af8d3126fba788.dir + size: 278676507 + nfiles: 37 - path: metrics/fit_metrics.json hash: md5 - md5: 4a496483bffad3efe671f29110729e48 - size: 221 + md5: 2e5ab3102b145f5c909232e66210a261 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -116,13 +153,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 43b72f9284e92842cbc82bc7cc0950e2.dir - size: 506201607 - nfiles: 36 + md5: 314c5043b1a421e847af8d3126fba788.dir + size: 278676507 + nfiles: 37 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/settings.yaml: @@ -134,8 +171,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 88832d623c3e437eaec221307ac33aae.dir - size: 163584 + md5: 10b0612c10dfaa78e08ccc673c6f984e.dir + size: 163560 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -146,13 +183,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 88832d623c3e437eaec221307ac33aae.dir - size: 163584 + md5: 10b0612c10dfaa78e08ccc673c6f984e.dir + size: 163560 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: c0cb87bd677d46f4c1a608c6d6ee6110.dir + size: 11062844 nfiles: 2 params: configs/settings.yaml: @@ -162,8 +199,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f2783bdec0f0974b6d799609c6189467 - size: 222 + md5: 22794cfc31f6ffd98eb1db4852677b4f + size: 220 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: