diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c55af40..ed2e4a3 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -194,12 +194,17 @@ def remove_high_ratio_of_area_to_rooms(df): df = df[df['area-to-heated-rooms'] < df['area-to-heated-rooms'].quantile(0.9995)].reset_index(drop=True) return df +def add_estimate_annual_kwh(df): + df['estimate_annual_kwh'] = df['energy-consumption-current'] * df['total-floor-area'] + return df + business_logic = { "add_features_from_code": add_features_from_code, "remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile, - "round_to_100s": round_to_100s, + # "round_to_100s": round_to_100s, "enforce_minimum_habitable_room_size": enforce_minimum_habitable_room_size, - "remove_high_ratio_of_area_to_rooms": remove_high_ratio_of_area_to_rooms + "remove_high_ratio_of_area_to_rooms": remove_high_ratio_of_area_to_rooms, + "add_estimate_annual_kwh": add_estimate_annual_kwh, # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index b0102ff..28b5f2c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -36,75 +36,9 @@ default: subsample_seed: 0 target: heating_kwh identifier_columns: ["uprn"] - drop_columns: ["hot_water_kwh"] - retain_features: [ - 'uprn', - 'heating-cost-current', - 'co2-emissions-current', - 'hot-water-cost-current', - 'total-floor-area', - 'secondheat-description', - 'environment-impact-current', - 'floor-description', - 'mainheat-energy-eff', - 'current-energy-efficiency', - 'mainheat-env-eff', - 'walls-energy-eff', - 'roof-energy-eff', - 'property-type', - 'mainheat-description', - 'hot-water-env-eff', - 'mechanical-ventilation', - 'floor-level', - 'built-form', - 'walls-description', - 'mainheatcont-description', - 'roof-description', - 'energy-consumption-current', - 'construction-age-band', - 'hotwater-description', - # 'lodgement-datetime', - 'main-fuel', - 'hot-water-energy-eff', - 'co2-emiss-curr-per-floor-area', - 'windows-energy-eff', - 'current-energy-rating', - 'lodgement-year', - 'extension-count', - 'number-open-fireplaces', - 'number-heated-rooms', - # 'lodgement-date', - # 'number-habitable-rooms', - 'windows-description', - # 'local-authority', - 'photo-supply', - 'heat-loss-corridor', - # 'posttown', - # 'address', - 'flat-top-storey', - 'unheated-corridor-length', - 'fixed-lighting-outlets-count', - # 'inspection-date', - 'tenure', - # 'county', - # 'constituency-label', - 'multi-glaze-proportion', - 'solar-water-heating-flag', - # 'address2', - 'energy-tariff', - 'floor-height', - 'constituency', - # 'uprn-source', - 'transaction-type', - 'floor-energy-eff', - # 'postcode', - 'lodgement-month', - 'lighting-cost-current', - 'glazed-area', - # 'address1', - 'floor-env-eff', - 'main-heating-controls' - ] + drop_columns: ["hot_water_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', + "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1',] + retain_features: null generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 59ef25f..e1f65f5 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,60 +22,20 @@ stages: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - hot_water_kwh + - lodgement-datetime + - lodgement-date + - number-habitable-rooms + - local-authority + - posttown + - address + - inspection-date + - county + - constituency-label + - address2 + - uprn-source + - postcode + - address1 default.feature_processor.feature_processor_config.retain_features: - - uprn - - heating-cost-current - - co2-emissions-current - - hot-water-cost-current - - total-floor-area - - secondheat-description - - environment-impact-current - - floor-description - - mainheat-energy-eff - - current-energy-efficiency - - mainheat-env-eff - - walls-energy-eff - - roof-energy-eff - - property-type - - mainheat-description - - hot-water-env-eff - - mechanical-ventilation - - floor-level - - built-form - - walls-description - - mainheatcont-description - - roof-description - - energy-consumption-current - - construction-age-band - - hotwater-description - - main-fuel - - hot-water-energy-eff - - co2-emiss-curr-per-floor-area - - windows-energy-eff - - current-energy-rating - - lodgement-year - - extension-count - - number-open-fireplaces - - number-heated-rooms - - windows-description - - photo-supply - - heat-loss-corridor - - flat-top-storey - - unheated-corridor-length - - fixed-lighting-outlets-count - - tenure - - multi-glaze-proportion - - solar-water-heating-flag - - energy-tariff - - floor-height - - constituency - - transaction-type - - floor-energy-eff - - lodgement-month - - lighting-cost-current - - glazed-area - - floor-env-eff - - main-heating-controls default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: heating_kwh @@ -90,8 +50,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f5e520d6cc27dcd0d306cfdbebd324ff.dir - size: 10405713 + md5: f506f1f059945c0f014c3f505a63726c.dir + size: 30388447 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -102,8 +62,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: f5e520d6cc27dcd0d306cfdbebd324ff.dir - size: 10405713 + md5: f506f1f059945c0f014c3f505a63726c.dir + size: 30388447 nfiles: 2 params: configs/build_model.yaml: @@ -135,18 +95,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: dee9c58e45081cf5734895a18f31906f.dir - size: 1545644 + md5: 9a2abeada227b8bb4c13d6c745bef581.dir + size: 1547064 nfiles: 1 - path: data/model/ hash: md5 - md5: 2da6dc420a308a31e5450ab24b7d4c40.dir - size: 297721035 - nfiles: 35 + md5: 43b72f9284e92842cbc82bc7cc0950e2.dir + size: 506201607 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 23032c58977677c6790415aa79e48aa8 - size: 216 + md5: 4a496483bffad3efe671f29110729e48 + size: 221 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -156,13 +116,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 2da6dc420a308a31e5450ab24b7d4c40.dir - size: 297721035 - nfiles: 35 + md5: 43b72f9284e92842cbc82bc7cc0950e2.dir + size: 506201607 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: f5e520d6cc27dcd0d306cfdbebd324ff.dir - size: 10405713 + md5: f506f1f059945c0f014c3f505a63726c.dir + size: 30388447 nfiles: 2 params: configs/settings.yaml: @@ -174,8 +134,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: d93b71cd8f21df7928a423db8a2c4e2b.dir - size: 163544 + md5: 88832d623c3e437eaec221307ac33aae.dir + size: 163584 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -186,13 +146,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: d93b71cd8f21df7928a423db8a2c4e2b.dir - size: 163544 + md5: 88832d623c3e437eaec221307ac33aae.dir + size: 163584 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f5e520d6cc27dcd0d306cfdbebd324ff.dir - size: 10405713 + md5: f506f1f059945c0f014c3f505a63726c.dir + size: 30388447 nfiles: 2 params: configs/settings.yaml: @@ -202,8 +162,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f611572ff9273930f0c386903ee2ba63 - size: 217 + md5: f2783bdec0f0974b6d799609c6189467 + size: 222 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: