From 8cabd8fc7307318d6287978132454369c1b5153e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 5 Aug 2024 23:05:17 +0100 Subject: [PATCH 1/2] Add new hotwaterkwh model, with removed features --- .../src/pipeline/configs/settings.yaml | 100 +++++++++++------ modules/ml-pipeline/src/pipeline/dvc.lock | 103 +++++++++++++----- 2 files changed, 147 insertions(+), 56 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 571682f..5fb1d1d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,12 +18,7 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet + data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -35,32 +30,75 @@ default: subsample_seed: 0 target: hot_water_kwh identifier_columns: ["uprn"] - # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] drop_columns: ["heating_kwh"] - retain_features: null - # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', - # 
'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', - # 'walls_energy_eff_ending', 'secondheat_description_ending', - # 'property_type', 'mainheatc_energy_eff_ending', 'built_form', - # 'walls_insulation_thickness_ending', 'potential_energy_efficiency', - # 'transaction_type_ending', - # 'floor_thermal_transmittance_ending', - # 'low_energy_lighting_ending', 'heat_demand_starting', - # 'photo_supply_ending', 'carbon_starting', - # 'walls_thermal_transmittance_ending', - # 'roof_insulation_thickness_ending', - # 'total_floor_area_ending', 'number_open_fireplaces_ending', - # 'windows_energy_eff_ending', - # 'floor_height_ending', - # 'extension_count_ending', - # 'has_air_source_heat_pump_ending', - # 'charging_system_ending', 'construction_age_band', 'glazed_type_ending', - # 'roof_thermal_transmittance_ending', - # 'floor_insulation_thickness_ending', 'has_mains_gas_ending', - # 'estimated_perimeter_starting', 'energy_consumption_potential', - # 'environment_impact_potential', 'heater_type_ending', - # 'multi_glaze_proportion_ending', - # 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count'] + retain_features: [ + 'uprn', + 'heating-cost-current', + 'co2-emissions-current', + 'hot-water-cost-current', + 'total-floor-area', + 'secondheat-description', + 'environment-impact-current', + 'floor-description', + 'mainheat-energy-eff', + 'current-energy-efficiency', + 'mainheat-env-eff', + 'walls-energy-eff', + 'roof-energy-eff', + 'property-type', + 'mainheat-description', + 'hot-water-env-eff', + 'mechanical-ventilation', + 'floor-level', + 'built-form', + 'walls-description', + 'mainheatcont-description', + 'roof-description', + 'energy-consumption-current', + 'construction-age-band', + 'hotwater-description', + # 'lodgement-datetime', + 'main-fuel', + 'hot-water-energy-eff', + 'co2-emiss-curr-per-floor-area', + 'windows-energy-eff', + 'current-energy-rating', + 'lodgement-year', + 'extension-count', + 'number-open-fireplaces', + 
'number-heated-rooms', + # 'lodgement-date', + # 'number-habitable-rooms', + 'windows-description', + # 'local-authority', + 'photo-supply', + 'heat-loss-corridor', + # 'posttown', + # 'address', + 'flat-top-storey', + 'unheated-corridor-length', + 'fixed-lighting-outlets-count', + # 'inspection-date', + 'tenure', + # 'county', + # 'constituency-label', + 'multi-glaze-proportion', + 'solar-water-heating-flag', + # 'address2', + 'energy-tariff', + 'floor-height', + 'constituency', + # 'uprn-source', + 'transaction-type', + 'floor-energy-eff', + # 'postcode', + 'lodgement-month', + 'lighting-cost-current', + 'glazed-area', + # 'address1', + 'floor-env-eff', + 'main-heating-controls' + ] generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 6062508..06f4eb4 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -23,12 +23,65 @@ stages: default.feature_processor.feature_processor_config.drop_columns: - heating_kwh default.feature_processor.feature_processor_config.retain_features: + - uprn + - heating-cost-current + - co2-emissions-current + - hot-water-cost-current + - total-floor-area + - secondheat-description + - environment-impact-current + - floor-description + - mainheat-energy-eff + - current-energy-efficiency + - mainheat-env-eff + - walls-energy-eff + - roof-energy-eff + - property-type + - mainheat-description + - hot-water-env-eff + - mechanical-ventilation + - floor-level + - built-form + - walls-description + - mainheatcont-description + - roof-description + - energy-consumption-current + - construction-age-band + - hotwater-description + - main-fuel + - hot-water-energy-eff + - co2-emiss-curr-per-floor-area + - windows-energy-eff + - current-energy-rating + - lodgement-year + - extension-count + - number-open-fireplaces + - number-heated-rooms + - windows-description + - photo-supply + - 
heat-loss-corridor + - flat-top-storey + - unheated-corridor-length + - fixed-lighting-outlets-count + - tenure + - multi-glaze-proportion + - solar-water-heating-flag + - energy-tariff + - floor-height + - constituency + - transaction-type + - floor-energy-eff + - lodgement-month + - lighting-cost-current + - glazed-area + - floor-env-eff + - main-heating-controls default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: hot_water_kwh default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet + s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -37,8 +90,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 322c8294651dea6c4db9e06157a91ffd.dir - size: 23387145 + md5: 295ac4fd05a1a3373401a7318b0b5186.dir + size: 13131853 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -49,8 +102,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 322c8294651dea6c4db9e06157a91ffd.dir - size: 23387145 + md5: 295ac4fd05a1a3373401a7318b0b5186.dir + size: 13131853 nfiles: 2 params: configs/build_model.yaml: @@ -82,18 +135,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: b149b2be5ed3105e73b02000b9912422.dir - size: 724848 + md5: 3e48cec68f702bc822eed8fcb2c5c603.dir + size: 1787931 nfiles: 1 - path: data/model/ hash: md5 - md5: 3fe37e27b51fe6d9472252f219fd9126.dir - size: 465478726 - nfiles: 36 + md5: 37f7480141e920c68faacd39478a1a68.dir + size: 451364406 + nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: c27dcce525b763fa7c2c55820ae72727 - size: 225 + 
md5: e7a2a5efea57b1ddd1431b713d78bb11 + size: 219 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -103,13 +156,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 3fe37e27b51fe6d9472252f219fd9126.dir - size: 465478726 - nfiles: 36 + md5: 37f7480141e920c68faacd39478a1a68.dir + size: 451364406 + nfiles: 35 - path: data/prepared_data hash: md5 - md5: 322c8294651dea6c4db9e06157a91ffd.dir - size: 23387145 + md5: 295ac4fd05a1a3373401a7318b0b5186.dir + size: 13131853 nfiles: 2 params: configs/settings.yaml: @@ -121,8 +174,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 07db4158559475e73ffb06ff95a6c869.dir - size: 77435 + md5: 9267a66c6fae4da5a589faab76fac14c.dir + size: 192482 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -133,13 +186,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 07db4158559475e73ffb06ff95a6c869.dir - size: 77435 + md5: 9267a66c6fae4da5a589faab76fac14c.dir + size: 192482 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 322c8294651dea6c4db9e06157a91ffd.dir - size: 23387145 + md5: 295ac4fd05a1a3373401a7318b0b5186.dir + size: 13131853 nfiles: 2 params: configs/settings.yaml: @@ -149,8 +202,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: db8eddb1bb0b190188e25de65bdbd8e8 - size: 220 + md5: 3a08c29f028f5e3cb50fb8cd3608e2f4 + size: 221 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: From ec7149e56e69d78c9a3742804c769fc623bf6178 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 5 Aug 2024 23:09:25 +0100 Subject: [PATCH 2/2] change readme --- modules/ml-pipeline/README.MD | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/modules/ml-pipeline/README.MD b/modules/ml-pipeline/README.MD index 2711569..a0f798e 100644 --- a/modules/ml-pipeline/README.MD +++ b/modules/ml-pipeline/README.MD @@ -17,14 +17,15 @@ Within `src` folder, the structure is as follows: # How to develop using this pipeline: -Run `make init`, 
which will: -- Download pyenv (Python version management) -- Download Python 3.X.X as defined in the `make` file - current 3.10.12 -- Create a virtual environment with this version of python +First, download miniconda to use conda to manage Python Environments +Run `conda init`, to initialise your terminal + +Change to this directory and run `make init`, which will: +- Create a conda virtual environment with this version of python - current 3.10.12 - Install packages in the training and version control directories in the pipeline folder (dev version if applicable) - Install pre-commit to enable pre-commit hooks -To use the environment, run `source .dev_env_pipeline/bin/activate`. +To use the environment, run `conda activate dev_env_pipeline` To enable the virtual envrionemnt created in vscode: - Open settings