diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..84abbe6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/fit_predictions +modules/ml-pipeline/src/pipeline/data/prepared_data +modules/ml-pipeline/src/pipeline/data/model/allmodels +modules/ml-pipeline/src/pipeline/metrics +modules/ml-pipeline/src/pipeline/__pycache__ +modules/ml-pipeline/src/pipeline/.dvc +modules/ml-pipeline/src/pipeline/analysis +modules/ml-pipeline/src/pipeline/metrics diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 6e34d36..265a324 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -19,8 +19,8 @@ jobs: - name: Install Serverless and plugins run: | - npm install -g serverless - npm install -g serverless-domain-manager + npm install -g serverless@^3.38.0 + npm install -g serverless-domain-manager@^7.3.8 - name: Install DVC run: | diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 97c4388..2fea343 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,25 +8,25 @@ "active": true }, "sap": { - "version": "v0.11.0", + "version": "v0.14.0", "stage": { - "dev": "v0.11.0" + "dev": "v0.14.0" }, "registered": true, "active": true }, "heat": { - "version": "v0.4.0", + "version": "v0.5.0", "stage": { - "dev": "v0.4.0" + "dev": "v0.5.0" }, "registered": true, "active": true }, "carbon": { - "version": "v0.4.0", + "version": "v0.5.0", "stage": { - "dev": "v0.4.0" + "dev": "v0.5.0" }, "registered": true, "active": true diff --git a/deployment/.dockerignore b/deployment/.dockerignore index e01cbd5..c4103de 100644 --- a/deployment/.dockerignore +++ b/deployment/.dockerignore @@ -1,4 +1,9 @@ -modules/ml-pipeline/src/pipeline/data/predictions* -modules/ml-pipeline/src/pipeline/data/prepared_data* -modules/ml-pipeline/src/pipeline/data/model/allmodels* -modules/ml-pipeline/src/pipeline/metrics* +modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/fit_predictions +modules/ml-pipeline/src/pipeline/data/prepared_data +modules/ml-pipeline/src/pipeline/data/model/allmodels +modules/ml-pipeline/src/pipeline/metrics +modules/ml-pipeline/src/__pycache__ +modules/ml-pipeline/src/.dvc +modules/ml-pipeline/src/analysis +modules/ml-pipeline/src/metrics diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 14f71d7..bf48a5e 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -1,4 +1,8 @@ -pipeline/data/predictions* -pipeline/data/prepared_data/train.parquet* -pipeline/data/model/allmodels* -pipeline/metrics* +pipeline/data/predictions +pipeline/data/fit_predictions +pipeline/data/prepared_data/train.parquet +pipeline/data/fit_predictions +pipeline/data/model/allmodels +pipeline/metrics +pipeline/.dvc +pipeline/analysis diff --git a/modules/ml-pipeline/src/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile index a6fc539..e0a292c 100644 --- a/modules/ml-pipeline/src/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -1,7 +1,7 @@ # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) FROM python:3.10.12-slim -RUN apt-get update && apt-get install -y libgomp1 +RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev COPY pipeline/requirements/predictions/requirements.txt requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index 2df0cb6..0d4ee07 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -5,6 +5,9 @@ default: scenario_data_filepaths: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index f42b2be..838e9a9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,7 +18,10 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 104dc83..31315db 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -35,52 +35,12 @@ stages: - number_habitable_rooms - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: - - uprn - - sap_starting - - hot_water_energy_eff_ending - - mainheat_energy_eff_ending - - constituency - - roof_energy_eff_ending - - walls_energy_eff_ending - - secondheat_description_ending - - property_type - - mainheatc_energy_eff_ending - - built_form - - walls_insulation_thickness_ending - - potential_energy_efficiency - - transaction_type_ending - - floor_thermal_transmittance_ending - - low_energy_lighting_ending - - heat_demand_starting - - photo_supply_ending - - carbon_starting - - walls_thermal_transmittance_ending - - roof_insulation_thickness_ending - - total_floor_area_ending - - number_open_fireplaces_ending - - windows_energy_eff_ending - - floor_height_ending - - extension_count_ending - - has_air_source_heat_pump_ending - - charging_system_ending - - construction_age_band - - glazed_type_ending - - roof_thermal_transmittance_ending - - floor_insulation_thickness_ending - - has_mains_gas_ending - - estimated_perimeter_starting - - energy_consumption_potential - - environment_impact_potential - - heater_type_ending - - multi_glaze_proportion_ending - - lighting_energy_eff_ending - - fixed_lighting_outlets_count default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -89,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -101,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/build_model.yaml: @@ -134,17 +94,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: de46250d454c4d713ab580b10ff3fd31.dir - size: 3349318 + md5: d9c9afc05e8780db47c0548b19bf7d19.dir + size: 3349989 nfiles: 1 - path: data/model/ hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: 13c3100e1486c27a83a8a47491077842.dir + size: 773523079 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 8a952a5e884c268e6059357a627b9251 + md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a size: 224 generate_predictions: cmd: python 3_generate_predictions.py @@ -155,13 +115,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: 13c3100e1486c27a83a8a47491077842.dir + size: 773523079 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/settings.yaml: @@ -173,8 +133,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 5d07bcebf3160a72bb18dfd79106e85c.dir + size: 463197 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -185,13 +145,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 5d07bcebf3160a72bb18dfd79106e85c.dir + size: 463197 nfiles: 1 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/settings.yaml: @@ -201,30 +161,30 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9f863f47799d42c101eba3b03a179455 - size: 224 + md5: 3e08df02fd5c5d094bcf936e1338d596 + size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: a18f6c6ae2082f038df47386cf3e418e - size: 4896 + md5: 40506749fefd926d47c60ff5b16db307 + size: 5337 params: configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 64e7db945ff655ae03c20c9845f19106 + md5: fa4d6d7bbd7818613800da5f8f37ea96 size: 363 - path: metrics/scenario_table.md hash: md5 - md5: d4f8afe07b774374aeaa48f1b7b8a5fc + md5: d6baf100a1623cc2467c2f8221d314c9 size: 2133 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 734419a..4dc4c36 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 937b000..35bdb05 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index fe06a4d..93a042e 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[all]==1.0.0 ray==2.6.3 dynaconf==3.2.1 alibi==0.9.5 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index a5bccd3..edeb764 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index a2b9531..173550d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.36.0 -dvc-s3==3.0.1 -gto==1.6.1 +dvc==3.51.0 +dvc-s3==3.2.0 +gto==1.7.1 pyOpenSSL==23.3.0