From f92c97f6cfb832feff1fee4a77a484251bf3f81f Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 9 Feb 2024 16:19:47 +0000 Subject: [PATCH 01/53] drop days_starting and days_ending --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 56 ++++++++++--------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..f4770a7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,7 +14,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4327e64..98cf6dc 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -35,7 +35,7 @@ default: subsample_seed: 0 target: sap_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index f15978f..9bb73b8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 1793a35e71751d3c84f9affc67ecb9a8 - size: 4296 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -15,6 +15,8 @@ stages: - rdsap_change - heat_demand_ending - carbon_ending + - days_to_starting + - days_to_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 @@ -29,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: f85c36a5dfd31a897538b3934d5fb997.dir + size: 41375196 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: f85c36a5dfd31a897538b3934d5fb997.dir + size: 41375196 nfiles: 2 params: configs/build_model.yaml: @@ -59,7 +61,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: - RF @@ -73,18 +75,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: ede187e9d0bffdef054f573f3c2bd222.dir - size: 3578590 + md5: 991e6c55826953aa7c2be573369ec96f.dir + size: 3574047 nfiles: 1 - path: data/model/ hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f8a8b7462831bd46b1e2df47d73bb69d.dir + size: 391430703 + nfiles: 23 - path: metrics/fit_metrics.json hash: md5 - md5: c45b84f12971a0156e4f3d85d3e725f5 - size: 218 + md5: 35a66a845854cc6fee9dd10860e216bb + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -94,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f8a8b7462831bd46b1e2df47d73bb69d.dir + size: 391430703 + nfiles: 23 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: f85c36a5dfd31a897538b3934d5fb997.dir + size: 41375196 nfiles: 2 params: configs/settings.yaml: @@ -112,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 94b7381ac318b1ca18e0bc086778f7ce.dir + size: 626160 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -124,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 94b7381ac318b1ca18e0bc086778f7ce.dir + size: 626160 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: f85c36a5dfd31a897538b3934d5fb997.dir + size: 41375196 nfiles: 2 params: configs/settings.yaml: @@ -140,8 +142,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 033efa4d4044b6b6fc92dd37194727fa - size: 225 + md5: 4d8681f7c0f41f97be52d6b1ae039c5b + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From a98fc9d93a3ab95de5cd97b22dc947600ca83b55 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 9 Feb 2024 16:27:01 +0000 Subject: [PATCH 02/53] Update Registry --- MODEL_REGISTRY.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 1bcceec..86ee6be 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.5.0", + "version": "v0.6.0", "stage": { "dev": "v0.5.0" }, @@ -16,17 +16,17 @@ "active": true }, "heat": { - "version": "v0.3.0", + "version": "v0.4.0", "stage": { - "dev": "v0.3.0" + "dev": "v0.4.0" }, "registered": true, "active": true }, "carbon": { - "version": "v0.3.0", + "version": "v0.4.0", "stage": { - "dev": "v0.3.0" + "dev": "v0.4.0" }, "registered": true, "active": true From f17119382b1a095ec8ad080bcb5e7768fb5b89d0 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 9 Feb 2024 16:27:45 +0000 Subject: [PATCH 03/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 86ee6be..126d696 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.6.0", "stage": { - "dev": "v0.5.0" + "dev": "v0.6.0" }, "registered": true, "active": true From 778bff37fb7808518cf570b321ab00f7bccd7726 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 9 Feb 2024 18:46:19 +0000 Subject: [PATCH 04/53] 4000 model --- .../src/pipeline/configs/build_model.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index f4770a7..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,7 +14,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 4000 presets: medium_quality excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 9bb73b8..69729e0 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f85c36a5dfd31a897538b3934d5fb997.dir - size: 41375196 + md5: cfaebbb77306750fcc9a39adbb40015b.dir + size: 41367957 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: f85c36a5dfd31a897538b3934d5fb997.dir - size: 41375196 + md5: cfaebbb77306750fcc9a39adbb40015b.dir + size: 41367957 nfiles: 2 params: configs/build_model.yaml: @@ -61,7 +61,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 4000 presets: medium_quality excluded_model_types: - RF @@ -75,17 +75,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 991e6c55826953aa7c2be573369ec96f.dir - size: 3574047 + md5: f15cbb7486924de81c5bf032b5ca962d.dir + size: 3572461 nfiles: 1 - path: data/model/ hash: md5 - md5: f8a8b7462831bd46b1e2df47d73bb69d.dir - size: 391430703 - nfiles: 23 + md5: 5d41efafd16cda31e10a0ca1a0a19759.dir + size: 798325885 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: 35a66a845854cc6fee9dd10860e216bb + md5: 7790bc6b081a2c933547f67c843fef10 size: 225 generate_predictions: cmd: python 3_generate_predictions.py @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f8a8b7462831bd46b1e2df47d73bb69d.dir - size: 391430703 - nfiles: 23 + md5: 5d41efafd16cda31e10a0ca1a0a19759.dir + size: 798325885 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: f85c36a5dfd31a897538b3934d5fb997.dir - size: 41375196 + md5: cfaebbb77306750fcc9a39adbb40015b.dir + size: 41367957 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 94b7381ac318b1ca18e0bc086778f7ce.dir - size: 626160 + md5: 0f476eaeaca81fbc7fac9400c77f653a.dir + size: 626833 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 94b7381ac318b1ca18e0bc086778f7ce.dir - size: 626160 + md5: 0f476eaeaca81fbc7fac9400c77f653a.dir + size: 626833 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f85c36a5dfd31a897538b3934d5fb997.dir - size: 41375196 + md5: cfaebbb77306750fcc9a39adbb40015b.dir + size: 41367957 nfiles: 2 params: configs/settings.yaml: @@ -142,7 +142,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 4d8681f7c0f41f97be52d6b1ae039c5b + md5: e000a99df92dc0c7ca86b020cbcd6b5b size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py From bc44376e0782cf53172ddf3ab849b7bdf2d52284 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 9 Feb 2024 18:53:22 +0000 Subject: [PATCH 05/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 126d696..3cf3a2e 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.6.0", + "version": "v0.7.0", "stage": { "dev": "v0.6.0" }, From 8a1e2958b4773ca54f9a4a096327465118312ac7 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 9 Feb 2024 18:54:16 +0000 Subject: [PATCH 06/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 3cf3a2e..f0ea2bb 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.7.0", "stage": { - "dev": "v0.6.0" + "dev": "v0.7.0" }, "registered": true, "active": true From eeb653c0415e8c6a24144c5a98af76fb16f8f525 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 10 Feb 2024 11:03:38 +0000 Subject: [PATCH 07/53] new model --- .../src/pipeline/configs/settings.yaml | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 98cf6dc..19b0a5b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,8 +22,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 69729e0..826e654 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,7 +22,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: cfaebbb77306750fcc9a39adbb40015b.dir - size: 41367957 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: cfaebbb77306750fcc9a39adbb40015b.dir - size: 41367957 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/build_model.yaml: @@ -75,17 +75,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: f15cbb7486924de81c5bf032b5ca962d.dir - size: 3572461 + md5: e0a11ac6e4adf69d6180c0217c639a0e.dir + size: 3680908 nfiles: 1 - path: data/model/ hash: md5 - md5: 5d41efafd16cda31e10a0ca1a0a19759.dir - size: 798325885 + md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir + size: 805896324 nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: 7790bc6b081a2c933547f67c843fef10 + md5: 0ed5b1141bbb8bc3156e7c056b29f3cd size: 225 generate_predictions: cmd: python 3_generate_predictions.py @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 5d41efafd16cda31e10a0ca1a0a19759.dir - size: 798325885 + md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir + size: 805896324 nfiles: 31 - path: data/prepared_data hash: md5 - md5: cfaebbb77306750fcc9a39adbb40015b.dir - size: 41367957 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0f476eaeaca81fbc7fac9400c77f653a.dir - size: 626833 + md5: 38707d16ae1e2330cc03f524db9cdd60.dir + size: 648730 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 0f476eaeaca81fbc7fac9400c77f653a.dir - size: 626833 + md5: 38707d16ae1e2330cc03f524db9cdd60.dir + size: 648730 nfiles: 1 - path: data/prepared_data hash: md5 - md5: cfaebbb77306750fcc9a39adbb40015b.dir - size: 41367957 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/settings.yaml: @@ -142,8 +142,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: e000a99df92dc0c7ca86b020cbcd6b5b - size: 224 + md5: 145e7ac84ab4a4407b23695a632b4d91 + size: 226 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From d21fd1c4e8a52ee2d969ca9b82ca98b57c2b6295 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 12 Feb 2024 18:33:28 +0000 Subject: [PATCH 08/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index f0ea2bb..6c4d91c 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.7.0", + "version": "v0.8.0", "stage": { "dev": "v0.7.0" }, From babbc155e9bd7adc5fc08817d7b3b9772ed227bb Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 12 Feb 2024 18:34:09 +0000 Subject: [PATCH 09/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 6c4d91c..5c1c431 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.8.0", "stage": { - "dev": "v0.7.0" + "dev": "v0.8.0" }, "registered": true, "active": true From 603dfe2eab8b3625d4d17e9a9cc7c0c2b86fb28a Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 15 Feb 2024 15:10:49 +0000 Subject: [PATCH 10/53] new model with starting and ending rooms --- .../analysis/feature_importance.parquet | Bin 6080 -> 6758 bytes .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 3 +- modules/ml-pipeline/src/pipeline/dvc.lock | 52 +++++++++--------- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet index 6960946afcbe8499ae94f5d5b47c2554cf117f77..ec8b0d37671d244abd59d06f908271ac8dff1a60 100644 GIT binary patch delta 4632 zcmb`LZCDf6w#R4Z1t*XR2@(hj$bdmWi3XG^wQNI$3JPtoQl-{$k_==t$;6on2uDcM zYOB>&5w%rYHLYmXs)Yc%sd~iPv+X_J=YF^!?tSi^FTa`D zd-ls(Yp?ZxX5YZ;gZ-~qO3ZqNewALoUOz^ETBgS$FziHV*UQQwMD4@Vha*RPbfs?k z$~47@u&Az_q_^cGHm(@3ZhX~an6~oNy2QubaLwIW$qPK)u)nfp{Vm%AI3)D7!uF95 zVBN~mgI?^s4`VN+?Vfh|52(2j{q_R)AMjkLzW&78F4)yuMK}0`F1Tjsu(a0|U2x0E z;?u34{tnlTm~VZt_;>i_t1nI%)9-g^{CHkdzs^orc{cNxl$)J!xozF0qt`m&EAliS zR?rEze6oGth37lrkK^X6r(}1+!D-ctYROLcFn{t{aH#`+{UUs+^}7yueS6-rBcF9Z zYSZqu1M4~<`(%yE-#ViM7Be?q?e=uQ(!u>U_px`t?KKY%5wCZ^X-74yZ@$(6Yd6O1 zIyANeF1>m7wPlEBW^M~{7k9vWVCx+*vjZM{Fym2ZdgNg!A#?+h6!6GxYykL+pUcf zWcmG{8orQVP211oy4RzQC%P(ju9M(`TB6^vPf&kdLg$0U68w@G6LF#ewHNIEX6rl& z)=oMM#&?l}|E-jQI&l$#&h{1SXOPCiXrEx}Q@pHEKnN${RFb$`1@g10{O-+8cs zmEirotLYXyTA^q?K7^9s`tTF|GG3QpV)De%P2(kamUe#*Uz6aW!&JDrM1r4t+X~ZO zkzn*F*YzLsCFuD4+7VTr1Pz|WC)#t-_wSp2K91O2w=bzHTY`P7CftcZoIP*oXXmpd zSa!yvn2h+T|4+wrn+Hp9(k0LIotbFDu`NpyGbFeIKRbD@S%QV0_dH(@l;BL`UdCub z7WqXO6^}ToAuL9RIH>*~WC&v2`FV|P{UsQ@AHJp+V&0zZHBy=cQ|msO8=5M?#=@Nu z#r-6>aI`5NASRyw)$U7{px@NK>qrlDGjdp;%U>HMsDC6&Pfn8Ha1;C9tVHx)KkTIm z2@+&f=eIwKm!Ool{?wj0+=P!@^QYAr!arOs;aS4vi zd~)Mh5459Ax=qUv6S%K(TSFw6wDyZ9C5SU-r4$t+Dz!5wB|m-M|MOqhm1rU9f4HY0 zZamc3+NzM?gsMY_7a;c9P`}rRcycK-`n(*CJ0!Th32`HRt9z>--6-pt>;9BUuqtHc z{7ytw8hj^+7?!a}szgjodU7%Wv3B@B)xQG?K02B?|0Lq}%^lI7BAy`)<0}#QDP?b$ zA%55^zwKL0f-U!sMQ=b{w7JN=4DsIP?+-K}&fT5f@d9F2%UA9GRCM$Gq03hchzGu# z)jt9;`{>E#3PkMh3z_P#r`J9B=f8P$*D`MtdfqfN>h?m!Ux=Zzh9Tzl2vSd;+=H#; zt(%`ACVd(G0`ccPn7RJ0{;$XPV0>fE&hGAeP><>F$n;&Z0s7`#xn8Re(TBtu@Mrnf zfIsUOg{AyAZ_%Q@2GVvpVzV6JsSO}}l{RrzNl^ajXR906to?jl^7kF|NJv%v( zs-i`Prx*{zS5#9>c{#dupypnv*-f(^Cqs+*@fau?W#>GC$cjG3W3P@vtvttp7qUlT z3DM}Gigvk#{5%Yc0}Hi&2K99)nE?()u=A`}WH}F|nmK|%9lbo~Wq7fg605xo*rr>I z+4HT~&`{K5^RX_GD(84NsjrD-J%Z0gD<{N?PS!rv!w6tLxB*HMyqw5*M3#2NmEf$M z1?RyYgUHh!fwrq4js_(@@hf7nQWBUak4!|vmb*BPr$i^iyJ;7tEHJoPQKUU~T|TKO zc?alTB4^_+u1dFD?P4pOGCKtsi=zae*Xyb#2KEg{ zBim`-#(DHr=i#ru)%SsLC!W0N`ZFE}3pb->07OGt-fc1RALAoX=yb#fCa*fudQS z5;z}kXDDNr+Us+BiB-zZoN~FT3>#{9(tHJSJ0(;LBI72W!}Jy{eMrIk3a*T#D;Ua# z41u%-R^g&+7>B77&?QD+$yJPa6&O?T;>mz8=IzAMG-QO@FHjY<&OnbF&Y*~KqX2VI z$my(`39Qr|RVJjK1UnTz_nd*g{i=lFl`PMB+{mXC>-N%iF(v^sSho>*0ZBKJ4B)hf z0~axBai!MAAUBa3l2ruwmq9UZR-iXv!h}iS4~2)zuEK&eU?MAc zS#!T}{jStZfR5+1vmYVUE-Vn4-n5s1)28 z%2nh~po5%VoTbdW9K0Nia3}apVTwR=%r_VX=JZ7#MXpD2V7DfbacIkxx^Un{i{3I> z?d2IE5MVW6m*p$Kkp)CF`ARJ7K_7T%m&v454Y3p_(7x%ci>3K$Vmv0029jBKFybyg zYeg(FzN3n9xqwYe+qg=`(nPcty=;D+^e>4;UPq_Z=SeTn3Mi-;(3I=Nh$@6CaM=m9s7v3Vl0W$%@sMRIsAH zT9Xy3MlD{@reGXEQ{4<6f&uy?#KWRHDl0}cg#;r&nie^c;Z^nI8{~r2PIsGu(M;L~ue$4Fflb?e()GlvDY>oxX zc4A;hd6L{>C~)X6;3x~w4*{j$oxd0(+upE(dIMUIiz^pNVGzn2JWZ9O)aT=wyi)M8 zf@8PT5ZUh9z&VhRyF`u|iHiz{PctJ_b1pR2&7!J-PK0J;n7WGN9O>IE1js#n&~GUM zn!qG8r%{GAF2%gZ5~2vaEzVx!pe0wVD#_PR3oo9Lxe{BY}GvS43~T;BesB zkh9f3@{(UGa4wqfRid#husp$$GdsK>8)G##TqmfkmTP*K+Qn4}f!#ot-$zD{HWer> zPLLKUxVfo}OLLAOk1L(|R~Ac4xpgUTR;g{7 zgQjjF65Onv=hE$5Mbmpxf$+F13Wm-jal!8A632z>*2j8TFC$Q_$2u9s<}tlyY`D2c z76c|80e*;dIIAc-XOf3vA#?8FcFtz8gT*OG1MsMNS2F0hEf2gEmmTa`jN+4MCnj|vHuz8l;y7Dft!&&@bA7j+mFq#I{rLVfEnr-xrAb$%-?ttBiS1PeCyjCnsxO6_ zRy>yW^HV7&O5)&hi2#Z+!DWmotP3x&HkK*`N?%MwgLlC<^h2U&{hqvy!Ji1jw6;kU zAo|tkOiLX0CP7+O0HHKOmTkmJzmXG0-9r7;?4BV+KhSMzkP*?k*+VX7gv$scmbPVt z72M6tKBXW`SpOUoRyI&Z45a79M10`UdwfQVn8}AbFx|g$F z?m@&gwb7yNXBxK^NxdLW^d|ck(pb@4-y-uhTWN^(wf6BufGk9bdc~?9U=G=P=+mS7 zw~F`=g@|X^GB^uM>5VkY2qapB|0*e;X--2Fs^(9X81hzhj=g$A!dg0=ww#!%A z(WmH2{W(#&7bQEN=eUyn>))I0Xy>-pa#HjDL7fq&_7>-J>XX)Kec}t1@o~NCIFHtLM(gy`vC}FCt*>@!XVdvQoqz6kzx&D+Ui8q*M9xaT&;+dBZC+B7(^t0~qSx)e zU%wZ#b>iPno2QEC+OGD~SEh()>*j}n+2?Xa^s7v}{t%}1#@&eLIU)-BCgu<4FprL2 z55ActqOXXoHuno6N_b?=I+-q_il&QA)tGDU4aWT?SwtIFrpEX&Czu-V^u>uNI&bQ@ z&6t;;4u;z?w+$YZJ(?h*sF`cOJsT^c)r)fqHphslcw=C-a=%7I#N(kqhN(qld}H^5 z6qSgYLshBYDMfUmHK_QTC=r!}t^3V9g^04ZuX>h>IWDK?U}mIN4xT-h_7REY^DF6JVD26UPG7i)5-;E9OEL3B)ps*7_uaVu4Bjyg*`!CMN+TYsZ|oc**@rOm@w~tRhU`e;=Hbk4YR!EKkQw_;6!h66U$*GiGUk7(mM# z<@;24A#`){umW=v@%9}Ov)1t>FvfxKG!4WA-~Vt3g4bVo^xX)mN)O~e#PkNgw0IPA zThG(wXD}=6GrxEjbL%tL`Q4ab7m)=n%#Tk-Ee^p{&kL#A_Iw20zdCNh%IAR*v?nZ< z58#E;>(0=4%!+-5QtP)PXo@~ot-~h|6~M(etfHQ z*W(ei{qD`~9hjBl{%YIuU<8dHah2{F9YKQy*9$&f=#|Aw zeiB&Pz3kQHD^{)oAnhw{M7+&{zI=tjKkfgAdV`Y8^>*EC% z-krC$xj3&Pr7#o=mDK7D7Di_>SH{E7W@$@J#ono2s;COMU83;D74ntvm#ZtRiKTz&kh(`6_iOea!RN(@w_D!FI(Jx>4Ee z5Cjh+`?{e~0{6700ZK`*E&-OzVB8+T&3SxXOjCe!2%Ylm21yDIJ(z8v8?Ey zrG8J~7{G+As`;y+8V*L_ zS!e^JGnh#$qrMdDsnZ%QdIP&Olc((AHBa#_|25JIjX@I`}%jEOS1 zGl}&t+Z{NXTxzwE~0}Xb^5v5vg=S|si@JZh?N`*C)E0g@ecHJskW@~W=q+@KLCfEv5~aMWF%d!@{ukkJw@{%td0UyDU4N>6>XRFM*x1f z9W)Pjum#ibsXQ#x%&|T{4iG$^9|cH1e5T<8UdACb`=Bnn3AVSXI&^)Y|73O_7z;^z zOaDm&_Kufz*|5`H9D~!zfp3(IB!Y2<#}iheiG7`&8R=s@Y%}YEyZd87mO5GfY*)lH zBF^b|_;|O2lij)C9-gDP+^>Qgbh`mL0>)>(9^T1%oZR9-#w#$o`^e@g@@V?!G0OY_ zxZ~R-97nHHXyY6ai{)@X`%XAE!c8M7XM=MpdV|0?`E%#G75N~6+!Ai{vksYl7USey zoCghM&&{~>-YK^cgPfzrs2!WdhlKI;&R8?Qf=8r zKu5p}Nvz9f$90hRRWsdJ^H%@@mp`f=W`{RT&1oao11hUN6MyA*;>>Kuo}{N~OL7Cz z-CDwC?;MQ)Dde<5H}B@W4DVvRRu9M7)TI3QJgT>8lLT6Z2#uY^cY(7}+rvFLzZEu0 zi1(5MbZ&MUAfw59;n>#;_3{0Xhy$Q5BZ$naJf5Ei$T(=H-^u-ujBZCMPTuMfvaCW2 zM=y`EhVuc1g4#TuRc5~ja|Lxv*vu;-q;BKqx)zYPNHg}@HaWO_9z0vuolkwPJx_nC z9hGt{EeppM3KQp&uni8Uw{$Tc&WfudPKgyDI_q}6RwBDZHNeKkGZIBo<1q>J%Vs51 zk#MW5p}!Bt)?XPahEa97P!VWVRS;H=kJ=Ij&0`*;b%fRK?)i`EE6@^W!!GvlP5#v5 zJ*^pMBW4hhtlRD2U$`0pr@)o94|4G9g&pPN7_WnEYTv6$h_!{KVE^I@*o2>u;VZBn zYEw7&%un4SWxd7C>UHweA$(RqWUXfh74F8sV?bU|9H{#wh+SGdf!zyV%fBflE4mAd z(j?$Ta)aut2C0-R0WZv)4C<00ncbeInml8d~u(~2z9#Iidn@a7Q5+w(z9q5%s5c0>7p|zhhOdj0Yk&9FIKCLB7Hx)ZwnakxAf(a4hS`YBX+?HU+7+2K`IZmk-h>{_nkzCiKVV1nav1 z#q&S(ez^cjf~2zG5JDar)_X?wMz1O=E(8mKv}LGwK~y4*d%M?7>eK0I9~GsYQpf$@ zckCUGvdnoZiHVEJ{I|D|z3%a9$v+Hb{@;dzAG=Y2n`#Mmg8Hr%H@9hodZ76IFl9|J L(T#mG_V@np0*bs) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..1acea2a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -16,6 +16,6 @@ default: eval_metric: mean_squared_error #mean_absolute_error time_limit: 4000 presets: medium_quality - excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] + excluded_model_types: ['RF', 'NN_TORCH', 'KNN', 'XT', 'CAT', 'FASTAI'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 19b0a5b..9b24faf 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_rooms.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 826e654..2f513d4 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,7 +22,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 8f0f5481075094460ab852ace2fa9b7a.dir + size: 43692138 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 8f0f5481075094460ab852ace2fa9b7a.dir + size: 43692138 nfiles: 2 params: configs/build_model.yaml: @@ -65,28 +65,28 @@ stages: presets: medium_quality excluded_model_types: - RF - - FASTAI - - CAT - NN_TORCH - KNN - XT + - CAT + - FASTAI infer_limit: 0.05 infer_limit_batch_size: 10000 outs: - path: data/fit_predictions/ hash: md5 - md5: e0a11ac6e4adf69d6180c0217c639a0e.dir - size: 3680908 + md5: e2a05a84a14d35516a6cda8e0a1e963c.dir + size: 3681005 nfiles: 1 - path: data/model/ hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 7b0382d001ed2bd7aec5c8112f69d129.dir + size: 793365790 + nfiles: 30 - path: metrics/fit_metrics.json hash: md5 - md5: 0ed5b1141bbb8bc3156e7c056b29f3cd - size: 225 + md5: bcfd8d3bd3af858fa3dc26433bc8cd9e + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 7b0382d001ed2bd7aec5c8112f69d129.dir + size: 793365790 + nfiles: 30 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 8f0f5481075094460ab852ace2fa9b7a.dir + size: 43692138 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: 90b5275b5d9829a42573ade3f5a025d2.dir + size: 648526 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: 90b5275b5d9829a42573ade3f5a025d2.dir + size: 648526 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 8f0f5481075094460ab852ace2fa9b7a.dir + size: 43692138 nfiles: 2 params: configs/settings.yaml: @@ -142,7 +142,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 145e7ac84ab4a4407b23695a632b4d91 + md5: be48389ba2755e6c18e41243aaa9bb81 size: 226 startup_cleanup: cmd: python 0_startup_cleanup.py From 86352ce0ce7e6f26168e9fb04c0a103d49313118 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 16 Feb 2024 14:51:31 +0000 Subject: [PATCH 11/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 5c1c431..258d0b4 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.8.0", + "version": "v0.9.0", "stage": { "dev": "v0.8.0" }, From 6e76716fbce59b4d99ee2c45f23583d3f52b674d Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 16 Feb 2024 14:52:15 +0000 Subject: [PATCH 12/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 258d0b4..d838705 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.9.0", "stage": { - "dev": "v0.8.0" + "dev": "v0.9.0" }, "registered": true, "active": true From a9b50c8a2dbd1379bf19a1991116b4116cd2c657 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:23:37 +0000 Subject: [PATCH 13/53] revert change on sap-dev-model --- modules/ml-pipeline/src/pipeline/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ml-pipeline/src/pipeline/README.md b/modules/ml-pipeline/src/pipeline/README.md index d47f864..d44e220 100644 --- a/modules/ml-pipeline/src/pipeline/README.md +++ b/modules/ml-pipeline/src/pipeline/README.md @@ -37,3 +37,4 @@ Workflow: - This experiment will have the corresponding .dvc files for the hashed model and data - Use version control as normal - git add, git commit etc +- To revert change, use `git checkout {COMMIT_HASH}`, followed by `git switch -c {NEW_BRANCH_NAME}` From 0e2bff9d645221e9543e612396d051af8d424d93 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:30:13 +0000 Subject: [PATCH 14/53] revert changes --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 3 +- modules/ml-pipeline/src/pipeline/dvc.lock | 52 +++++++++---------- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1acea2a..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -16,6 +16,6 @@ default: eval_metric: mean_squared_error #mean_absolute_error time_limit: 4000 presets: medium_quality - excluded_model_types: ['RF', 'NN_TORCH', 'KNN', 'XT', 'CAT', 'FASTAI'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 9b24faf..19b0a5b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,8 +22,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 2f513d4..826e654 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -22,7 +22,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_rooms.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/build_model.yaml: @@ -65,28 +65,28 @@ stages: presets: medium_quality excluded_model_types: - RF + - FASTAI + - CAT - NN_TORCH - KNN - XT - - CAT - - FASTAI infer_limit: 0.05 infer_limit_batch_size: 10000 outs: - path: data/fit_predictions/ hash: md5 - md5: e2a05a84a14d35516a6cda8e0a1e963c.dir - size: 3681005 + md5: e0a11ac6e4adf69d6180c0217c639a0e.dir + size: 3680908 nfiles: 1 - path: data/model/ hash: md5 - md5: 7b0382d001ed2bd7aec5c8112f69d129.dir - size: 793365790 - nfiles: 30 + md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir + size: 805896324 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: bcfd8d3bd3af858fa3dc26433bc8cd9e - size: 224 + md5: 0ed5b1141bbb8bc3156e7c056b29f3cd + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7b0382d001ed2bd7aec5c8112f69d129.dir - size: 793365790 - nfiles: 30 + md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir + size: 805896324 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 90b5275b5d9829a42573ade3f5a025d2.dir - size: 648526 + md5: 38707d16ae1e2330cc03f524db9cdd60.dir + size: 648730 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 90b5275b5d9829a42573ade3f5a025d2.dir - size: 648526 + md5: 38707d16ae1e2330cc03f524db9cdd60.dir + size: 648730 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 3c77fa10cd1cd503eb4d2540394629f6.dir + size: 42626894 nfiles: 2 params: configs/settings.yaml: @@ -142,7 +142,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: be48389ba2755e6c18e41243aaa9bb81 + md5: 145e7ac84ab4a4407b23695a632b4d91 size: 226 startup_cleanup: cmd: python 0_startup_cleanup.py From 99e883584b7a16415d21723779b76f170d5e916d Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 16 Feb 2024 16:35:54 +0000 Subject: [PATCH 15/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index d838705..0742b6e 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.9.0", + "version": "v0.10.0", "stage": { "dev": "v0.9.0" }, From 7f2f80af22ba388eb027e313d28b8066ba2022f4 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 16 Feb 2024 16:36:38 +0000 Subject: [PATCH 16/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 0742b6e..143a5f6 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.10.0", "stage": { - "dev": "v0.9.0" + "dev": "v0.10.0" }, "registered": true, "active": true From 2221283de40931958dae784d3f2ac671ed7bdf20 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:43:23 +0000 Subject: [PATCH 17/53] try the scenario cml --- .github/workflows/MLPipelinePullRequest.yml | 4 + .../src/pipeline/5_generate_scenarios.py | 125 ++++++++++++++++++ modules/ml-pipeline/src/pipeline/config.py | 1 + .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/scenarios.yaml | 9 ++ .../src/pipeline/core/DataClient.py | 10 +- modules/ml-pipeline/src/pipeline/dvc.lock | 68 ++++++---- modules/ml-pipeline/src/pipeline/dvc.yaml | 11 ++ .../src/pipeline/metrics/.gitignore | 1 + 9 files changed, 205 insertions(+), 26 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/5_generate_scenarios.py create mode 100644 modules/ml-pipeline/src/pipeline/configs/scenarios.yaml diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index cbc379d..ceb6800 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -98,6 +98,10 @@ jobs: git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md + echo "## Scenario metrics" > report.md + + cat metrics/scenarios/scenario_table.md >> report.md + cml comment create report.md # echo "## Residuals plot from model" >> report.md diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py new file mode 100644 index 0000000..28bcb9d --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -0,0 +1,125 @@ +""" +Fourth part of the pipeline: +After the model is built and metrics are generated, +we want to test this model against known scenarios +""" + +import os +import pandas as pd +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from configs.post_prediction_logic import post_prediction_logic +from core.DataClient import dataclient_factory +from core.MLModels import model_factory +from core.Logger import logger +from config import settings + +logger.info(f"--- Initiate Parameters ---") + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +client_params = settings.client +prepare_data_params = settings.prepare_data +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions +generate_metrics_params = settings.generate_metrics +feature_process_params = settings.feature_processor +scenarios_params = settings.scenarios + +model_filepath = build_model_params["model_save_filepath"] +target = feature_process_params["feature_processor_config"]["target"] +scenario_data_filepaths = scenarios_params["scenario_data_filepaths"] +predictions_column_name = generate_predictions_params["predictions_column_name"] +output_filepath = scenarios_params["output_filepath"] + +logger.info(f"--- Initiate MLModel ---") + +model = model_factory(build_model_params["model_type"]) + +logger.info(f"--- Initiate DataClient ---") + +# Use data client for input and output, as we use dvc to cache later to the cloud +input_dataclient_type = scenarios_params["input_dataclient_type"] +input_dataclient = dataclient_factory( + dataclient_type=input_dataclient_type, + dataclient_config=client_params[input_dataclient_type], +) + +output_dataclient_type = scenarios_params["output_dataclient_type"] +output_dataclient = dataclient_factory( + dataclient_type=output_dataclient_type, + dataclient_config=client_params[output_dataclient_type], +) + + +def generate_scenario_predictions( + input_dataclient: DataClient, + output_dataclient: DataClient, + model: MLModel, + model_filepath: str, + scenario_data_filepaths: list, + predictions_column_name: str, + output_filepath: str, +): + """ + Given the new model, we generate prediction for expected scenarios + """ + + logger.info("--- Loading Scenario Data ---") + + scenario_data = pd.DataFrame() + + # Can have multiple scenario data files + for scenario_data_filepath in scenario_data_filepaths: + scenario_data = pd.concat( + [ + scenario_data, + input_dataclient.load_data(scenario_data_filepath, load_config=None), + ] + ) + + logger.info("--- Loading Model ---") + + model.load_model(model_filepath) + + logger.info("--- Generating Predictions ---") + + predictions = model.predict( + data=scenario_data, post_prediction_logic=post_prediction_logic + ) + + logger.info("--- Generate Scenario Predicted Impact ---") + + predictions_df = pd.DataFrame(predictions) + predictions_df.columns = [predictions_column_name] + + scenario_data = pd.concat([scenario_data, predictions_df], axis=1) + scenario_data["predicted_impact"] = abs( + scenario_data[predictions_column_name] - scenario_data["sap_starting"] + ) + + logger.info("--- Save prediction into metrics ---") + + output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]] + + output_dataclient.save_data( + obj=output_df, location=output_filepath, save_config=None + ) + + +if __name__ == "__main__": + logger.info(f"--- {__file__} - Start! ---") + + logger.info(f"--- Generate Scenario Predictions ---") + + generate_scenario_predictions( + input_dataclient=input_dataclient, + output_dataclient=output_dataclient, + model=model, + model_filepath=model_filepath, + scenario_data_filepaths=scenario_data_filepaths, + predictions_column_name=predictions_column_name, + output_filepath=output_filepath, + ) + + logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/config.py b/modules/ml-pipeline/src/pipeline/config.py index 7a7366b..bac430c 100644 --- a/modules/ml-pipeline/src/pipeline/config.py +++ b/modules/ml-pipeline/src/pipeline/config.py @@ -7,6 +7,7 @@ settings = Dynaconf( "./configs/settings.yaml", "./configs/build_model.yaml", "./configs/analysis.yaml", + "./configs/scenarios.yaml", ], ) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1acea2a..add3da1 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,7 +14,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 60 presets: medium_quality excluded_model_types: ['RF', 'NN_TORCH', 'KNN', 'XT', 'CAT', 'FASTAI'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml new file mode 100644 index 0000000..29b6672 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -0,0 +1,9 @@ +default: + scenarios: + input_dataclient_type: aws-s3 + output_dataclient_type: local + scenario_data_filepaths: + [ + s3://retrofit-data-dev/scenario_data/recommendations_scoring_data.parquet, + ] + output_filepath: ./metrics/scenario_table.md diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 53f4072..b38ca32 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -245,7 +245,8 @@ class LocalClient: save_methods = { ".parquet": self._save_parquet, - ".json": self._save_json + ".json": self._save_json, + ".md": self._save_md, # "": _save_directory(**save_config), # ADD MORE save_methods HERE } @@ -294,3 +295,10 @@ class LocalClient: # Write the contents of the buffer to the local file with open(location, "wb") as f: f.write(buffer.getvalue()) + + def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict): + """ + Save object as markdown + """ + + obj.to_markdown(location, **save_config) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 2f513d4..5959200 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 86d085385f7e170d951e95d5e9d0f0bc.dir + size: 43684784 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 86d085385f7e170d951e95d5e9d0f0bc.dir + size: 43684784 nfiles: 2 params: configs/build_model.yaml: @@ -61,7 +61,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 60 presets: medium_quality excluded_model_types: - RF @@ -75,17 +75,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: e2a05a84a14d35516a6cda8e0a1e963c.dir - size: 3681005 + md5: 69cbcceee3e360e0040a7c45ed72ef7f.dir + size: 3674358 nfiles: 1 - path: data/model/ hash: md5 - md5: 7b0382d001ed2bd7aec5c8112f69d129.dir - size: 793365790 - nfiles: 30 + md5: 09757210fdbaa9ad216a84285cf1cbf2.dir + size: 353975267 + nfiles: 21 - path: metrics/fit_metrics.json hash: md5 - md5: bcfd8d3bd3af858fa3dc26433bc8cd9e + md5: 69be95e8d60eb7cef41ec1e69fa9d2ce size: 224 generate_predictions: cmd: python 3_generate_predictions.py @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7b0382d001ed2bd7aec5c8112f69d129.dir - size: 793365790 - nfiles: 30 + md5: 09757210fdbaa9ad216a84285cf1cbf2.dir + size: 353975267 + nfiles: 21 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 86d085385f7e170d951e95d5e9d0f0bc.dir + size: 43684784 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 90b5275b5d9829a42573ade3f5a025d2.dir - size: 648526 + md5: 2a0421436d59d95e52a51571c34e0ce9.dir + size: 647012 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 90b5275b5d9829a42573ade3f5a025d2.dir - size: 648526 + md5: 2a0421436d59d95e52a51571c34e0ce9.dir + size: 647012 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 8f0f5481075094460ab852ace2fa9b7a.dir - size: 43692138 + md5: 86d085385f7e170d951e95d5e9d0f0bc.dir + size: 43684784 nfiles: 2 params: configs/settings.yaml: @@ -142,8 +142,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: be48389ba2755e6c18e41243aaa9bb81 - size: 226 + md5: 83698142cedb9fb4df5ab82f408690a2 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: @@ -155,3 +155,23 @@ stages: configs/settings.yaml: default.startup_cleanup.artefacts: ./data default.startup_cleanup.metrics: ./metrics + generate_scenerio_metrics: + cmd: python 5_generate_scenarios.py + deps: + - path: 5_generate_scenarios.py + hash: md5 + md5: 30f80ffeb6ee50c5f7b82943a4dc7702 + size: 4014 + params: + configs/scenarios.yaml: + default.scenarios: + input_dataclient_type: aws-s3 + output_dataclient_type: local + scenario_data_filepaths: + - s3://retrofit-data-dev/scenario_data/recommendations_scoring_data.parquet + output_filepath: ./metrics/scenario_table.md + outs: + - path: metrics/scenario_table.md + hash: md5 + md5: 36b1b26224ebbbfd5b2bbb15ae173247 + size: 1648 diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index 58889cc..b513184 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -71,6 +71,17 @@ stages: outs: - metrics/metrics.json always_changed: true + generate_scenerio_metrics: + cmd: python 5_generate_scenarios.py + deps: + - 5_generate_scenarios.py + params: + - configs/scenarios.yaml: + - default.scenarios + outs: + - metrics/scenario_table.md + always_changed: true metrics: - metrics/metrics.json - metrics/fit_metrics.json + - metrics/scenario_table.md diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index e6fbc8d..189c2ee 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,2 +1,3 @@ /fit_metrics.json /metrics.json +/scenario_table.md From 49e66411ce272fe3ed297f173a742d26d5551384 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:51:43 +0000 Subject: [PATCH 18/53] test this version --- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++++------------ modules/ml-pipeline/src/pipeline/dvc.yaml | 1 - 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 20e33ef..c872404 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -31,8 +31,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 86d085385f7e170d951e95d5e9d0f0bc.dir - size: 43684784 + md5: 174752a2b228f7af687fe91de77ca0b8.dir + size: 42622503 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 86d085385f7e170d951e95d5e9d0f0bc.dir - size: 43684784 + md5: 174752a2b228f7af687fe91de77ca0b8.dir + size: 42622503 nfiles: 2 params: configs/build_model.yaml: @@ -75,17 +75,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 69cbcceee3e360e0040a7c45ed72ef7f.dir - size: 3674358 + md5: a7e32ced2c7ca88a1e80ed0c2135388d.dir + size: 3675177 nfiles: 1 - path: data/model/ hash: md5 - md5: 09757210fdbaa9ad216a84285cf1cbf2.dir - size: 353975267 + md5: 6d81c99ee00e03bba69db468161dfe19.dir + size: 335451645 nfiles: 21 - path: metrics/fit_metrics.json hash: md5 - md5: 69be95e8d60eb7cef41ec1e69fa9d2ce + md5: 296fd7785e867da96eec96683384c444 size: 224 generate_predictions: cmd: python 3_generate_predictions.py @@ -96,13 +96,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 09757210fdbaa9ad216a84285cf1cbf2.dir - size: 353975267 + md5: 6d81c99ee00e03bba69db468161dfe19.dir + size: 335451645 nfiles: 21 - path: data/prepared_data hash: md5 - md5: 86d085385f7e170d951e95d5e9d0f0bc.dir - size: 43684784 + md5: 174752a2b228f7af687fe91de77ca0b8.dir + size: 42622503 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +114,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 2a0421436d59d95e52a51571c34e0ce9.dir - size: 647012 + md5: 3fd770fe0f8064cfc30c2b68575f9e7f.dir + size: 647505 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +126,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 2a0421436d59d95e52a51571c34e0ce9.dir - size: 647012 + md5: 3fd770fe0f8064cfc30c2b68575f9e7f.dir + size: 647505 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 86d085385f7e170d951e95d5e9d0f0bc.dir - size: 43684784 + md5: 174752a2b228f7af687fe91de77ca0b8.dir + size: 42622503 nfiles: 2 params: configs/settings.yaml: @@ -142,8 +142,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 83698142cedb9fb4df5ab82f408690a2 - size: 222 + md5: fa4972e309c6e278d986f305543b3084 + size: 223 startup_cleanup: cmd: python 0_startup_cleanup.py deps: @@ -173,5 +173,5 @@ stages: outs: - path: metrics/scenario_table.md hash: md5 - md5: 36b1b26224ebbbfd5b2bbb15ae173247 + md5: 634d39623623a82ce8554a38d3fb82b0 size: 1648 diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index b513184..5ce35ce 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -84,4 +84,3 @@ stages: metrics: - metrics/metrics.json - metrics/fit_metrics.json - - metrics/scenario_table.md From fe430c432651da8d46c3f385534908558f21c182 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:54:18 +0000 Subject: [PATCH 19/53] test this version --- .github/workflows/MLPipelinePullRequest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index ceb6800..962132c 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -100,7 +100,7 @@ jobs: echo "## Scenario metrics" > report.md - cat metrics/scenarios/scenario_table.md >> report.md + cat metrics/scenario_table.md >> report.md cml comment create report.md From 81e7c2a4bd954532a70638a284dc189f149e5dfb Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 16 Feb 2024 16:57:37 +0000 Subject: [PATCH 20/53] test this version --- .github/workflows/MLPipelinePullRequest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 962132c..493aef9 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -98,7 +98,7 @@ jobs: git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md - echo "## Scenario metrics" > report.md + echo "## Scenario metrics" >> report.md cat metrics/scenario_table.md >> report.md From cec3cc60e7fd8cc0739182421e2334b3a6d340fa Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 17 Feb 2024 16:26:49 +0000 Subject: [PATCH 21/53] test less features --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 33 +++++- modules/ml-pipeline/src/pipeline/dvc.lock | 111 ++++++++++++++---- 3 files changed, 116 insertions(+), 30 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 66981bf..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,7 +14,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 60 + time_limit: 4000 presets: medium_quality excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 19b0a5b..dc28a9a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 1 + train_proportion: 0.95 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -36,8 +36,35 @@ default: target: sap_ending identifier_columns: ["uprn"] drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] - # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] - retain_features: null + retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', + 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', + 'walls_energy_eff_ending', 'secondheat_description_ending', + 'property_type', 'mainheatc_energy_eff_ending', 'built_form', + 'walls_insulation_thickness_ending', 'potential_energy_efficiency', + 'transaction_type_ending', 'mainheat_energy_eff_starting', + 'floor_thermal_transmittance_ending', 'hot_water_energy_eff_starting', + 'low_energy_lighting_ending', 'heat_demand_starting', + 'photo_supply_ending', 'carbon_starting', + 'walls_thermal_transmittance_ending', 'fuel_type_ending', + 'roof_insulation_thickness_ending', 'transaction_type_starting', + 'total_floor_area_ending', 'number_open_fireplaces_ending', + 'roof_insulation_thickness', 'windows_energy_eff_ending', + 'walls_insulation_thickness', 'floor_height_ending', + 'secondheat_description_starting', 'floor_thermal_transmittance', + 'mainheatc_energy_eff_starting', 'extension_count_ending', + 'has_air_source_heat_pump_ending', 'walls_energy_eff_starting', + 'charging_system_ending', 'construction_age_band', 'glazed_type_ending', + 'roof_thermal_transmittance_ending', + 'floor_insulation_thickness_ending', 'has_mains_gas_ending', + 'estimated_perimeter_starting', 'energy_consumption_potential', + 'environment_impact_potential', 'roof_energy_eff_starting', + 'another_property_below', 'heater_type_ending', + 'walls_thermal_transmittance', 'total_floor_area_starting', + 'multi_glaze_proportion_ending', 'is_suspended', + 'floor_height_starting', 'lighting_energy_eff_ending', + 'energy_tariff_ending', 'fixed_lighting_outlets_count', + 'low_energy_lighting_starting', 'mechanical_ventilation_ending'] + # retain_features: null generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c872404..05844e3 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -18,6 +18,65 @@ stages: - days_to_starting - days_to_ending default.feature_processor.feature_processor_config.retain_features: + - uprn + - sap_starting + - hot_water_energy_eff_ending + - mainheat_energy_eff_ending + - constituency + - roof_energy_eff_ending + - walls_energy_eff_ending + - secondheat_description_ending + - property_type + - mainheatc_energy_eff_ending + - built_form + - walls_insulation_thickness_ending + - potential_energy_efficiency + - transaction_type_ending + - mainheat_energy_eff_starting + - floor_thermal_transmittance_ending + - hot_water_energy_eff_starting + - low_energy_lighting_ending + - heat_demand_starting + - photo_supply_ending + - carbon_starting + - walls_thermal_transmittance_ending + - fuel_type_ending + - roof_insulation_thickness_ending + - transaction_type_starting + - total_floor_area_ending + - number_open_fireplaces_ending + - roof_insulation_thickness + - windows_energy_eff_ending + - walls_insulation_thickness + - floor_height_ending + - secondheat_description_starting + - floor_thermal_transmittance + - mainheatc_energy_eff_starting + - extension_count_ending + - has_air_source_heat_pump_ending + - walls_energy_eff_starting + - charging_system_ending + - construction_age_band + - glazed_type_ending + - roof_thermal_transmittance_ending + - floor_insulation_thickness_ending + - has_mains_gas_ending + - estimated_perimeter_starting + - energy_consumption_potential + - environment_impact_potential + - roof_energy_eff_starting + - another_property_below + - heater_type_ending + - walls_thermal_transmittance + - total_floor_area_starting + - multi_glaze_proportion_ending + - is_suspended + - floor_height_starting + - lighting_energy_eff_ending + - energy_tariff_ending + - fixed_lighting_outlets_count + - low_energy_lighting_starting + - mechanical_ventilation_ending default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending @@ -27,12 +86,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 1 + default.prepare_data.train_proportion: 0.95 outs: - path: data/prepared_data/ hash: md5 - md5: 174752a2b228f7af687fe91de77ca0b8.dir - size: 42622503 + md5: 59f8ea78ec225f5a05de451c6145e2d5.dir + size: 34059502 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +102,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 174752a2b228f7af687fe91de77ca0b8.dir - size: 42622503 + md5: 59f8ea78ec225f5a05de451c6145e2d5.dir + size: 34059502 nfiles: 2 params: configs/build_model.yaml: @@ -61,7 +120,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 60 + time_limit: 4000 presets: medium_quality excluded_model_types: - RF @@ -75,18 +134,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: a7e32ced2c7ca88a1e80ed0c2135388d.dir - size: 3675177 + md5: bb74626ff3d33581efe750955cdff860.dir + size: 3539589 nfiles: 1 - path: data/model/ hash: md5 - md5: 6d81c99ee00e03bba69db468161dfe19.dir - size: 335451645 - nfiles: 21 + md5: e100d4dcccc1c7d30367b0ca0672e3af.dir + size: 654714285 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: 296fd7785e867da96eec96683384c444 - size: 224 + md5: d074f5aa588d3405be65a9684f192465 + size: 226 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -96,13 +155,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 6d81c99ee00e03bba69db468161dfe19.dir - size: 335451645 - nfiles: 21 + md5: e100d4dcccc1c7d30367b0ca0672e3af.dir + size: 654714285 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: 174752a2b228f7af687fe91de77ca0b8.dir - size: 42622503 + md5: 59f8ea78ec225f5a05de451c6145e2d5.dir + size: 34059502 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +173,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 3fd770fe0f8064cfc30c2b68575f9e7f.dir - size: 647505 + md5: 36e26c509176caae6290f75ad486810d.dir + size: 232044 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +185,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 3fd770fe0f8064cfc30c2b68575f9e7f.dir - size: 647505 + md5: 36e26c509176caae6290f75ad486810d.dir + size: 232044 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 174752a2b228f7af687fe91de77ca0b8.dir - size: 42622503 + md5: 59f8ea78ec225f5a05de451c6145e2d5.dir + size: 34059502 nfiles: 2 params: configs/settings.yaml: @@ -142,7 +201,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: fa4972e309c6e278d986f305543b3084 + md5: 7b71931c5857358ca2603889de6abb3a size: 223 startup_cleanup: cmd: python 0_startup_cleanup.py @@ -173,5 +232,5 @@ stages: outs: - path: metrics/scenario_table.md hash: md5 - md5: 634d39623623a82ce8554a38d3fb82b0 + md5: 72db7530c9ca42470ee8bd1a1e7b52b4 size: 1648 From d5f40a8eb294924e0525904d6ee1864999d77c23 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 17 Feb 2024 21:17:34 +0000 Subject: [PATCH 22/53] only ending --- .../src/pipeline/configs/settings.yaml | 29 ++++---- modules/ml-pipeline/src/pipeline/dvc.lock | 67 +++++++------------ 2 files changed, 36 insertions(+), 60 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index dc28a9a..a85d3ab 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 0.95 + train_proportion: 0.98 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -41,29 +41,24 @@ default: 'walls_energy_eff_ending', 'secondheat_description_ending', 'property_type', 'mainheatc_energy_eff_ending', 'built_form', 'walls_insulation_thickness_ending', 'potential_energy_efficiency', - 'transaction_type_ending', 'mainheat_energy_eff_starting', - 'floor_thermal_transmittance_ending', 'hot_water_energy_eff_starting', + 'transaction_type_ending', + 'floor_thermal_transmittance_ending', 'low_energy_lighting_ending', 'heat_demand_starting', 'photo_supply_ending', 'carbon_starting', - 'walls_thermal_transmittance_ending', 'fuel_type_ending', - 'roof_insulation_thickness_ending', 'transaction_type_starting', + 'walls_thermal_transmittance_ending', + 'roof_insulation_thickness_ending', 'total_floor_area_ending', 'number_open_fireplaces_ending', - 'roof_insulation_thickness', 'windows_energy_eff_ending', - 'walls_insulation_thickness', 'floor_height_ending', - 'secondheat_description_starting', 'floor_thermal_transmittance', - 'mainheatc_energy_eff_starting', 'extension_count_ending', - 'has_air_source_heat_pump_ending', 'walls_energy_eff_starting', + 'windows_energy_eff_ending', + 'floor_height_ending', + 'extension_count_ending', + 'has_air_source_heat_pump_ending', 'charging_system_ending', 'construction_age_band', 'glazed_type_ending', 'roof_thermal_transmittance_ending', 'floor_insulation_thickness_ending', 'has_mains_gas_ending', 'estimated_perimeter_starting', 'energy_consumption_potential', - 'environment_impact_potential', 'roof_energy_eff_starting', - 'another_property_below', 'heater_type_ending', - 'walls_thermal_transmittance', 'total_floor_area_starting', - 'multi_glaze_proportion_ending', 'is_suspended', - 'floor_height_starting', 'lighting_energy_eff_ending', - 'energy_tariff_ending', 'fixed_lighting_outlets_count', - 'low_energy_lighting_starting', 'mechanical_ventilation_ending'] + 'environment_impact_potential', 'heater_type_ending', + 'multi_glaze_proportion_ending', + 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count'] # retain_features: null generate_predictions: diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 05844e3..71a9c44 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -32,29 +32,19 @@ stages: - walls_insulation_thickness_ending - potential_energy_efficiency - transaction_type_ending - - mainheat_energy_eff_starting - floor_thermal_transmittance_ending - - hot_water_energy_eff_starting - low_energy_lighting_ending - heat_demand_starting - photo_supply_ending - carbon_starting - walls_thermal_transmittance_ending - - fuel_type_ending - roof_insulation_thickness_ending - - transaction_type_starting - total_floor_area_ending - number_open_fireplaces_ending - - roof_insulation_thickness - windows_energy_eff_ending - - walls_insulation_thickness - floor_height_ending - - secondheat_description_starting - - floor_thermal_transmittance - - mainheatc_energy_eff_starting - extension_count_ending - has_air_source_heat_pump_ending - - walls_energy_eff_starting - charging_system_ending - construction_age_band - glazed_type_ending @@ -64,19 +54,10 @@ stages: - estimated_perimeter_starting - energy_consumption_potential - environment_impact_potential - - roof_energy_eff_starting - - another_property_below - heater_type_ending - - walls_thermal_transmittance - - total_floor_area_starting - multi_glaze_proportion_ending - - is_suspended - - floor_height_starting - lighting_energy_eff_ending - - energy_tariff_ending - fixed_lighting_outlets_count - - low_energy_lighting_starting - - mechanical_ventilation_ending default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending @@ -86,12 +67,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.95 + default.prepare_data.train_proportion: 0.98 outs: - path: data/prepared_data/ hash: md5 - md5: 59f8ea78ec225f5a05de451c6145e2d5.dir - size: 34059502 + md5: 544427230544c2cc526334e246db4845.dir + size: 26132493 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -102,8 +83,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 59f8ea78ec225f5a05de451c6145e2d5.dir - size: 34059502 + md5: 544427230544c2cc526334e246db4845.dir + size: 26132493 nfiles: 2 params: configs/build_model.yaml: @@ -134,18 +115,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: bb74626ff3d33581efe750955cdff860.dir - size: 3539589 + md5: 8f9e2059782dd55d3ecdad54b4551f6a.dir + size: 3630849 nfiles: 1 - path: data/model/ hash: md5 - md5: e100d4dcccc1c7d30367b0ca0672e3af.dir - size: 654714285 + md5: e031eb3c3fdb63917aabfea745b56ac6.dir + size: 618445494 nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: d074f5aa588d3405be65a9684f192465 - size: 226 + md5: e68009f5b66230b3ee4cd2ffc9a2d697 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -155,13 +136,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: e100d4dcccc1c7d30367b0ca0672e3af.dir - size: 654714285 + md5: e031eb3c3fdb63917aabfea745b56ac6.dir + size: 618445494 nfiles: 31 - path: data/prepared_data hash: md5 - md5: 59f8ea78ec225f5a05de451c6145e2d5.dir - size: 34059502 + md5: 544427230544c2cc526334e246db4845.dir + size: 26132493 nfiles: 2 params: configs/settings.yaml: @@ -173,8 +154,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 36e26c509176caae6290f75ad486810d.dir - size: 232044 + md5: 1c14c9ac9711f5d33a60890e3ca72454.dir + size: 90361 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -185,13 +166,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 36e26c509176caae6290f75ad486810d.dir - size: 232044 + md5: 1c14c9ac9711f5d33a60890e3ca72454.dir + size: 90361 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 59f8ea78ec225f5a05de451c6145e2d5.dir - size: 34059502 + md5: 544427230544c2cc526334e246db4845.dir + size: 26132493 nfiles: 2 params: configs/settings.yaml: @@ -201,8 +182,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 7b71931c5857358ca2603889de6abb3a - size: 223 + md5: 98e59ea9569522a8665c4e6c1bea7473 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: @@ -232,5 +213,5 @@ stages: outs: - path: metrics/scenario_table.md hash: md5 - md5: 72db7530c9ca42470ee8bd1a1e7b52b4 + md5: 3ee1966a06c1e5b9c37797597be94797 size: 1648 From ad2c4d60197a708a3565356c121deb9067476f7e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 21 Mar 2024 14:41:58 +0000 Subject: [PATCH 23/53] upgrade autogluon --- .../src/pipeline/configs/build_model.yaml | 5 +- .../src/pipeline/configs/settings.yaml | 2 +- .../ml-pipeline/src/pipeline/core/MLModels.py | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 56 ++++++++++--------- .../predictions/requirements-dev.txt | 4 +- .../requirements/predictions/requirements.txt | 4 +- .../training/requirements-dev.txt | 7 ++- .../requirements/training/requirements.txt | 4 +- 8 files changed, 46 insertions(+), 40 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..6fbf094 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,8 +14,9 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 - presets: medium_quality + time_limit: 1800 + presets: good_quality excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 19b0a5b..4757d91 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 1 + train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4fc572a..257261d 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel: models = { "SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(), - "AutogluonAutoML": AutogluonAutoML() + "AutogluonAutoML": AutogluonAutoML(), # ADD OTHER MODELS HERE } @@ -151,6 +151,7 @@ class AutogluonAutoML: "excluded_model_types", "infer_limit", "infer_limit_batch_size", + "ag_args_ensemble", ] def load_model(self, path: Union[Path, str]) -> None: @@ -207,6 +208,7 @@ class AutogluonAutoML: excluded_model_types=model_hyperparameters["excluded_model_types"], infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], + ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 826e654..530a3c8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -27,12 +27,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 1 + default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/build_model.yaml: @@ -61,8 +61,8 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 - presets: medium_quality + time_limit: 1800 + presets: good_quality excluded_model_types: - RF - FASTAI @@ -72,21 +72,23 @@ stages: - XT infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: + num_folds_parallel: 2 outs: - path: data/fit_predictions/ hash: md5 - md5: e0a11ac6e4adf69d6180c0217c639a0e.dir - size: 3680908 + md5: 346b6611afbf2070e038bf945249a86e.dir + size: 3384302 nfiles: 1 - path: data/model/ hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 8e37f21728cd092660bafa8c32dc109f.dir + size: 423840922 + nfiles: 118 - path: metrics/fit_metrics.json hash: md5 - md5: 0ed5b1141bbb8bc3156e7c056b29f3cd - size: 225 + md5: d63e1a8d31503055835ac35149554e41 + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -96,13 +98,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 8e37f21728cd092660bafa8c32dc109f.dir + size: 423840922 + nfiles: 118 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +116,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: d148baf508140353d62c16d6ab0fb6b7.dir + size: 469224 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +128,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: d148baf508140353d62c16d6ab0fb6b7.dir + size: 469224 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/settings.yaml: @@ -142,8 +144,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 145e7ac84ab4a4407b23695a632b4d91 - size: 226 + md5: 196232f94b563ac525cf65ee5cc6d639 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 0d259fb..258981d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index afad9be..2ab48e9 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index d8c5907..2024d84 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,9 +1,10 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 +ray==2.6.3 dynaconf==3.2.0 -alibi==0.9.4 +alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index bbdc2fa..84452a3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 From 8a9b5877b53c6dbff2d4f45e28fcb40a80b443d6 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 26 Mar 2024 22:30:50 +0000 Subject: [PATCH 24/53] medium model with scenario and upgraded autogluon --- .github/workflows/MLPipelinePullRequest.yml | 4 + .../src/pipeline/5_generate_scenarios.py | 125 ++++++++++++++++++ modules/ml-pipeline/src/pipeline/config.py | 1 + .../src/pipeline/configs/build_model.yaml | 4 +- .../src/pipeline/configs/scenarios.yaml | 8 ++ .../src/pipeline/configs/settings.yaml | 37 ++++-- .../src/pipeline/core/DataClient.py | 10 +- modules/ml-pipeline/src/pipeline/dvc.lock | 96 +++++++++----- modules/ml-pipeline/src/pipeline/dvc.yaml | 10 ++ .../src/pipeline/metrics/.gitignore | 1 + 10 files changed, 250 insertions(+), 46 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/5_generate_scenarios.py create mode 100644 modules/ml-pipeline/src/pipeline/configs/scenarios.yaml diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index cbc379d..493aef9 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -98,6 +98,10 @@ jobs: git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md + echo "## Scenario metrics" >> report.md + + cat metrics/scenario_table.md >> report.md + cml comment create report.md # echo "## Residuals plot from model" >> report.md diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py new file mode 100644 index 0000000..28bcb9d --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -0,0 +1,125 @@ +""" +Fourth part of the pipeline: +After the model is built and metrics are generated, +we want to test this model against known scenarios +""" + +import os +import pandas as pd +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from configs.post_prediction_logic import post_prediction_logic +from core.DataClient import dataclient_factory +from core.MLModels import model_factory +from core.Logger import logger +from config import settings + +logger.info(f"--- Initiate Parameters ---") + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +client_params = settings.client +prepare_data_params = settings.prepare_data +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions +generate_metrics_params = settings.generate_metrics +feature_process_params = settings.feature_processor +scenarios_params = settings.scenarios + +model_filepath = build_model_params["model_save_filepath"] +target = feature_process_params["feature_processor_config"]["target"] +scenario_data_filepaths = scenarios_params["scenario_data_filepaths"] +predictions_column_name = generate_predictions_params["predictions_column_name"] +output_filepath = scenarios_params["output_filepath"] + +logger.info(f"--- Initiate MLModel ---") + +model = model_factory(build_model_params["model_type"]) + +logger.info(f"--- Initiate DataClient ---") + +# Use data client for input and output, as we use dvc to cache later to the cloud +input_dataclient_type = scenarios_params["input_dataclient_type"] +input_dataclient = dataclient_factory( + dataclient_type=input_dataclient_type, + dataclient_config=client_params[input_dataclient_type], +) + +output_dataclient_type = scenarios_params["output_dataclient_type"] +output_dataclient = dataclient_factory( + dataclient_type=output_dataclient_type, + dataclient_config=client_params[output_dataclient_type], +) + + +def generate_scenario_predictions( + input_dataclient: DataClient, + output_dataclient: DataClient, + model: MLModel, + model_filepath: str, + scenario_data_filepaths: list, + predictions_column_name: str, + output_filepath: str, +): + """ + Given the new model, we generate prediction for expected scenarios + """ + + logger.info("--- Loading Scenario Data ---") + + scenario_data = pd.DataFrame() + + # Can have multiple scenario data files + for scenario_data_filepath in scenario_data_filepaths: + scenario_data = pd.concat( + [ + scenario_data, + input_dataclient.load_data(scenario_data_filepath, load_config=None), + ] + ) + + logger.info("--- Loading Model ---") + + model.load_model(model_filepath) + + logger.info("--- Generating Predictions ---") + + predictions = model.predict( + data=scenario_data, post_prediction_logic=post_prediction_logic + ) + + logger.info("--- Generate Scenario Predicted Impact ---") + + predictions_df = pd.DataFrame(predictions) + predictions_df.columns = [predictions_column_name] + + scenario_data = pd.concat([scenario_data, predictions_df], axis=1) + scenario_data["predicted_impact"] = abs( + scenario_data[predictions_column_name] - scenario_data["sap_starting"] + ) + + logger.info("--- Save prediction into metrics ---") + + output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]] + + output_dataclient.save_data( + obj=output_df, location=output_filepath, save_config=None + ) + + +if __name__ == "__main__": + logger.info(f"--- {__file__} - Start! ---") + + logger.info(f"--- Generate Scenario Predictions ---") + + generate_scenario_predictions( + input_dataclient=input_dataclient, + output_dataclient=output_dataclient, + model=model, + model_filepath=model_filepath, + scenario_data_filepaths=scenario_data_filepaths, + predictions_column_name=predictions_column_name, + output_filepath=output_filepath, + ) + + logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/config.py b/modules/ml-pipeline/src/pipeline/config.py index 7a7366b..bac430c 100644 --- a/modules/ml-pipeline/src/pipeline/config.py +++ b/modules/ml-pipeline/src/pipeline/config.py @@ -7,6 +7,7 @@ settings = Dynaconf( "./configs/settings.yaml", "./configs/build_model.yaml", "./configs/analysis.yaml", + "./configs/scenarios.yaml", ], ) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 6fbf094..a36bfbc 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -15,8 +15,8 @@ default: problem_type: regression eval_metric: mean_squared_error #mean_absolute_error time_limit: 1800 - presets: good_quality - excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] + presets: medium_quality + excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml new file mode 100644 index 0000000..e76336a --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -0,0 +1,8 @@ +default: + scenarios: + input_dataclient_type: aws-s3 + output_dataclient_type: local + scenario_data_filepaths: + # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet + output_filepath: ./metrics/scenario_table.md diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4757d91..f42b2be 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,12 +18,7 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -35,9 +30,35 @@ default: subsample_seed: 0 target: sap_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] - # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] + # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] + drop_columns: [ + "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending", + 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', + 'number_habitable_rooms', 'number_heated_rooms'] retain_features: null + # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', + # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', + # 'walls_energy_eff_ending', 'secondheat_description_ending', + # 'property_type', 'mainheatc_energy_eff_ending', 'built_form', + # 'walls_insulation_thickness_ending', 'potential_energy_efficiency', + # 'transaction_type_ending', + # 'floor_thermal_transmittance_ending', + # 'low_energy_lighting_ending', 'heat_demand_starting', + # 'photo_supply_ending', 'carbon_starting', + # 'walls_thermal_transmittance_ending', + # 'roof_insulation_thickness_ending', + # 'total_floor_area_ending', 'number_open_fireplaces_ending', + # 'windows_energy_eff_ending', + # 'floor_height_ending', + # 'extension_count_ending', + # 'has_air_source_heat_pump_ending', + # 'charging_system_ending', 'construction_age_band', 'glazed_type_ending', + # 'roof_thermal_transmittance_ending', + # 'floor_insulation_thickness_ending', 'has_mains_gas_ending', + # 'estimated_perimeter_starting', 'energy_consumption_potential', + # 'environment_impact_potential', 'heater_type_ending', + # 'multi_glaze_proportion_ending', + # 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count'] generate_predictions: input_dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 53f4072..b38ca32 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -245,7 +245,8 @@ class LocalClient: save_methods = { ".parquet": self._save_parquet, - ".json": self._save_json + ".json": self._save_json, + ".md": self._save_md, # "": _save_directory(**save_config), # ADD MORE save_methods HERE } @@ -294,3 +295,10 @@ class LocalClient: # Write the contents of the buffer to the local file with open(location, "wb") as f: f.write(buffer.getvalue()) + + def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict): + """ + Save object as markdown + """ + + obj.to_markdown(location, **save_config) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 530a3c8..fcc035b 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -1,5 +1,16 @@ schema: '2.0' stages: + startup_cleanup: + cmd: python 0_startup_cleanup.py + deps: + - path: 0_startup_cleanup.py + hash: md5 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 + params: + configs/settings.yaml: + default.startup_cleanup.artefacts: ./data + default.startup_cleanup.metrics: ./metrics prepare_data: cmd: python 1_prepare_data.py deps: @@ -17,12 +28,19 @@ stages: - carbon_ending - days_to_starting - days_to_ending + - number_habitable_rooms_starting + - number_habitable_rooms_ending + - number_heated_rooms_starting + - number_heated_rooms_ending + - number_habitable_rooms + - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: + s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -31,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 3d1144848fce4ce50f6abfaec5235552.dir - size: 46392840 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 3d1144848fce4ce50f6abfaec5235552.dir - size: 46392840 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/build_model.yaml: @@ -62,10 +80,9 @@ stages: problem_type: regression eval_metric: mean_squared_error time_limit: 1800 - presets: good_quality + presets: medium_quality excluded_model_types: - RF - - FASTAI - CAT - NN_TORCH - KNN @@ -77,18 +94,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 346b6611afbf2070e038bf945249a86e.dir - size: 3384302 + md5: de46250d454c4d713ab580b10ff3fd31.dir + size: 3349318 nfiles: 1 - path: data/model/ hash: md5 - md5: 8e37f21728cd092660bafa8c32dc109f.dir - size: 423840922 - nfiles: 118 + md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir + size: 735951861 + nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: d63e1a8d31503055835ac35149554e41 - size: 223 + md5: 8a952a5e884c268e6059357a627b9251 + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -98,13 +115,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 8e37f21728cd092660bafa8c32dc109f.dir - size: 423840922 - nfiles: 118 + md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir + size: 735951861 + nfiles: 35 - path: data/prepared_data hash: md5 - md5: 3d1144848fce4ce50f6abfaec5235552.dir - size: 46392840 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/settings.yaml: @@ -116,8 +133,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: d148baf508140353d62c16d6ab0fb6b7.dir - size: 469224 + md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir + size: 463563 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -128,13 +145,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: d148baf508140353d62c16d6ab0fb6b7.dir - size: 469224 + md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir + size: 463563 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3d1144848fce4ce50f6abfaec5235552.dir - size: 46392840 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/settings.yaml: @@ -144,16 +161,25 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 196232f94b563ac525cf65ee5cc6d639 - size: 222 - startup_cleanup: - cmd: python 0_startup_cleanup.py + md5: 9f863f47799d42c101eba3b03a179455 + size: 224 + generate_scenerio_metrics: + cmd: python 5_generate_scenarios.py deps: - - path: 0_startup_cleanup.py + - path: 5_generate_scenarios.py hash: md5 - md5: b1b12f6b6393fbf8b83d23684df0a3d4 - size: 1220 + md5: 30f80ffeb6ee50c5f7b82943a4dc7702 + size: 4014 params: - configs/settings.yaml: - default.startup_cleanup.artefacts: ./data - default.startup_cleanup.metrics: ./metrics + configs/scenarios.yaml: + default.scenarios: + input_dataclient_type: aws-s3 + output_dataclient_type: local + scenario_data_filepaths: + - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet + output_filepath: ./metrics/scenario_table.md + outs: + - path: metrics/scenario_table.md + hash: md5 + md5: 54856c66fca8b2ebd1fa4dea2d25734a + size: 2133 diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index 58889cc..5ce35ce 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -71,6 +71,16 @@ stages: outs: - metrics/metrics.json always_changed: true + generate_scenerio_metrics: + cmd: python 5_generate_scenarios.py + deps: + - 5_generate_scenarios.py + params: + - configs/scenarios.yaml: + - default.scenarios + outs: + - metrics/scenario_table.md + always_changed: true metrics: - metrics/metrics.json - metrics/fit_metrics.json diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index e6fbc8d..189c2ee 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,2 +1,3 @@ /fit_metrics.json /metrics.json +/scenario_table.md From 9b6aeae0da24418aca6eb38a3f71731fddac0f1e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 26 Mar 2024 22:32:44 +0000 Subject: [PATCH 25/53] medium model with scenario and upgraded autogluon --- modules/ml-pipeline/src/pipeline/dvc.lock | 84 ----------------------- 1 file changed, 84 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index b3c5814..d6bce15 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -85,21 +85,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet -<<<<<<< HEAD default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 md5: efa416abea618ae6220a0c3d597603cf.dir size: 44750997 -======= - default.prepare_data.train_proportion: 0.98 - outs: - - path: data/prepared_data/ - hash: md5 - md5: 544427230544c2cc526334e246db4845.dir - size: 26132493 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -110,13 +101,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 -<<<<<<< HEAD md5: efa416abea618ae6220a0c3d597603cf.dir size: 44750997 -======= - md5: 544427230544c2cc526334e246db4845.dir - size: 26132493 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 nfiles: 2 params: configs/build_model.yaml: @@ -148,7 +134,6 @@ stages: outs: - path: data/fit_predictions/ hash: md5 -<<<<<<< HEAD md5: de46250d454c4d713ab580b10ff3fd31.dir size: 3349318 nfiles: 1 @@ -161,20 +146,6 @@ stages: hash: md5 md5: 8a952a5e884c268e6059357a627b9251 size: 224 -======= - md5: 8f9e2059782dd55d3ecdad54b4551f6a.dir - size: 3630849 - nfiles: 1 - - path: data/model/ - hash: md5 - md5: e031eb3c3fdb63917aabfea745b56ac6.dir - size: 618445494 - nfiles: 31 - - path: metrics/fit_metrics.json - hash: md5 - md5: e68009f5b66230b3ee4cd2ffc9a2d697 - size: 222 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -184,7 +155,6 @@ stages: size: 2464 - path: data/model hash: md5 -<<<<<<< HEAD md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir size: 735951861 nfiles: 35 @@ -192,15 +162,6 @@ stages: hash: md5 md5: efa416abea618ae6220a0c3d597603cf.dir size: 44750997 -======= - md5: e031eb3c3fdb63917aabfea745b56ac6.dir - size: 618445494 - nfiles: 31 - - path: data/prepared_data - hash: md5 - md5: 544427230544c2cc526334e246db4845.dir - size: 26132493 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 nfiles: 2 params: configs/settings.yaml: @@ -212,13 +173,8 @@ stages: outs: - path: data/predictions/ hash: md5 -<<<<<<< HEAD md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir size: 463563 -======= - md5: 1c14c9ac9711f5d33a60890e3ca72454.dir - size: 90361 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -229,7 +185,6 @@ stages: size: 3484 - path: data/predictions hash: md5 -<<<<<<< HEAD md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir size: 463563 nfiles: 1 @@ -237,15 +192,6 @@ stages: hash: md5 md5: efa416abea618ae6220a0c3d597603cf.dir size: 44750997 -======= - md5: 1c14c9ac9711f5d33a60890e3ca72454.dir - size: 90361 - nfiles: 1 - - path: data/prepared_data - hash: md5 - md5: 544427230544c2cc526334e246db4845.dir - size: 26132493 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 nfiles: 2 params: configs/settings.yaml: @@ -255,55 +201,25 @@ stages: outs: - path: metrics/metrics.json hash: md5 -<<<<<<< HEAD md5: 9f863f47799d42c101eba3b03a179455 size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py -======= - md5: 98e59ea9569522a8665c4e6c1bea7473 - size: 222 - startup_cleanup: - cmd: python 0_startup_cleanup.py ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 deps: - path: 5_generate_scenarios.py hash: md5 md5: 30f80ffeb6ee50c5f7b82943a4dc7702 size: 4014 params: -<<<<<<< HEAD -======= - configs/settings.yaml: - default.startup_cleanup.artefacts: ./data - default.startup_cleanup.metrics: ./metrics - generate_scenerio_metrics: - cmd: python 5_generate_scenarios.py - deps: - - path: 5_generate_scenarios.py - hash: md5 - md5: 30f80ffeb6ee50c5f7b82943a4dc7702 - size: 4014 - params: ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: -<<<<<<< HEAD - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet -======= - - s3://retrofit-data-dev/scenario_data/recommendations_scoring_data.parquet ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 output_filepath: ./metrics/scenario_table.md outs: - path: metrics/scenario_table.md hash: md5 -<<<<<<< HEAD md5: 54856c66fca8b2ebd1fa4dea2d25734a size: 2133 -======= - md5: 3ee1966a06c1e5b9c37797597be94797 - size: 1648 ->>>>>>> d5f40a8eb294924e0525904d6ee1864999d77c23 From c3985e2104d9acfa112ad4b0247a47755c552e97 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 12:22:58 +0000 Subject: [PATCH 26/53] add metrics for scenarios --- .github/workflows/MLPipelinePullRequest.yml | 6 +++- .../src/pipeline/5_generate_scenarios.py | 33 ++++++++++++++++--- .../src/pipeline/configs/scenarios.yaml | 6 ++-- modules/ml-pipeline/src/pipeline/dvc.lock | 15 ++++++--- modules/ml-pipeline/src/pipeline/dvc.yaml | 1 + .../src/pipeline/metrics/.gitignore | 1 + 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 493aef9..8e59cc8 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -98,10 +98,14 @@ jobs: git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md - echo "## Scenario metrics" >> report.md + echo "## Scenario comparison" >> report.md cat metrics/scenario_table.md >> report.md + echo "## Scenario metrics" >> report.md + + cat metrics/scenario_metrics.md >> report.md + cml comment create report.md # echo "## Residuals plot from model" >> report.md diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 28bcb9d..9d2fa68 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -8,9 +8,11 @@ import os import pandas as pd from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient +from core.interface.InterfaceMetrics import MLMetrics from configs.post_prediction_logic import post_prediction_logic from core.DataClient import dataclient_factory from core.MLModels import model_factory +from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings @@ -30,7 +32,8 @@ model_filepath = build_model_params["model_save_filepath"] target = feature_process_params["feature_processor_config"]["target"] scenario_data_filepaths = scenarios_params["scenario_data_filepaths"] predictions_column_name = generate_predictions_params["predictions_column_name"] -output_filepath = scenarios_params["output_filepath"] +comparison_output_filepath = scenarios_params["comparison_output_filepath"] +metrics_output_filepath = scenarios_params["metrics_output_filepath"] logger.info(f"--- Initiate MLModel ---") @@ -51,15 +54,21 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) +logger.info(f"--- Initiate MLMetrics ---") + +metrics = metrics_factory(generate_metrics_params["metrics_type"]) + def generate_scenario_predictions( input_dataclient: DataClient, output_dataclient: DataClient, model: MLModel, + metrics: MLMetrics, model_filepath: str, scenario_data_filepaths: list, predictions_column_name: str, - output_filepath: str, + comparison_output_filepath: str, + metrics_output_filepath: str, ): """ Given the new model, we generate prediction for expected scenarios @@ -98,16 +107,30 @@ def generate_scenario_predictions( scenario_data[predictions_column_name] - scenario_data["sap_starting"] ) + logger.info("--- Generate Metrics ---") + + metrics_dict = metrics.generate_metrics( + scenario_data["impact"], scenario_data["predicted_impact"] + ) + + metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index() + metrics_df.columns = ["metric", "value"] + logger.info("--- Save prediction into metrics ---") output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]] output_dataclient.save_data( - obj=output_df, location=output_filepath, save_config=None + obj=output_df, location=comparison_output_filepath, save_config=None + ) + + output_dataclient.save_data( + obj=metrics_df, location=metrics_output_filepath, save_config=None ) if __name__ == "__main__": + logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- Generate Scenario Predictions ---") @@ -116,10 +139,12 @@ if __name__ == "__main__": input_dataclient=input_dataclient, output_dataclient=output_dataclient, model=model, + metrics=metrics, model_filepath=model_filepath, scenario_data_filepaths=scenario_data_filepaths, predictions_column_name=predictions_column_name, - output_filepath=output_filepath, + comparison_output_filepath=comparison_output_filepath, + metrics_output_filepath=metrics_output_filepath, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index e76336a..2df0cb6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -4,5 +4,7 @@ default: output_dataclient_type: local scenario_data_filepaths: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - output_filepath: ./metrics/scenario_table.md + # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + comparison_output_filepath: ./metrics/scenario_table.md + metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index d6bce15..104dc83 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -208,18 +208,23 @@ stages: deps: - path: 5_generate_scenarios.py hash: md5 - md5: 30f80ffeb6ee50c5f7b82943a4dc7702 - size: 4014 + md5: a18f6c6ae2082f038df47386cf3e418e + size: 4896 params: configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - output_filepath: ./metrics/scenario_table.md + - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + comparison_output_filepath: ./metrics/scenario_table.md + metrics_output_filepath: ./metrics/scenario_metrics.md outs: + - path: metrics/scenario_metrics.md + hash: md5 + md5: 64e7db945ff655ae03c20c9845f19106 + size: 363 - path: metrics/scenario_table.md hash: md5 - md5: 54856c66fca8b2ebd1fa4dea2d25734a + md5: d4f8afe07b774374aeaa48f1b7b8a5fc size: 2133 diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index 5ce35ce..6026a83 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -80,6 +80,7 @@ stages: - default.scenarios outs: - metrics/scenario_table.md + - metrics/scenario_metrics.md always_changed: true metrics: - metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index 189c2ee..6427764 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,3 +1,4 @@ /fit_metrics.json /metrics.json /scenario_table.md +/scenario_metrics.md From 1bb1f8d61fb2f157290e014113589b69383798cd Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 12:30:31 +0000 Subject: [PATCH 27/53] add metrics for scenarios --- .github/workflows/MLPipelinePullRequest.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 8e59cc8..451b0a8 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -102,6 +102,8 @@ jobs: cat metrics/scenario_table.md >> report.md + echo "" >> report.md + echo "## Scenario metrics" >> report.md cat metrics/scenario_metrics.md >> report.md From 64a5c9383377b4b9d3f96bc60a205f9134591ba4 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 28 Mar 2024 09:30:30 +0000 Subject: [PATCH 28/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 143a5f6..d864119 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.10.0", + "version": "v0.10.1", "stage": { "dev": "v0.10.0" }, From a7bb61433a0b9183fd550a6a6afbc79c67fb209c Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 28 Mar 2024 09:31:07 +0000 Subject: [PATCH 29/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index d864119..a3fa295 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.10.1", "stage": { - "dev": "v0.10.0" + "dev": "v0.10.1" }, "registered": true, "active": true From 7aeaa9a5f67b4d4efc38d7fb13acaa94d3773d20 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 28 Mar 2024 15:13:20 +0000 Subject: [PATCH 30/53] add c++ to docker, fixed dynaconf --- deployment/Dockerfile.prediction.lambda | 2 +- .../src/pipeline/requirements/predictions/requirements-dev.txt | 2 +- .../src/pipeline/requirements/predictions/requirements.txt | 2 +- .../src/pipeline/requirements/training/requirements-dev.txt | 2 +- .../src/pipeline/requirements/training/requirements.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index a2520ba..f8000bf 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} # Install necessary build tools - required to test locally -RUN yum install -y gcc python3-devel +RUN yum install -y gcc python3-devel gcc-c++ # Install python packages COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 258981d..734419a 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -2,6 +2,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 2ab48e9..937b000 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -2,6 +2,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 2024d84..fe06a4d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -3,7 +3,7 @@ boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 ray==2.6.3 -dynaconf==3.2.0 +dynaconf==3.2.1 alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index 84452a3..a5bccd3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1 From 94a6aaa38f9c0745e38625bca9d6964f33e7777c Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 28 Mar 2024 15:22:33 +0000 Subject: [PATCH 31/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index a3fa295..9adf478 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.10.1", + "version": "v0.11.0", "stage": { "dev": "v0.10.1" }, From e97c01c36664e31fa314e65360451ed92b9c246d Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 28 Mar 2024 15:23:18 +0000 Subject: [PATCH 32/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9adf478..97c4388 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.11.0", "stage": { - "dev": "v0.10.1" + "dev": "v0.11.0" }, "registered": true, "active": true From 179c334b6e1737e7836c3997a994dc21c6f3609e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 19 Apr 2024 14:38:57 +0100 Subject: [PATCH 33/53] add switch to turn off scenario data (for carbon and heat for now) --- .../ml-pipeline/src/pipeline/5_generate_scenarios.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 9d2fa68..6debe32 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -78,6 +78,18 @@ def generate_scenario_predictions( scenario_data = pd.DataFrame() + # If we have no scenario data, we can save empty dataframes + if scenario_data_filepaths is None: + logger.info("No scenario data filepaths provided") + output_dataclient.save_data( + obj=scenario_data, location=comparison_output_filepath, save_config=None + ) + + output_dataclient.save_data( + obj=scenario_data, location=metrics_output_filepath, save_config=None + ) + return + # Can have multiple scenario data files for scenario_data_filepath in scenario_data_filepaths: scenario_data = pd.concat( From 620c1d10a1a009d2416ecc927793e4caa9787715 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 19 Apr 2024 16:22:06 +0100 Subject: [PATCH 34/53] correct the dockerignore files and test model with just tabular --- .dockerignore | 9 ++ deployment/.dockerignore | 12 ++- modules/ml-pipeline/src/.dockerignore | 10 +- modules/ml-pipeline/src/Prediction.Dockerfile | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 94 ++++++------------- .../predictions/requirements-dev.txt | 2 +- .../requirements/predictions/requirements.txt | 2 +- .../training/requirements-dev.txt | 2 +- .../requirements/training/requirements.txt | 2 +- 9 files changed, 55 insertions(+), 80 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..84abbe6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/fit_predictions +modules/ml-pipeline/src/pipeline/data/prepared_data +modules/ml-pipeline/src/pipeline/data/model/allmodels +modules/ml-pipeline/src/pipeline/metrics +modules/ml-pipeline/src/pipeline/__pycache__ +modules/ml-pipeline/src/pipeline/.dvc +modules/ml-pipeline/src/pipeline/analysis +modules/ml-pipeline/src/pipeline/metrics diff --git a/deployment/.dockerignore b/deployment/.dockerignore index e01cbd5..8b8a7fb 100644 --- a/deployment/.dockerignore +++ b/deployment/.dockerignore @@ -1,4 +1,8 @@ -modules/ml-pipeline/src/pipeline/data/predictions* -modules/ml-pipeline/src/pipeline/data/prepared_data* -modules/ml-pipeline/src/pipeline/data/model/allmodels* -modules/ml-pipeline/src/pipeline/metrics* +modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/prepared_data +modules/ml-pipeline/src/pipeline/data/model/allmodels +modules/ml-pipeline/src/pipeline/metrics +modules/ml-pipeline/src/__pycache__ +modules/ml-pipeline/src/.dvc +modules/ml-pipeline/src/analysis +modules/ml-pipeline/src/metrics diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 14f71d7..5feb57d 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -1,4 +1,6 @@ -pipeline/data/predictions* -pipeline/data/prepared_data/train.parquet* -pipeline/data/model/allmodels* -pipeline/metrics* +pipeline/data/predictions +pipeline/data/prepared_data/train.parquet +pipeline/data/model/allmodels +pipeline/metrics +pipeline/.dvc +pipeline/analysis diff --git a/modules/ml-pipeline/src/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile index a6fc539..e0a292c 100644 --- a/modules/ml-pipeline/src/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -1,7 +1,7 @@ # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) FROM python:3.10.12-slim -RUN apt-get update && apt-get install -y libgomp1 +RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev COPY pipeline/requirements/predictions/requirements.txt requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 104dc83..ca4bfdd 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -35,46 +35,6 @@ stages: - number_habitable_rooms - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: - - uprn - - sap_starting - - hot_water_energy_eff_ending - - mainheat_energy_eff_ending - - constituency - - roof_energy_eff_ending - - walls_energy_eff_ending - - secondheat_description_ending - - property_type - - mainheatc_energy_eff_ending - - built_form - - walls_insulation_thickness_ending - - potential_energy_efficiency - - transaction_type_ending - - floor_thermal_transmittance_ending - - low_energy_lighting_ending - - heat_demand_starting - - photo_supply_ending - - carbon_starting - - walls_thermal_transmittance_ending - - roof_insulation_thickness_ending - - total_floor_area_ending - - number_open_fireplaces_ending - - windows_energy_eff_ending - - floor_height_ending - - extension_count_ending - - has_air_source_heat_pump_ending - - charging_system_ending - - construction_age_band - - glazed_type_ending - - roof_thermal_transmittance_ending - - floor_insulation_thickness_ending - - has_mains_gas_ending - - estimated_perimeter_starting - - energy_consumption_potential - - environment_impact_potential - - heater_type_ending - - multi_glaze_proportion_ending - - lighting_energy_eff_ending - - fixed_lighting_outlets_count default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending @@ -89,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir + size: 44713582 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -101,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir + size: 44713582 nfiles: 2 params: configs/build_model.yaml: @@ -134,18 +94,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: de46250d454c4d713ab580b10ff3fd31.dir - size: 3349318 + md5: 750cd7426e2909ed36bc05601b5e04c4.dir + size: 3349190 nfiles: 1 - path: data/model/ hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: aa5eb60be553f2a01e63783cf8f1fad1.dir + size: 765461992 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 8a952a5e884c268e6059357a627b9251 - size: 224 + md5: ca5190b3292210c57a58668fdb48296c + size: 226 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -155,13 +115,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: aa5eb60be553f2a01e63783cf8f1fad1.dir + size: 765461992 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir + size: 44713582 nfiles: 2 params: configs/settings.yaml: @@ -173,8 +133,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 934c94e5b1a2c70db3dc865ee056f460.dir + size: 463619 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -185,13 +145,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 934c94e5b1a2c70db3dc865ee056f460.dir + size: 463619 nfiles: 1 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir + size: 44713582 nfiles: 2 params: configs/settings.yaml: @@ -201,15 +161,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9f863f47799d42c101eba3b03a179455 + md5: aa2a511ac759225549636ba05d6b667c size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: a18f6c6ae2082f038df47386cf3e418e - size: 4896 + md5: 40506749fefd926d47c60ff5b16db307 + size: 5337 params: configs/scenarios.yaml: default.scenarios: @@ -222,9 +182,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 64e7db945ff655ae03c20c9845f19106 + md5: 4085a6ea3d044ad2fe7ac63b0a685fed size: 363 - path: metrics/scenario_table.md hash: md5 - md5: d4f8afe07b774374aeaa48f1b7b8a5fc + md5: a2b3da77921b5dcc10f7068646e0eae3 size: 2133 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 734419a..35875d9 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 937b000..6a96822 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index fe06a4d..6e1ea2f 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 ray==2.6.3 dynaconf==3.2.1 alibi==0.9.5 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index a5bccd3..e4e319c 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 pandas==2.1.4 -autogluon==1.0.0 +autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 dynaconf==3.2.1 From 50a3e2d5be0f0e02af3a16785cecab9e6674d8a2 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 19 Apr 2024 16:25:26 +0100 Subject: [PATCH 35/53] correct the dockerignore files and test model with just tabular --- deployment/.dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/deployment/.dockerignore b/deployment/.dockerignore index 8b8a7fb..c4103de 100644 --- a/deployment/.dockerignore +++ b/deployment/.dockerignore @@ -1,4 +1,5 @@ modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/fit_predictions modules/ml-pipeline/src/pipeline/data/prepared_data modules/ml-pipeline/src/pipeline/data/model/allmodels modules/ml-pipeline/src/pipeline/metrics From 380bd6b595566853ec9d183ca02ad137f4cb82d5 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 19 Apr 2024 17:34:10 +0100 Subject: [PATCH 36/53] correct the dockerignore files and test model with just tabular --- modules/ml-pipeline/src/.dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 5feb57d..2e9277d 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -2,5 +2,4 @@ pipeline/data/predictions pipeline/data/prepared_data/train.parquet pipeline/data/model/allmodels pipeline/metrics -pipeline/.dvc pipeline/analysis From 87e3cc391eff1d3cf3a21bdb7b8631a165b8b5bd Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 19 Apr 2024 17:48:15 +0100 Subject: [PATCH 37/53] push files to s3 --- modules/ml-pipeline/src/.dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 2e9277d..5feb57d 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -2,4 +2,5 @@ pipeline/data/predictions pipeline/data/prepared_data/train.parquet pipeline/data/model/allmodels pipeline/metrics +pipeline/.dvc pipeline/analysis From 7a3477c0e1db4ea28c7c714397ebb670ddea70c7 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 22 Apr 2024 13:30:58 +0100 Subject: [PATCH 38/53] change to all packages --- modules/ml-pipeline/src/.dockerignore | 1 + .../src/pipeline/requirements/predictions/requirements-dev.txt | 2 +- .../src/pipeline/requirements/predictions/requirements.txt | 2 +- .../src/pipeline/requirements/training/requirements-dev.txt | 2 +- .../src/pipeline/requirements/training/requirements.txt | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 5feb57d..f99f14d 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -1,5 +1,6 @@ pipeline/data/predictions pipeline/data/prepared_data/train.parquet +pipeline/data/fit_predictions pipeline/data/model/allmodels pipeline/metrics pipeline/.dvc diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 35875d9..4dc4c36 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 6a96822..35bdb05 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 6e1ea2f..93a042e 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 -autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 +autogluon.tabular[all]==1.0.0 ray==2.6.3 dynaconf==3.2.1 alibi==0.9.5 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index e4e319c..edeb764 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 pandas==2.1.4 -autogluon.tabular[lightgbm,xgboost,fastai]==1.0.0 +autogluon.tabular[all]==1.0.0 dynaconf==3.2.1 From 874b1db5f3db74d490fc75a57bcd9d132647e68d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 22 Apr 2024 19:01:56 +0100 Subject: [PATCH 39/53] add ignored file to dockerignore --- modules/ml-pipeline/src/.dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index 5feb57d..c9a79ac 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -1,4 +1,5 @@ pipeline/data/predictions +pipeline/data/fit_predictions pipeline/data/prepared_data/train.parquet pipeline/data/model/allmodels pipeline/metrics From f43d0774798c0f0042e36e4f4a36d70720a905ac Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 22 Apr 2024 19:10:40 +0100 Subject: [PATCH 40/53] use previous model with new downstream processes --- modules/ml-pipeline/src/pipeline/dvc.lock | 94 ++++++++++++++++------- 1 file changed, 67 insertions(+), 27 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index ca4bfdd..104dc83 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -35,6 +35,46 @@ stages: - number_habitable_rooms - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: + - uprn + - sap_starting + - hot_water_energy_eff_ending + - mainheat_energy_eff_ending + - constituency + - roof_energy_eff_ending + - walls_energy_eff_ending + - secondheat_description_ending + - property_type + - mainheatc_energy_eff_ending + - built_form + - walls_insulation_thickness_ending + - potential_energy_efficiency + - transaction_type_ending + - floor_thermal_transmittance_ending + - low_energy_lighting_ending + - heat_demand_starting + - photo_supply_ending + - carbon_starting + - walls_thermal_transmittance_ending + - roof_insulation_thickness_ending + - total_floor_area_ending + - number_open_fireplaces_ending + - windows_energy_eff_ending + - floor_height_ending + - extension_count_ending + - has_air_source_heat_pump_ending + - charging_system_ending + - construction_age_band + - glazed_type_ending + - roof_thermal_transmittance_ending + - floor_insulation_thickness_ending + - has_mains_gas_ending + - estimated_perimeter_starting + - energy_consumption_potential + - environment_impact_potential + - heater_type_ending + - multi_glaze_proportion_ending + - lighting_energy_eff_ending + - fixed_lighting_outlets_count default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending @@ -49,8 +89,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir - size: 44713582 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -61,8 +101,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir - size: 44713582 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/build_model.yaml: @@ -94,18 +134,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 750cd7426e2909ed36bc05601b5e04c4.dir - size: 3349190 + md5: de46250d454c4d713ab580b10ff3fd31.dir + size: 3349318 nfiles: 1 - path: data/model/ hash: md5 - md5: aa5eb60be553f2a01e63783cf8f1fad1.dir - size: 765461992 - nfiles: 36 + md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir + size: 735951861 + nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: ca5190b3292210c57a58668fdb48296c - size: 226 + md5: 8a952a5e884c268e6059357a627b9251 + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -115,13 +155,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: aa5eb60be553f2a01e63783cf8f1fad1.dir - size: 765461992 - nfiles: 36 + md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir + size: 735951861 + nfiles: 35 - path: data/prepared_data hash: md5 - md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir - size: 44713582 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/settings.yaml: @@ -133,8 +173,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 934c94e5b1a2c70db3dc865ee056f460.dir - size: 463619 + md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir + size: 463563 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -145,13 +185,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 934c94e5b1a2c70db3dc865ee056f460.dir - size: 463619 + md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir + size: 463563 nfiles: 1 - path: data/prepared_data hash: md5 - md5: a1ff276b1cbd2db0b8e2982cfd524b40.dir - size: 44713582 + md5: efa416abea618ae6220a0c3d597603cf.dir + size: 44750997 nfiles: 2 params: configs/settings.yaml: @@ -161,15 +201,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: aa2a511ac759225549636ba05d6b667c + md5: 9f863f47799d42c101eba3b03a179455 size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: 40506749fefd926d47c60ff5b16db307 - size: 5337 + md5: a18f6c6ae2082f038df47386cf3e418e + size: 4896 params: configs/scenarios.yaml: default.scenarios: @@ -182,9 +222,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 4085a6ea3d044ad2fe7ac63b0a685fed + md5: 64e7db945ff655ae03c20c9845f19106 size: 363 - path: metrics/scenario_table.md hash: md5 - md5: a2b3da77921b5dcc10f7068646e0eae3 + md5: d4f8afe07b774374aeaa48f1b7b8a5fc size: 2133 From b985bbf753e232f0fc321f36f19833751b555746 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 26 May 2024 09:28:00 +0100 Subject: [PATCH 41/53] new model with is_as_built_ending and is assumed columns --- .../src/pipeline/configs/scenarios.yaml | 3 +- .../src/pipeline/configs/settings.yaml | 3 +- modules/ml-pipeline/src/pipeline/dvc.lock | 92 ++++++------------- 3 files changed, 30 insertions(+), 68 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index 2df0cb6..8190888 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -5,6 +5,7 @@ default: scenario_data_filepaths: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index f42b2be..9d466af 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,7 +18,8 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 104dc83..f2fc8be 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -35,52 +35,12 @@ stages: - number_habitable_rooms - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: - - uprn - - sap_starting - - hot_water_energy_eff_ending - - mainheat_energy_eff_ending - - constituency - - roof_energy_eff_ending - - walls_energy_eff_ending - - secondheat_description_ending - - property_type - - mainheatc_energy_eff_ending - - built_form - - walls_insulation_thickness_ending - - potential_energy_efficiency - - transaction_type_ending - - floor_thermal_transmittance_ending - - low_energy_lighting_ending - - heat_demand_starting - - photo_supply_ending - - carbon_starting - - walls_thermal_transmittance_ending - - roof_insulation_thickness_ending - - total_floor_area_ending - - number_open_fireplaces_ending - - windows_energy_eff_ending - - floor_height_ending - - extension_count_ending - - has_air_source_heat_pump_ending - - charging_system_ending - - construction_age_band - - glazed_type_ending - - roof_thermal_transmittance_ending - - floor_insulation_thickness_ending - - has_mains_gas_ending - - estimated_perimeter_starting - - energy_consumption_potential - - environment_impact_potential - - heater_type_ending - - multi_glaze_proportion_ending - - lighting_energy_eff_ending - - fixed_lighting_outlets_count default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -89,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 26bbe6b1dafae18eb50689604b925c87.dir + size: 45002224 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -101,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 26bbe6b1dafae18eb50689604b925c87.dir + size: 45002224 nfiles: 2 params: configs/build_model.yaml: @@ -134,18 +94,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: de46250d454c4d713ab580b10ff3fd31.dir - size: 3349318 + md5: 7ba44b4af6ecedf3ffebcf7512731d3d.dir + size: 3348905 nfiles: 1 - path: data/model/ hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 + md5: ef62a6f9b9336fb5b648589d6e0d54d6.dir + size: 737305293 nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: 8a952a5e884c268e6059357a627b9251 - size: 224 + md5: 38600703e1ece1447e5d0fd80b4de8b7 + size: 217 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -155,13 +115,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 + md5: ef62a6f9b9336fb5b648589d6e0d54d6.dir + size: 737305293 nfiles: 35 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 26bbe6b1dafae18eb50689604b925c87.dir + size: 45002224 nfiles: 2 params: configs/settings.yaml: @@ -173,8 +133,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 7d88320b1cd3c690032438fad6cb2200.dir + size: 463523 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -185,13 +145,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: 7d88320b1cd3c690032438fad6cb2200.dir + size: 463523 nfiles: 1 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 26bbe6b1dafae18eb50689604b925c87.dir + size: 45002224 nfiles: 2 params: configs/settings.yaml: @@ -201,8 +161,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9f863f47799d42c101eba3b03a179455 - size: 224 + md5: e17ee59f3a04178a153c5746bf897e74 + size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: @@ -216,15 +176,15 @@ stages: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 64e7db945ff655ae03c20c9845f19106 + md5: 84fcae91af1480ee0c8fc1616af359b3 size: 363 - path: metrics/scenario_table.md hash: md5 - md5: d4f8afe07b774374aeaa48f1b7b8a5fc + md5: 8571ff812c2f7c71eb0b1534ff6ecff5 size: 2133 From a4dffe527a6fb2e4a53729d32036f5e11fdc9aae Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 26 May 2024 09:47:08 +0100 Subject: [PATCH 42/53] add testing script --- .github/workflows/MLPipelineTESTING.yml | 238 ++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 .github/workflows/MLPipelineTESTING.yml diff --git a/.github/workflows/MLPipelineTESTING.yml b/.github/workflows/MLPipelineTESTING.yml new file mode 100644 index 0000000..92c1792 --- /dev/null +++ b/.github/workflows/MLPipelineTESTING.yml @@ -0,0 +1,238 @@ +name: Register the model for the given pipeline branch (TESTING) + +on: + push: + branches: + - "sap-dev-gto" + +# on: +# pull_request: +# types: +# - closed +# branches: +# - "sap-dev" +# - "heat-dev" +# - "carbon-dev" + +permissions: write-all + +jobs: + Register-Major-Model-Dev: + # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'major')) }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Register Model + run: | + # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') || false + if [ -z "${latest_version}" ]; then + increment_version="1.0.0" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + major = $1 + 1 # Increment the major version + print major, "0", "0" # Print the new version + }') + fi + + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + # git tag -a ${new_tag} -m "Registering new Major Version" + # git push origin ${new_tag} + + # gto show --json > MODEL_REGISTRY.md + # git add . + # git commit -m "Update Registry" + # git push + + Register-Minor-Model-Dev: + # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'minor')) }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Register Model + run: | + # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + if [ -z "${latest_version}" ]; then + increment_version="0.1.0" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + minor = $2 + 1 # Increment the minor version + print $1, minor, "0" # Print the new version + }') + fi + + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + # git tag -a ${new_tag} -m "Registering new Minor Version" + # git push origin ${new_tag} + + # gto show --json > MODEL_REGISTRY.md + # git add . + # git commit -m "Update Registry" + # git push + + Register-Patch-Model-Dev: + # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'patch')) }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Register Model + run: | + # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + if [ -z "${latest_version}" ]; then + increment_version="0.0.1" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + patch = $3 + 1 # Increment the patch version + print $1, $2, patch # Print the new version + }') + fi + + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + # git tag -a ${new_tag} -m "Registering new Patch Version" + # git push origin ${new_tag} + + # gto show --json > MODEL_REGISTRY.md + # git add . + # git commit -m "Update Registry" + # git push + + Promote-Artefacts-To-Dev: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Install packages to retrieve artifacts + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Retrieve artifacts (dvc.lock) + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-pipeline/src/pipeline + dvc pull -r experiments + + - name: Push artifacts to Dev + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-pipeline/src/pipeline + dvc push -r dev + + Register-New-Model-Dev: + needs: [Register-Major-Model-Dev, Register-Minor-Model-Dev, Register-Patch-Model-Dev] + if: | + always() && + (needs.Register-Major-Model-Dev.result == 'success' || needs.Register-Major-Model-Dev.result == 'skipped') && + (needs.Register-Minor-Model-Dev.result == 'success' || needs.Register-Minor-Model-Dev.result == 'skipped') && + (needs.Register-Patch-Model-Dev.result == 'success' || needs.Register-Patch-Model-Dev.result == 'skipped') + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Register Model + env: + TARGET_BRANCH: ${{ github.base_ref }} + run: | + + REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') + if [ -z "${latest_dev_version}" ]; then + increment_version="1" + else + increment_version=$(echo ${latest_dev_version} | awk '{print $NF}' | awk -F"#" '{print $3}' | awk '{$1++; print}') + fi + + new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') + + echo ${new_tag} + + commit_hash=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk "/${latest_version}/" | awk '{print $(NF-1)}') + git checkout ${commit_hash} + + # git pull #Get new model registry md file changes + git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}" + git push origin ${new_tag} + + git checkout ${TARGET_BRANCH} + git fetch --all + git pull + + gto show --json > MODEL_REGISTRY.md + git add . + git commit -m "Update Registry" + git push origin ${TARGET_BRANCH} From 0768ace94787e547c637ac6b6e4ae395db2edf52 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 26 May 2024 09:50:39 +0100 Subject: [PATCH 43/53] add testing script --- .github/workflows/MLPipelineTESTING.yml | 176 ++++++++++++------------ 1 file changed, 89 insertions(+), 87 deletions(-) diff --git a/.github/workflows/MLPipelineTESTING.yml b/.github/workflows/MLPipelineTESTING.yml index 92c1792..f2a200a 100644 --- a/.github/workflows/MLPipelineTESTING.yml +++ b/.github/workflows/MLPipelineTESTING.yml @@ -38,21 +38,23 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') || false - if [ -z "${latest_version}" ]; then - increment_version="1.0.0" - else - increment_version=$(echo ${latest_version} | awk 'BEGIN { - FS="\\." # Set the field separator to a period - OFS="." # Set the output field separator to a period - } - { - major = $1 + 1 # Increment the major version - print major, "0", "0" # Print the new version - }') - fi + gto show - new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') || false + # if [ -z "${latest_version}" ]; then + # increment_version="1.0.0" + # else + # increment_version=$(echo ${latest_version} | awk 'BEGIN { + # FS="\\." # Set the field separator to a period + # OFS="." # Set the output field separator to a period + # } + # { + # major = $1 + 1 # Increment the major version + # print major, "0", "0" # Print the new version + # }') + # fi + + # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} # git tag -a ${new_tag} -m "Registering new Major Version" # git push origin ${new_tag} @@ -83,21 +85,21 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') - if [ -z "${latest_version}" ]; then - increment_version="0.1.0" - else - increment_version=$(echo ${latest_version} | awk 'BEGIN { - FS="\\." # Set the field separator to a period - OFS="." # Set the output field separator to a period - } - { - minor = $2 + 1 # Increment the minor version - print $1, minor, "0" # Print the new version - }') - fi + # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + # if [ -z "${latest_version}" ]; then + # increment_version="0.1.0" + # else + # increment_version=$(echo ${latest_version} | awk 'BEGIN { + # FS="\\." # Set the field separator to a period + # OFS="." # Set the output field separator to a period + # } + # { + # minor = $2 + 1 # Increment the minor version + # print $1, minor, "0" # Print the new version + # }') + # fi - new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} # git tag -a ${new_tag} -m "Registering new Minor Version" # git push origin ${new_tag} @@ -128,21 +130,21 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') - if [ -z "${latest_version}" ]; then - increment_version="0.0.1" - else - increment_version=$(echo ${latest_version} | awk 'BEGIN { - FS="\\." # Set the field separator to a period - OFS="." # Set the output field separator to a period - } - { - patch = $3 + 1 # Increment the patch version - print $1, $2, patch # Print the new version - }') - fi + # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + # if [ -z "${latest_version}" ]; then + # increment_version="0.0.1" + # else + # increment_version=$(echo ${latest_version} | awk 'BEGIN { + # FS="\\." # Set the field separator to a period + # OFS="." # Set the output field separator to a period + # } + # { + # patch = $3 + 1 # Increment the patch version + # print $1, $2, patch # Print the new version + # }') + # fi - new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} # git tag -a ${new_tag} -m "Registering new Patch Version" # git push origin ${new_tag} @@ -179,60 +181,60 @@ jobs: cd modules/ml-pipeline/src/pipeline dvc push -r dev - Register-New-Model-Dev: - needs: [Register-Major-Model-Dev, Register-Minor-Model-Dev, Register-Patch-Model-Dev] - if: | - always() && - (needs.Register-Major-Model-Dev.result == 'success' || needs.Register-Major-Model-Dev.result == 'skipped') && - (needs.Register-Minor-Model-Dev.result == 'success' || needs.Register-Minor-Model-Dev.result == 'skipped') && - (needs.Register-Patch-Model-Dev.result == 'success' || needs.Register-Patch-Model-Dev.result == 'skipped') + # Register-New-Model-Dev: + # needs: [Register-Major-Model-Dev, Register-Minor-Model-Dev, Register-Patch-Model-Dev] + # if: | + # always() && + # (needs.Register-Major-Model-Dev.result == 'success' || needs.Register-Major-Model-Dev.result == 'skipped') && + # (needs.Register-Minor-Model-Dev.result == 'success' || needs.Register-Minor-Model-Dev.result == 'skipped') && + # (needs.Register-Patch-Model-Dev.result == 'success' || needs.Register-Patch-Model-Dev.result == 'skipped') - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # with: + # fetch-depth: 0 - - name: Install packages to register model - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + # - name: Install packages to register model + # run: | + # pip install --upgrade pip + # pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - name: Register Model - env: - TARGET_BRANCH: ${{ github.base_ref }} - run: | + # - name: Register Model + # env: + # TARGET_BRANCH: ${{ github.base_ref }} + # run: | - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + # # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - git config user.name "Github-Bot" - git config user.email "Github-Bot@no-reply.com" + # git config user.name "Github-Bot" + # git config user.email "Github-Bot@no-reply.com" - latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') - if [ -z "${latest_dev_version}" ]; then - increment_version="1" - else - increment_version=$(echo ${latest_dev_version} | awk '{print $NF}' | awk -F"#" '{print $3}' | awk '{$1++; print}') - fi + # latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') + # if [ -z "${latest_dev_version}" ]; then + # increment_version="1" + # else + # increment_version=$(echo ${latest_dev_version} | awk '{print $NF}' | awk -F"#" '{print $3}' | awk '{$1++; print}') + # fi - new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') + # new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} + # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') - echo ${new_tag} + # echo ${new_tag} - commit_hash=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk "/${latest_version}/" | awk '{print $(NF-1)}') - git checkout ${commit_hash} + # commit_hash=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk "/${latest_version}/" | awk '{print $(NF-1)}') + # git checkout ${commit_hash} - # git pull #Get new model registry md file changes - git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}" - git push origin ${new_tag} + # # git pull #Get new model registry md file changes + # git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}" + # git push origin ${new_tag} - git checkout ${TARGET_BRANCH} - git fetch --all - git pull + # git checkout ${TARGET_BRANCH} + # git fetch --all + # git pull - gto show --json > MODEL_REGISTRY.md - git add . - git commit -m "Update Registry" - git push origin ${TARGET_BRANCH} + # gto show --json > MODEL_REGISTRY.md + # git add . + # git commit -m "Update Registry" + # git push origin ${TARGET_BRANCH} From 9e23eae27af4c22a5f52280c718b1551a6db31ca Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 26 May 2024 09:54:22 +0100 Subject: [PATCH 44/53] add testing script --- .../pipeline/requirements/version_control/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index a2b9531..173550d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.36.0 -dvc-s3==3.0.1 -gto==1.6.1 +dvc==3.51.0 +dvc-s3==3.2.0 +gto==1.7.1 pyOpenSSL==23.3.0 From e0954b52bce8088ec2b1550d2a58fd40de454b87 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 26 May 2024 09:56:05 +0100 Subject: [PATCH 45/53] Upgrade dvc packages to fix pygit2 error --- .github/workflows/MLPipelineTESTING.yml | 240 ------------------------ 1 file changed, 240 deletions(-) delete mode 100644 .github/workflows/MLPipelineTESTING.yml diff --git a/.github/workflows/MLPipelineTESTING.yml b/.github/workflows/MLPipelineTESTING.yml deleted file mode 100644 index f2a200a..0000000 --- a/.github/workflows/MLPipelineTESTING.yml +++ /dev/null @@ -1,240 +0,0 @@ -name: Register the model for the given pipeline branch (TESTING) - -on: - push: - branches: - - "sap-dev-gto" - -# on: -# pull_request: -# types: -# - closed -# branches: -# - "sap-dev" -# - "heat-dev" -# - "carbon-dev" - -permissions: write-all - -jobs: - Register-Major-Model-Dev: - # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'major')) }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install packages to register model - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - - name: Register Model - run: | - # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - - git config user.name "Github-Bot" - git config user.email "Github-Bot@no-reply.com" - - gto show - - # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') || false - # if [ -z "${latest_version}" ]; then - # increment_version="1.0.0" - # else - # increment_version=$(echo ${latest_version} | awk 'BEGIN { - # FS="\\." # Set the field separator to a period - # OFS="." # Set the output field separator to a period - # } - # { - # major = $1 + 1 # Increment the major version - # print major, "0", "0" # Print the new version - # }') - # fi - - # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} - - # git tag -a ${new_tag} -m "Registering new Major Version" - # git push origin ${new_tag} - - # gto show --json > MODEL_REGISTRY.md - # git add . - # git commit -m "Update Registry" - # git push - - Register-Minor-Model-Dev: - # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'minor')) }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install packages to register model - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - - name: Register Model - run: | - # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - - git config user.name "Github-Bot" - git config user.email "Github-Bot@no-reply.com" - - # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') - # if [ -z "${latest_version}" ]; then - # increment_version="0.1.0" - # else - # increment_version=$(echo ${latest_version} | awk 'BEGIN { - # FS="\\." # Set the field separator to a period - # OFS="." # Set the output field separator to a period - # } - # { - # minor = $2 + 1 # Increment the minor version - # print $1, minor, "0" # Print the new version - # }') - # fi - - # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} - - # git tag -a ${new_tag} -m "Registering new Minor Version" - # git push origin ${new_tag} - - # gto show --json > MODEL_REGISTRY.md - # git add . - # git commit -m "Update Registry" - # git push - - Register-Patch-Model-Dev: - # if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'patch')) }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install packages to register model - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - - name: Register Model - run: | - # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - - git config user.name "Github-Bot" - git config user.email "Github-Bot@no-reply.com" - - # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') - # if [ -z "${latest_version}" ]; then - # increment_version="0.0.1" - # else - # increment_version=$(echo ${latest_version} | awk 'BEGIN { - # FS="\\." # Set the field separator to a period - # OFS="." # Set the output field separator to a period - # } - # { - # patch = $3 + 1 # Increment the patch version - # print $1, $2, patch # Print the new version - # }') - # fi - - # new_tag=${REGISTER_MODEL_NAME}@v${increment_version} - - # git tag -a ${new_tag} -m "Registering new Patch Version" - # git push origin ${new_tag} - - # gto show --json > MODEL_REGISTRY.md - # git add . - # git commit -m "Update Registry" - # git push - - Promote-Artefacts-To-Dev: - if: github.event.pull_request.merged == true - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Install packages to retrieve artifacts - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - - name: Retrieve artifacts (dvc.lock) - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - cd modules/ml-pipeline/src/pipeline - dvc pull -r experiments - - - name: Push artifacts to Dev - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - cd modules/ml-pipeline/src/pipeline - dvc push -r dev - - # Register-New-Model-Dev: - # needs: [Register-Major-Model-Dev, Register-Minor-Model-Dev, Register-Patch-Model-Dev] - # if: | - # always() && - # (needs.Register-Major-Model-Dev.result == 'success' || needs.Register-Major-Model-Dev.result == 'skipped') && - # (needs.Register-Minor-Model-Dev.result == 'success' || needs.Register-Minor-Model-Dev.result == 'skipped') && - # (needs.Register-Patch-Model-Dev.result == 'success' || needs.Register-Patch-Model-Dev.result == 'skipped') - - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # with: - # fetch-depth: 0 - - # - name: Install packages to register model - # run: | - # pip install --upgrade pip - # pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - - # - name: Register Model - # env: - # TARGET_BRANCH: ${{ github.base_ref }} - # run: | - - # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - # # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - - # git config user.name "Github-Bot" - # git config user.email "Github-Bot@no-reply.com" - - # latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') - # if [ -z "${latest_dev_version}" ]; then - # increment_version="1" - # else - # increment_version=$(echo ${latest_dev_version} | awk '{print $NF}' | awk -F"#" '{print $3}' | awk '{$1++; print}') - # fi - - # new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - # latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') - - # echo ${new_tag} - - # commit_hash=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk "/${latest_version}/" | awk '{print $(NF-1)}') - # git checkout ${commit_hash} - - # # git pull #Get new model registry md file changes - # git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}" - # git push origin ${new_tag} - - # git checkout ${TARGET_BRANCH} - # git fetch --all - # git pull - - # gto show --json > MODEL_REGISTRY.md - # git add . - # git commit -m "Update Registry" - # git push origin ${TARGET_BRANCH} From a78c5a50b0976541239a696f68b9422ff63fca83 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sun, 26 May 2024 09:07:46 +0000 Subject: [PATCH 46/53] Update Registry --- MODEL_REGISTRY.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 97c4388..606521a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.11.0", + "version": "v0.12.0", "stage": { "dev": "v0.11.0" }, @@ -16,17 +16,17 @@ "active": true }, "heat": { - "version": "v0.4.0", + "version": "v0.5.0", "stage": { - "dev": "v0.4.0" + "dev": "v0.5.0" }, "registered": true, "active": true }, "carbon": { - "version": "v0.4.0", + "version": "v0.5.0", "stage": { - "dev": "v0.4.0" + "dev": "v0.5.0" }, "registered": true, "active": true From 396a5ffb087acf0186f3762cc117fad18a0668d2 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sun, 26 May 2024 09:08:23 +0000 Subject: [PATCH 47/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 606521a..9b31dab 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.12.0", "stage": { - "dev": "v0.11.0" + "dev": "v0.12.0" }, "registered": true, "active": true From 5e0118ca0b1b03d32540050d8b2880b08e7273a8 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 28 May 2024 16:55:47 +0100 Subject: [PATCH 48/53] change deployment - pineed serverless pajkage --- .github/workflows/Deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 6e34d36..265a324 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -19,8 +19,8 @@ jobs: - name: Install Serverless and plugins run: | - npm install -g serverless - npm install -g serverless-domain-manager + npm install -g serverless@^3.38.0 + npm install -g serverless-domain-manager@^7.3.8 - name: Install DVC run: | From 6f00d6afb80d1aa572181402430832674f86e9dd Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 28 May 2024 15:57:55 +0000 Subject: [PATCH 49/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9b31dab..5d47b16 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.12.0", + "version": "v0.13.0", "stage": { "dev": "v0.12.0" }, From dc260fddd0efdc4a90e80fbd0438a121ca0234f1 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 28 May 2024 15:58:31 +0000 Subject: [PATCH 50/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 5d47b16..ff4b5a3 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.13.0", "stage": { - "dev": "v0.12.0" + "dev": "v0.13.0" }, "registered": true, "active": true From 8399092e20805f614e738924e1e3c69c8d9d5fec Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 28 May 2024 19:58:46 +0100 Subject: [PATCH 51/53] formatting --- .../src/pipeline/configs/scenarios.yaml | 4 +- .../src/pipeline/configs/settings.yaml | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 58 +++++++++---------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index 8190888..0d4ee07 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -6,6 +6,8 @@ default: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 9d466af..838e9a9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -19,7 +19,9 @@ default: input_dataclient_type: aws-s3 output_dataclient_type: local # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index f2fc8be..31315db 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -40,7 +40,7 @@ stages: default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet + s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -49,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 26bbe6b1dafae18eb50689604b925c87.dir - size: 45002224 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -61,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 26bbe6b1dafae18eb50689604b925c87.dir - size: 45002224 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/build_model.yaml: @@ -94,18 +94,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 7ba44b4af6ecedf3ffebcf7512731d3d.dir - size: 3348905 + md5: d9c9afc05e8780db47c0548b19bf7d19.dir + size: 3349989 nfiles: 1 - path: data/model/ hash: md5 - md5: ef62a6f9b9336fb5b648589d6e0d54d6.dir - size: 737305293 - nfiles: 35 + md5: 13c3100e1486c27a83a8a47491077842.dir + size: 773523079 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 38600703e1ece1447e5d0fd80b4de8b7 - size: 217 + md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -115,13 +115,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: ef62a6f9b9336fb5b648589d6e0d54d6.dir - size: 737305293 - nfiles: 35 + md5: 13c3100e1486c27a83a8a47491077842.dir + size: 773523079 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: 26bbe6b1dafae18eb50689604b925c87.dir - size: 45002224 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/settings.yaml: @@ -133,8 +133,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 7d88320b1cd3c690032438fad6cb2200.dir - size: 463523 + md5: 5d07bcebf3160a72bb18dfd79106e85c.dir + size: 463197 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -145,13 +145,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 7d88320b1cd3c690032438fad6cb2200.dir - size: 463523 + md5: 5d07bcebf3160a72bb18dfd79106e85c.dir + size: 463197 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 26bbe6b1dafae18eb50689604b925c87.dir - size: 45002224 + md5: 80c9e138146a1d96b9d16091c207e2e8.dir + size: 45056059 nfiles: 2 params: configs/settings.yaml: @@ -161,30 +161,30 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: e17ee59f3a04178a153c5746bf897e74 + md5: 3e08df02fd5c5d094bcf936e1338d596 size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: a18f6c6ae2082f038df47386cf3e418e - size: 4896 + md5: 40506749fefd926d47c60ff5b16db307 + size: 5337 params: configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 84fcae91af1480ee0c8fc1616af359b3 + md5: fa4d6d7bbd7818613800da5f8f37ea96 size: 363 - path: metrics/scenario_table.md hash: md5 - md5: 8571ff812c2f7c71eb0b1534ff6ecff5 + md5: d6baf100a1623cc2467c2f8221d314c9 size: 2133 From 9925b54af23d7f0a840de3ff1758a9955bb46f5f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 30 May 2024 11:47:04 +0000 Subject: [PATCH 52/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index ff4b5a3..971208e 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.13.0", + "version": "v0.14.0", "stage": { "dev": "v0.13.0" }, From d09c534e0d99876ea282ff6dd7cb4600196cc419 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 30 May 2024 11:47:46 +0000 Subject: [PATCH 53/53] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 971208e..2fea343 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.14.0", "stage": { - "dev": "v0.13.0" + "dev": "v0.14.0" }, "registered": true, "active": true