From f9b0b6112c351aca229baa9173ec401a431704be Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 9 Oct 2023 15:44:37 +0000 Subject: [PATCH 01/47] add some processing ocde --- .../src/pipeline/configs/analysis.yaml | 2 +- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 25 +++++++++++++++++++ modules/ml-pipeline/src/pipeline/eda.py | 10 ++++---- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml index 5c6e749..725660b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml @@ -13,4 +13,4 @@ default: dataclient_type: local nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower - row_index: [0, 10, 20] # index of an example datapoint + row_index: [20695, 50243, 7653] # index of an example datapoint diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..bd684e9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 180 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c32d2fe..4943f6b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,6 +18,28 @@ def remove_starting_columns(df): return df +def remove_floor_height_ending(df): + # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] + # shows bottom 0.5 percentile is 1.665 + # So keep anything above this + df = df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True) + print("we in here") + return df + + +def remove_minimum_habitable_room_size(df): + # Need minimum of 6.5m per habitable room + df = df[ + df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5 + ].reset_index(drop=True) + return df + + +def keep_flats(df): + df = df[df["PROPERTY_TYPE"] == "Flat"] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -27,6 +49,9 @@ def remove_starting_columns(df): # return df business_logic = { + # "keep_flats": keep_flats, + # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, + # "remove_floor_height_ending": remove_floor_height_ending # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 2fdd8be..6c29308 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) cosine_similarity_df = mix_df[ - mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"]) + mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"]) ] from sklearn.metrics.pairwise import cosine_similarity -row_index = 58199 +row_index = 20695 from sklearn.preprocessing import LabelEncoder @@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) -similar_index = ( - cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index -) + +similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5) +similar_index = similar_df.index check_df = mix_df.loc[similar_index] From 790c3a9456eec4b05e831a99ed80a6963aeba0c9 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 18 Oct 2023 13:27:25 +0000 Subject: [PATCH 02/47] use test dataset --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++---------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index bd684e9..d296e6a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 180 + time_limit: 4000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ce7ed2c..9333c46 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c499874..16eb857 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: SAP_ENDING default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/build_model.yaml: @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 7a5527f779efcb1a7db068148b6bcc45.dir + size: 422448184 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 2bb16ac67de8778fbc08171d562b34d5 - size: 184 + md5: 77790bb9485c04c77125e361921c3774 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 7a5527f779efcb1a7db068148b6bcc45.dir + size: 422448184 nfiles: 27 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir + size: 346687 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir + size: 346687 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 2e13ae67759a64261d03224f1c0d4bf4 - size: 185 + md5: 7afd04d656dc83ad6aa942d9c63f5b4e + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From b2e5fd9419b113a687e67b5791ab0fa4ceec9939 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 19 Oct 2023 01:19:29 +0000 Subject: [PATCH 03/47] Update Registry --- MODEL_REGISTRY.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index b3ad75a..4da80a8 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.1.0", + "version": "v0.2.0", "stage": { "dev": "v0.1.0" }, @@ -22,5 +22,13 @@ }, "registered": true, "active": true + }, + "carbon": { + "version": "v0.0.1", + "stage": { + "dev": "v0.0.1" + }, + "registered": true, + "active": true } } From 7efb9101031f085f79d746e6ca3002204d6e7aec Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 19 Oct 2023 01:20:19 +0000 Subject: [PATCH 04/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 4da80a8..e4413a0 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.0", "stage": { - "dev": "v0.1.0" + "dev": "v0.2.0" }, "registered": true, "active": true From e1cf3a48a9b6e91894702eb2ea4913ed87981026 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 02:27:26 +0100 Subject: [PATCH 05/47] add dockerignore file for prediction lamda --- deployment/.dockerignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 deployment/.dockerignore diff --git a/deployment/.dockerignore b/deployment/.dockerignore new file mode 100644 index 0000000..d88bb1f --- /dev/null +++ b/deployment/.dockerignore @@ -0,0 +1,4 @@ +modules/ml-pipeline/src/pipeline/data/predictions +modules/ml-pipeline/src/pipeline/data/prepared_data +modules/ml-pipeline/src/pipeline/data/model/allmodels +modules/ml-pipeline/src/pipeline/metrics From fbd235addff6e5e8d52c7cbfc9b4717f03fca4b0 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 02:39:09 +0100 Subject: [PATCH 06/47] add dockerignore for verify step --- deployment/.dockerignore | 8 ++++---- modules/ml-pipeline/src/.dockerignore | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 modules/ml-pipeline/src/.dockerignore diff --git a/deployment/.dockerignore b/deployment/.dockerignore index d88bb1f..e01cbd5 100644 --- a/deployment/.dockerignore +++ b/deployment/.dockerignore @@ -1,4 +1,4 @@ -modules/ml-pipeline/src/pipeline/data/predictions -modules/ml-pipeline/src/pipeline/data/prepared_data -modules/ml-pipeline/src/pipeline/data/model/allmodels -modules/ml-pipeline/src/pipeline/metrics +modules/ml-pipeline/src/pipeline/data/predictions* +modules/ml-pipeline/src/pipeline/data/prepared_data* +modules/ml-pipeline/src/pipeline/data/model/allmodels* +modules/ml-pipeline/src/pipeline/metrics* diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore new file mode 100644 index 0000000..ce8997b --- /dev/null +++ b/modules/ml-pipeline/src/.dockerignore @@ -0,0 +1,4 @@ +pipeline/data/predictions* +pipeline/data/prepared_data* +pipeline/data/model/allmodels* +pipeline/metrics* From a44fe33998b6851875ba73427229989108d42802 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 02:48:17 +0100 Subject: [PATCH 07/47] add the test data back to get it to run --- modules/ml-pipeline/src/.dockerignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/.dockerignore b/modules/ml-pipeline/src/.dockerignore index ce8997b..14f71d7 100644 --- a/modules/ml-pipeline/src/.dockerignore +++ b/modules/ml-pipeline/src/.dockerignore @@ -1,4 +1,4 @@ pipeline/data/predictions* -pipeline/data/prepared_data* +pipeline/data/prepared_data/train.parquet* pipeline/data/model/allmodels* pipeline/metrics* From ddf3ad3b4037b9ce93ae57708337b991bd68a8ed Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 02:56:58 +0100 Subject: [PATCH 08/47] add dependency for workflow files --- .github/workflows/Deploy.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 48375c3..ec912f8 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -3,6 +3,10 @@ name: Sap Change Model Deploy on: push: branches: [ sap-dev, sap-prod ] + workflow_run: + workflows: [MLPipelinePostMerge] + types: + - completed jobs: deploy: From ad98ec4f1a30e4c6e870f072adc8704b7f6d3965 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 01:58:57 +0000 Subject: [PATCH 09/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index e4413a0..8c94d23 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.0", + "version": "v0.2.1", "stage": { "dev": "v0.2.0" }, From b50e0ef1bace97f2265c116dab967f69ca198239 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 01:59:48 +0000 Subject: [PATCH 10/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 8c94d23..f1b1b44 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.1", "stage": { - "dev": "v0.2.0" + "dev": "v0.2.1" }, "registered": true, "active": true From 0c87f21673e9dd925dbb675b6155a051eae3c85f Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 03:08:13 +0100 Subject: [PATCH 11/47] test just a single dependency --- .github/workflows/Deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index ec912f8..6f7124a 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -1,8 +1,8 @@ name: Sap Change Model Deploy on: - push: - branches: [ sap-dev, sap-prod ] + # push: + # branches: [ sap-dev, sap-prod ] workflow_run: workflows: [MLPipelinePostMerge] types: From 652bdd34676cad7abe592f5fc4c8630bc2fc5454 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 02:11:10 +0000 Subject: [PATCH 12/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index f1b1b44..b1133de 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.1", + "version": "v0.2.2", "stage": { "dev": "v0.2.1" }, From c5a9b548ab7eb63b0d3df0cfc3e4171b9be46e64 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 02:12:04 +0000 Subject: [PATCH 13/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index b1133de..e1e7c82 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.2", "stage": { - "dev": "v0.2.1" + "dev": "v0.2.2" }, "registered": true, "active": true From dadcbbab3aa8535dd2ba03e39b12c1c09712e0d7 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 03:13:24 +0100 Subject: [PATCH 14/47] revert back for now --- .github/workflows/Deploy.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 6f7124a..48375c3 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -1,12 +1,8 @@ name: Sap Change Model Deploy on: - # push: - # branches: [ sap-dev, sap-prod ] - workflow_run: - workflows: [MLPipelinePostMerge] - types: - - completed + push: + branches: [ sap-dev, sap-prod ] jobs: deploy: From 72d4dbae3fdf2b775ec4c6d7a35b8ca130519a71 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 02:15:23 +0000 Subject: [PATCH 15/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index e1e7c82..f7b2a20 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.2", + "version": "v0.2.3", "stage": { "dev": "v0.2.2" }, From c605d6b54933ecd4550f51320cc3bb85cf3e3678 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 02:16:05 +0000 Subject: [PATCH 16/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index f7b2a20..828066a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.3", "stage": { - "dev": "v0.2.2" + "dev": "v0.2.3" }, "registered": true, "active": true From 867f4e0bf021df40f37ef7e26115ecd31c41dd98 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 15:45:04 +0100 Subject: [PATCH 17/47] change logging style --- .../src/pipeline/0_startup_cleanup.py | 10 ------- .../src/pipeline/1_prepare_data.py | 20 -------------- .../ml-pipeline/src/pipeline/2_build_model.py | 27 +------------------ .../src/pipeline/3_generate_predictions.py | 12 --------- .../src/pipeline/4_generate_metrics.py | 23 ---------------- .../ml-pipeline/src/pipeline/core/Logger.py | 2 ++ 6 files changed, 3 insertions(+), 91 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py index 0bfa37f..32e8a1b 100644 --- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py @@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: Remove the directory where artefacts are stored """ - logger.info("---------------------") logger.info(f"--- Run Clean up ---") - logger.info("---------------------") - logger.info("-------------------------") logger.info(f"--- Delete artefacts ---") - logger.info("-------------------------") artefact_directory_path = Path(artefacts_directory) @@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: logger.info(f"Removing the directory: {artefacts_directory}") shutil.rmtree(artefact_directory_path) - logger.info("-----------------------") logger.info(f"--- Delete metrics ---") - logger.info("-----------------------") metrics_directory_path = Path(metrics_directory) @@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") run_cleanup( artefacts_directory=startup_cleanup_params["artefacts"], metrics_directory=startup_cleanup_params["metrics"], ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 32daa19..ed7e057 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory from core.FeatureProcessor import feature_processor_factory from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") input_dataclient_type = prepare_data_params["input_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"] @@ -49,9 +45,7 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) -logger.info("----------------------------------") logger.info(f"--- Initiate FeatureProcessor ---") -logger.info("----------------------------------") feature_processor = feature_processor_factory( feature_process_params["feature_processor_type"] @@ -76,15 +70,11 @@ def prepare_data( :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode """ - logger.info("--------------------") logger.info("--- Loading data ---") - logger.info("--------------------") data = input_dataclient.load_data(location=data_filepath, load_config={}) - logger.info("--------------------------") logger.info("--- Feature Processing ---") - logger.info("--------------------------") data = feature_processor.feature_process( data, @@ -93,9 +83,7 @@ def prepare_data( new_feature_funcs=new_feature_funcs, ) - logger.info("----------------------") logger.info("--- Splitting data ---") - logger.info("----------------------") if train_proportion == 1: train = data @@ -108,9 +96,7 @@ def prepare_data( train = train.reset_index(drop=True) - logger.info("-----------------------") logger.info("--- Outputting data ---") - logger.info("-----------------------") output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None @@ -126,13 +112,9 @@ def prepare_data( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("---------------------------") logger.info(f"--- Prepare Data Stage ---") - logger.info("---------------------------") prepare_data( input_dataclient=input_dataclient, @@ -147,6 +129,4 @@ if __name__ == "__main__": new_feature_funcs=new_feature_funcs, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index cae5cfd..f3504a7 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory from configs.post_prediction_logic import post_prediction_logic from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"] test_filepath = prepare_data_params["output_test_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Output of previous prepare data step, will be where the data is dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(model_type) -logger.info("-------------------------") logger.info(f"--- Initiate Metrics ---") -logger.info("-------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,9 +67,8 @@ def build_model( test_data: Union[pd.DataFrame, None] = None, pipeline_mode: bool = False, ): - logger.info("--------------------------------------") + logger.info("--- Loading Data for build process ---") - logger.info("--------------------------------------") if train_data is None: if train_filepath is None: @@ -89,9 +80,7 @@ def build_model( raise ValueError(f"Need {test_filepath} if no data supplied") test_data = dataclient.load_data(location=test_filepath, load_config=None) - logger.info("----------------------") logger.info("--- Training model ---") - logger.info("----------------------") model.train_model( data=train_data.drop(columns=identifier_columns), @@ -99,32 +88,24 @@ def build_model( model_hyperparameters=model_hyperparameters, ) - logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") - logger.info("----------------------------------") fit_predictions = model.predict( data=train_data, post_prediction_logic=post_prediction_logic ) - logger.info("------------------------------") logger.info("--- Generating fit metrics ---") - logger.info("------------------------------") metrics_output = metrics.generate_metrics( target=train_data[target], predictions=pd.Series(fit_predictions), ) - logger.info("--------------------") logger.info("--- Saving model ---") - logger.info("--------------------") model.save_model(path=Path(model_save_location)) - logger.info("--------------------------") logger.info("--- Saving fit metrics ---") - logger.info("--------------------------") dataclient.save_data( obj=metrics_output, location=fit_metrics_filepath, save_config=None @@ -133,13 +114,9 @@ def build_model( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("--------------------------") logger.info(f"--- Build Model Stage ---") - logger.info("--------------------------") build_model( dataclient=dataclient, @@ -154,6 +131,4 @@ if __name__ == "__main__": fit_metrics_filepath=fit_metrics_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index 9461392..acb9e99 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -10,9 +10,7 @@ from core.Logger import logger from config import settings from generate_predictions import generate_predictions -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_column_name = generate_predictions_params["predictions_column_name"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client @@ -59,13 +53,9 @@ output_dataclient = dataclient_factory( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("----------------------------------") logger.info(f"--- Generate Predictions Stage---") - logger.info("----------------------------------") generate_predictions( input_dataclient=input_dataclient, @@ -78,6 +68,4 @@ if __name__ == "__main__": predictions_column_name=predictions_column_name, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 7b115a2..ddcd3cc 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -35,16 +33,11 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi predictions_column_name = generate_predictions_params["predictions_column_name"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] - -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Use data client for input and output, as we use dvc to cache later to the cloud dataclient_type = generate_metrics_params["dataclient_type"] @@ -53,9 +46,7 @@ dataclient = dataclient_factory( dataclient_config=client_params[dataclient_type], ) -logger.info("---------------------------") logger.info(f"--- Initiate MLMetrics ---") -logger.info("---------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,34 +66,26 @@ def generate_metrics( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------------") logger.info("--- Loading predictions ---") - logger.info("---------------------------") predictions = input_dataclient.load_data( location=predictions_output_filepath, load_config=None ) - logger.info("--------------------------") logger.info("--- Generating metrics ---") - logger.info("--------------------------") metrics_output = metrics.generate_metrics( target=test_data[target], predictions=pd.Series(predictions[predictions_column_name]), ) - logger.info("----------------------") logger.info("--- Saving metrics ---") - logger.info("----------------------") output_dataclient.save_data( obj=metrics_output, location=metrics_output_filepath, save_config=None @@ -111,13 +94,9 @@ def generate_metrics( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("------------------------------") logger.info(f"--- Generate Metrics Stage---") - logger.info("------------------------------") generate_metrics( input_dataclient=dataclient, @@ -131,6 +110,4 @@ if __name__ == "__main__": metrics_output_filepath=metrics_output_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py index a0fc231..2194063 100644 --- a/modules/ml-pipeline/src/pipeline/core/Logger.py +++ b/modules/ml-pipeline/src/pipeline/core/Logger.py @@ -22,6 +22,8 @@ def setup_logger(): # Add the stream handler to the logger logger.addHandler(stream_handler) + logger.propagate = False + return logger From 811d47b78a0a5a181053ea16164da179300debb8 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 20 Oct 2023 23:30:31 +0100 Subject: [PATCH 18/47] remove more lines --- modules/ml-pipeline/src/pipeline/generate_predictions.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 83ea103..59ce732 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -20,23 +20,17 @@ def generate_predictions( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------") logger.info("--- Loading model ---") - logger.info("---------------------") model.load_model(model_filepath) - logger.info("------------------------------") logger.info("--- Generating predictions ---") - logger.info("------------------------------") prediction_data = ( test_data.drop(columns=target) if target in test_data.columns else test_data @@ -46,9 +40,7 @@ def generate_predictions( data=prediction_data, post_prediction_logic=post_prediction_logic ) - logger.info("--------------------------") logger.info("--- Saving predictions ---") - logger.info("--------------------------") predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] From 960425e709c4c780d34d3455e89922a2ecef91c5 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 22:40:39 +0000 Subject: [PATCH 19/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 828066a..ca0686d 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.3", + "version": "v0.2.4", "stage": { "dev": "v0.2.3" }, From 3145b5d3312f64b3c79a48e37c538f3cec05f739 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Fri, 20 Oct 2023 22:41:26 +0000 Subject: [PATCH 20/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index ca0686d..745cb3b 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.4", "stage": { - "dev": "v0.2.3" + "dev": "v0.2.4" }, "registered": true, "active": true From ca37e4ee184a2a0808f473de708f6268b31b5f8d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 21 Oct 2023 04:00:13 +0100 Subject: [PATCH 21/47] final removal of dash from handler --- deployment/handlers/prediction_app.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 36a906c..ac397b9 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -69,9 +69,7 @@ def handler(event, context): storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" - logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") - logger.info("-------------------------") build_model_params = settings.build_model client_params = settings.client @@ -80,17 +78,13 @@ def handler(event, context): model = model_factory(build_model_params["model_type"]) - logger.info("----------------------------") logger.info(f"--- Initiate Input DataClient ---") - logger.info("----------------------------") input_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], ) - logger.info("----------------------------") logger.info(f"--- Initiate Output DataClient ---") - logger.info("----------------------------") output_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], From 72cf7096016796cca1c5c8d523d0a666c3ed3243 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sat, 21 Oct 2023 03:02:38 +0000 Subject: [PATCH 22/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 745cb3b..f755b77 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.4", + "version": "v0.2.5", "stage": { "dev": "v0.2.4" }, From a15bdd5ee04b3b6f9bb63600e31b2e6f9c51c70a Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sat, 21 Oct 2023 03:03:21 +0000 Subject: [PATCH 23/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index f755b77..2c3db33 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.5", "stage": { - "dev": "v0.2.4" + "dev": "v0.2.5" }, "registered": true, "active": true From cbd46489fec444bf5920d6eea1f79e05be0e0f37 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 22 Oct 2023 03:25:07 +0000 Subject: [PATCH 24/47] Remove propgate --- modules/ml-pipeline/src/pipeline/core/Logger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/ml-pipeline/src/pipeline/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py index 2194063..d2f6c61 100644 --- a/modules/ml-pipeline/src/pipeline/core/Logger.py +++ b/modules/ml-pipeline/src/pipeline/core/Logger.py @@ -21,7 +21,6 @@ def setup_logger(): # Add the stream handler to the logger logger.addHandler(stream_handler) - logger.propagate = False return logger From 499458b6993fb4070980d6f78f9a64a71599128a Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 22 Oct 2023 21:02:32 +0000 Subject: [PATCH 25/47] add time to inference to model --- .../src/pipeline/configs/build_model.yaml | 2 + modules/ml-pipeline/src/pipeline/dvc.lock | 60 ++++++++++--------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..1ebb62d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -16,3 +16,5 @@ default: time_limit: 4000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] + infer_limit: 0.05 + infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 16eb857..20dd532 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: c9f030df733e318b80d1fa91b7732f79 - size: 5132 + md5: 896d3d88a4a9f68d174efe71dc089517 + size: 4222 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: cd75be9fecff0c647792dd2db648085c.dir - size: 37056053 + md5: 6bfdb621b608648c017bf2323f7b5052.dir + size: 37048968 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 84699d208874c52accaff61c6af9bb0a - size: 5359 + md5: 7b79f280b8b0d5bc6f07669e7cc37c6a + size: 4150 - path: data/prepared_data hash: md5 - md5: cd75be9fecff0c647792dd2db648085c.dir - size: 37056053 + md5: 6bfdb621b608648c017bf2323f7b5052.dir + size: 37048968 nfiles: 2 params: configs/build_model.yaml: @@ -63,32 +63,34 @@ stages: excluded_model_types: - KNN - RF + infer_limit: 0.05 + infer_limit_batch_size: 10000 outs: - path: data/model/ hash: md5 - md5: 7a5527f779efcb1a7db068148b6bcc45.dir - size: 422448184 + md5: f2999107de7572ea5ff0f2d774fa83b8.dir + size: 424943352 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 77790bb9485c04c77125e361921c3774 - size: 225 + md5: 9537e7ebc2eb32b421a7cabd2005f00b + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: - path: 3_generate_predictions.py hash: md5 - md5: 5ef2856a5a977304f1ec01f9b4205262 - size: 3028 + md5: 0a70ad4dfe99414a75d1261c75a177b9 + size: 2464 - path: data/model hash: md5 - md5: 7a5527f779efcb1a7db068148b6bcc45.dir - size: 422448184 + md5: f2999107de7572ea5ff0f2d774fa83b8.dir + size: 424943352 nfiles: 27 - path: data/prepared_data hash: md5 - md5: cd75be9fecff0c647792dd2db648085c.dir - size: 37056053 + md5: 6bfdb621b608648c017bf2323f7b5052.dir + size: 37048968 nfiles: 2 params: configs/settings.yaml: @@ -100,25 +102,25 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir - size: 346687 + md5: f4439a56669f84bc51a9fcb4cd08353f.dir + size: 346539 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py deps: - path: 4_generate_metrics.py hash: md5 - md5: 2c9fb78955a8c19cff0a098976f81d1b - size: 4487 + md5: 4fedb86d89d528f0a6597934ba3890a0 + size: 3484 - path: data/predictions hash: md5 - md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir - size: 346687 + md5: f4439a56669f84bc51a9fcb4cd08353f.dir + size: 346539 nfiles: 1 - path: data/prepared_data hash: md5 - md5: cd75be9fecff0c647792dd2db648085c.dir - size: 37056053 + md5: 6bfdb621b608648c017bf2323f7b5052.dir + size: 37048968 nfiles: 2 params: configs/settings.yaml: @@ -128,15 +130,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 7afd04d656dc83ad6aa942d9c63f5b4e + md5: 357904cf106279be5a578e8faefa5d80 size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: - path: 0_startup_cleanup.py hash: md5 - md5: fbb7e3b1b98b517c870f3e1df3e7f695 - size: 1676 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 params: configs/settings.yaml: default.startup_cleanup.artefacts: ./data From 0f96bc55f13135279e682fbd4fe4bf821ccb2233 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 22 Oct 2023 21:05:07 +0000 Subject: [PATCH 26/47] add time to inference to model --- modules/ml-pipeline/src/pipeline/core/MLModels.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4cf8b08..4fc572a 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -149,6 +149,8 @@ class AutogluonAutoML: "time_limit", "presets", "excluded_model_types", + "infer_limit", + "infer_limit_batch_size", ] def load_model(self, path: Union[Path, str]) -> None: @@ -203,6 +205,8 @@ class AutogluonAutoML: time_limit=model_hyperparameters["time_limit"], presets=model_hyperparameters["presets"], excluded_model_types=model_hyperparameters["excluded_model_types"], + infer_limit=model_hyperparameters["infer_limit"], + infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ) def predict( From 6d3407ba0eb569d8739fd4f49c4d38aed3987f7f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sun, 22 Oct 2023 21:06:37 +0000 Subject: [PATCH 27/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 2c3db33..731b818 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.5", + "version": "v0.2.6", "stage": { "dev": "v0.2.5" }, From 7d26ec42198ebd89ab728c46b65d95ea0e595f52 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Sun, 22 Oct 2023 21:07:17 +0000 Subject: [PATCH 28/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 731b818..6cf2958 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.2.6", "stage": { - "dev": "v0.2.5" + "dev": "v0.2.6" }, "registered": true, "active": true From acdac3d8dcfc84ce70093a515cd56c91e4eb8cb4 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 10:28:56 +0000 Subject: [PATCH 29/47] test new data --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 10 +-- .../pipeline/configs/post_prediction_logic.py | 4 +- .../src/pipeline/configs/settings.yaml | 8 +-- modules/ml-pipeline/src/pipeline/dvc.lock | 64 +++++++++---------- 5 files changed, 44 insertions(+), 44 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1ebb62d..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 4943f6b..026191c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -9,11 +9,11 @@ Business Logic dict + functions def remove_starting_columns(df): keep_column_index = [ - False if col_name.endswith("_STARTING") else True + False if col_name.endswith("_starting") else True for col_name in list(df.columns) ] keep_columns = df.columns[keep_column_index].to_list() - keep_columns.append("SAP_STARTING") + keep_columns.append("sap_starting") df = df[keep_columns] return df @@ -22,7 +22,7 @@ def remove_floor_height_ending(df): # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] # shows bottom 0.5 percentile is 1.665 # So keep anything above this - df = df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True) + df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True) print("we in here") return df @@ -30,13 +30,13 @@ def remove_floor_height_ending(df): def remove_minimum_habitable_room_size(df): # Need minimum of 6.5m per habitable room df = df[ - df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5 + df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5 ].reset_index(drop=True) return df def keep_flats(df): - df = df[df["PROPERTY_TYPE"] == "Flat"] + df = df[df["property_type"] == "Flat"] return df diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index b85d3a4..c1b8ebd 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -12,9 +12,9 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] + replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value + predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 9333c46..d5ffe8d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: SAP_ENDING - identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + target: sap_ending + identifier_columns: ["uprn"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_change", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 20dd532..5e7bfe5 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,17 +10,17 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - HEAT_DEMAND_CHANGE - - CARBON_CHANGE - - RDSAP_CHANGE - - HEAT_DEMAND_ENDING - - CARBON_ENDING + - heat_demand_change + - carbon_change + - rdsap_change + - heat_demand_change + - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 7b79f280b8b0d5bc6f07669e7cc37c6a - size: 4150 + md5: b824822475c222521516493e68eef9c5 + size: 4149 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir + size: 334981921 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 9537e7ebc2eb32b421a7cabd2005f00b - size: 223 + md5: 89ba30b943c911e24b13b4370db12d18 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir + size: 334981921 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir + size: 362994 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir + size: 362994 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir + size: 40122363 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 357904cf106279be5a578e8faefa5d80 - size: 224 + md5: fa40071006901c4335b5dbd567c9d9b3 + size: 226 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From c576657805e47b85d3885a87189799bb78f179ab Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 10:35:17 +0000 Subject: [PATCH 30/47] comment out old dataset --- modules/ml-pipeline/src/pipeline/configs/settings.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index d5ffe8d..cc5623d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,6 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet From daa4c28be646effc55cbdcba7bf04ab6a2ec0865 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 10:44:23 +0000 Subject: [PATCH 31/47] remove unneeded dvc gto files --- modules/ml-pipeline/.dvc/.gitignore | 3 --- modules/ml-pipeline/.dvc/config | 2 -- modules/ml-pipeline/.dvcignore | 3 --- modules/ml-pipeline/.gto | 2 -- 4 files changed, 10 deletions(-) delete mode 100644 modules/ml-pipeline/.dvc/.gitignore delete mode 100644 modules/ml-pipeline/.dvc/config delete mode 100644 modules/ml-pipeline/.dvcignore delete mode 100644 modules/ml-pipeline/.gto diff --git a/modules/ml-pipeline/.dvc/.gitignore b/modules/ml-pipeline/.dvc/.gitignore deleted file mode 100644 index 528f30c..0000000 --- a/modules/ml-pipeline/.dvc/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/config.local -/tmp -/cache diff --git a/modules/ml-pipeline/.dvc/config b/modules/ml-pipeline/.dvc/config deleted file mode 100644 index 03ccfbc..0000000 --- a/modules/ml-pipeline/.dvc/config +++ /dev/null @@ -1,2 +0,0 @@ -['remote "myremote"'] - url = /tmp/dvcstore diff --git a/modules/ml-pipeline/.dvcignore b/modules/ml-pipeline/.dvcignore deleted file mode 100644 index 5197305..0000000 --- a/modules/ml-pipeline/.dvcignore +++ /dev/null @@ -1,3 +0,0 @@ -# Add patterns of files dvc should ignore, which could improve -# the performance. Learn more at -# https://dvc.org/doc/user-guide/dvcignore diff --git a/modules/ml-pipeline/.gto b/modules/ml-pipeline/.gto deleted file mode 100644 index c44c86e..0000000 --- a/modules/ml-pipeline/.gto +++ /dev/null @@ -1,2 +0,0 @@ -# .gto config file -stages: [dev, stage, prod] # list of allowed Stages From 717a1a64fef9a27632f163f2edc17c94b6101a30 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 10:47:35 +0000 Subject: [PATCH 32/47] update version control packages --- .../requirements/version_control/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index 91cb005..a2b9531 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 -pyOpenSSL==23.2.0 +dvc==3.36.0 +dvc-s3==3.0.1 +gto==1.6.1 +pyOpenSSL==23.3.0 From 50c369720e73c8c5ec3643b58bb674a26257dc05 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 22 Dec 2023 11:16:45 +0000 Subject: [PATCH 33/47] corrected model --- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++--------- modules/ml-pipeline/src/pipeline/eda.py | 43 +++++++++++++------ 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index cc5623d..918abd6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -34,7 +34,7 @@ default: subsample_seed: 0 target: sap_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_change", "carbon_ending"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 5e7bfe5..82c8608 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -13,7 +13,7 @@ stages: - heat_demand_change - carbon_change - rdsap_change - - heat_demand_change + - heat_demand_ending - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/build_model.yaml: @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir - size: 334981921 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 89ba30b943c911e24b13b4370db12d18 - size: 225 + md5: 5cd6b92af1b1df753e20e9ea33629c4d + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir - size: 334981921 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 nfiles: 24 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir - size: 362994 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir - size: 362994 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: fa40071006901c4335b5dbd567c9d9b3 - size: 226 + md5: 3900cc1697d6d7308728b3d5b3025f85 + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 6c29308..e1d33a6 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -190,28 +190,35 @@ prediction_analysis_params = settings.prediction_analysis model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) dataclient_type = prediction_analysis_params["dataclient_type"] -dataclient = dataclient_factory( - dataclient_type=dataclient_type, - dataclient_config=client_params[dataclient_type], -) +# dataclient_type = 'aws-s3' +# dataclient = dataclient_factory( +# dataclient_type=dataclient_type, +# dataclient_config=client_params[dataclient_type], +# ) +# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet") target = feature_process_params["feature_processor_config"]["target"] predictions_column_name = generate_predictions_params["predictions_column_name"] output_test_filepath = prepare_data_params["output_test_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] -test_df = dataclient.load_data(output_test_filepath) -predictions = dataclient.load_data(predictions_output_filepath) +# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet") + + +local_dataclient = dataclient_factory( + dataclient_type="local", + dataclient_config=client_params["local"], +) +test_df = local_dataclient.load_data(output_test_filepath) +predictions = local_dataclient.load_data(predictions_output_filepath) mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) -cosine_similarity_df = mix_df[ - mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"]) -] +cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] from sklearn.metrics.pairwise import cosine_similarity -row_index = 20695 +row_index = 0 from sklearn.preprocessing import LabelEncoder @@ -224,8 +231,18 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) - -similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5) -similar_index = similar_df.index +similar_index = ( + cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index +) check_df = mix_df.loc[similar_index] + +columns_to_check = [ + "LOW_ENERGY_LIGHTING_ENDING", + "walls_thermal_transmittance_ENDING", + "floor_thermal_transmittance_ENDING", + "roof_thermal_transmittance_ENDING", + "roof_insulation_thickness_ENDING", +] + +cosine_similarity_df = mix_df[columns_to_check] From f472d3c5faa32ceec64e1b7aba9b54f88b2f13e3 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 16 Jan 2024 17:38:07 +0000 Subject: [PATCH 34/47] Update Registry --- MODEL_REGISTRY.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 6cf2958..9f51851 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.2.6", + "version": "v0.3.0", "stage": { "dev": "v0.2.6" }, @@ -16,17 +16,17 @@ "active": true }, "heat": { - "version": "v0.0.1", + "version": "v0.2.0", "stage": { - "dev": "v0.0.1" + "dev": "v0.2.0" }, "registered": true, "active": true }, "carbon": { - "version": "v0.0.1", + "version": "v0.2.0", "stage": { - "dev": "v0.0.1" + "dev": "v0.2.0" }, "registered": true, "active": true From 77888bb83961b45928efb781d2c69362768ca025 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 16 Jan 2024 17:38:50 +0000 Subject: [PATCH 35/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9f51851..695997d 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.3.0", "stage": { - "dev": "v0.2.6" + "dev": "v0.3.0" }, "registered": true, "active": true From 0e31d67970629724254ddefb4e5f0df09df1893d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 17 Jan 2024 23:07:22 +0000 Subject: [PATCH 36/47] run sap model with new data --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 4c72487..9c97ef0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 918abd6..bcc8802 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 82c8608..39314dc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir - size: 39303409 + md5: d047420c632d91203199b9a93b6b0134.dir + size: 39476967 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir - size: 39303409 + md5: d047420c632d91203199b9a93b6b0134.dir + size: 39476967 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 6265dafedf579905c31c676e81c2a9c7.dir - size: 344212462 + md5: 0ad794c5498acfcc79893a371b29be62.dir + size: 372199625 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 5cd6b92af1b1df753e20e9ea33629c4d - size: 224 + md5: 534fa836074bdd9795b5879f0c479681 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 6265dafedf579905c31c676e81c2a9c7.dir - size: 344212462 + md5: 0ad794c5498acfcc79893a371b29be62.dir + size: 372199625 nfiles: 24 - path: data/prepared_data hash: md5 - md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir - size: 39303409 + md5: d047420c632d91203199b9a93b6b0134.dir + size: 39476967 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: b130faf5117b06897b2deed97f5868ee.dir - size: 367038 + md5: 25ac7334855d5eacc5fd9e2879900f33.dir + size: 367393 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: b130faf5117b06897b2deed97f5868ee.dir - size: 367038 + md5: 25ac7334855d5eacc5fd9e2879900f33.dir + size: 367393 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir - size: 39303409 + md5: d047420c632d91203199b9a93b6b0134.dir + size: 39476967 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3900cc1697d6d7308728b3d5b3025f85 - size: 224 + md5: a6fa095b4cc44e6dd7828708f8cca18b + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 47f8447223da0c0a0934ba09068af1c1443e6157 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 18 Jan 2024 10:36:52 +0000 Subject: [PATCH 37/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 695997d..74e4205 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.3.0", + "version": "v0.4.0", "stage": { "dev": "v0.3.0" }, From 6d6b82400647fd2d455210a35dbae10933378e64 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 18 Jan 2024 10:37:52 +0000 Subject: [PATCH 38/47] Update Registry --- MODEL_REGISTRY.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 74e4205..d82f94b 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,13 +10,13 @@ "sap": { "version": "v0.4.0", "stage": { - "dev": "v0.3.0" + "dev": "v0.4.0" }, "registered": true, "active": true }, "heat": { - "version": "v0.2.0", + "version": "v0.3.0", "stage": { "dev": "v0.2.0" }, @@ -24,7 +24,7 @@ "active": true }, "carbon": { - "version": "v0.2.0", + "version": "v0.3.0", "stage": { "dev": "v0.2.0" }, From efb84723bb7bf6e742c408854fdaecf918f8e133 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 23 Jan 2024 19:27:53 +0000 Subject: [PATCH 39/47] test model with 1 percent o change records --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 3 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index bcc8802..4ba4779 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 39314dc..19173d2 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 400 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 + md5: c83b4cf0c51bd433bfb38307e978ed39.dir + size: 344485548 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 534fa836074bdd9795b5879f0c479681 - size: 225 + md5: 3105f9cf71b69b5b0f5675b2c169273c + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 + md5: c83b4cf0c51bd433bfb38307e978ed39.dir + size: 344485548 nfiles: 24 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: f914cf31400e228ee6e1386155b68e7c.dir + size: 369783 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: f914cf31400e228ee6e1386155b68e7c.dir + size: 369783 nfiles: 1 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: a6fa095b4cc44e6dd7828708f8cca18b - size: 222 + md5: c23b7f0628473bf42eef126167e8928e + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From ca2a3d362352cb901cd85d00e9850df454ddbd41 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 23 Jan 2024 21:46:24 +0000 Subject: [PATCH 40/47] longer run model --- .../src/pipeline/configs/build_model.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 4c72487..354b2ca 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 800 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 19173d2..4d669b5 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 800 presets: medium_quality excluded_model_types: - KNN @@ -68,12 +68,12 @@ stages: outs: - path: data/model/ hash: md5 - md5: c83b4cf0c51bd433bfb38307e978ed39.dir - size: 344485548 - nfiles: 24 + md5: 7d062363a9de5a659df638de1541d9ee.dir + size: 383515358 + nfiles: 26 - path: metrics/fit_metrics.json hash: md5 - md5: 3105f9cf71b69b5b0f5675b2c169273c + md5: 06c50da7ca7fdb631896790b76a5e19d size: 223 generate_predictions: cmd: python 3_generate_predictions.py @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: c83b4cf0c51bd433bfb38307e978ed39.dir - size: 344485548 - nfiles: 24 + md5: 7d062363a9de5a659df638de1541d9ee.dir + size: 383515358 + nfiles: 26 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: f914cf31400e228ee6e1386155b68e7c.dir - size: 369783 + md5: d6c97ad17146677fe705ccd7bcbb4873.dir + size: 369475 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: f914cf31400e228ee6e1386155b68e7c.dir - size: 369783 + md5: d6c97ad17146677fe705ccd7bcbb4873.dir + size: 369475 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: c23b7f0628473bf42eef126167e8928e - size: 224 + md5: 6bb037ff29c7119576c8818b395d32f6 + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From d356fbfed0b22be77da90a9d00d67e240c18d015 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 24 Jan 2024 10:29:56 +0000 Subject: [PATCH 41/47] test model with all permutation and zero records --- .../configs/feature_processor_logic.py | 6 +++ .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++---------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 026191c..2d14dc4 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -40,6 +40,11 @@ def keep_flats(df): return df +def keep_non_zero_rdsap(df): + df = df[df["rdsap_change"] != 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -49,6 +54,7 @@ def keep_flats(df): # return df business_logic = { + "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_floor_height_ending": remove_floor_height_ending diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4ba4779..ba05d38 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -23,7 +23,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 4d669b5..dde6078 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/build_model.yaml: @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7d062363a9de5a659df638de1541d9ee.dir - size: 383515358 + md5: 7708d5705a2db2d621dae73338a641ae.dir + size: 393761847 nfiles: 26 - path: metrics/fit_metrics.json hash: md5 - md5: 06c50da7ca7fdb631896790b76a5e19d - size: 223 + md5: f7c3a5d39644d41cf60872baad7797b2 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7d062363a9de5a659df638de1541d9ee.dir - size: 383515358 + md5: 7708d5705a2db2d621dae73338a641ae.dir + size: 393761847 nfiles: 26 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: d6c97ad17146677fe705ccd7bcbb4873.dir - size: 369475 + md5: dade2114bb2be2769cf0648b8046f705.dir + size: 369115 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: d6c97ad17146677fe705ccd7bcbb4873.dir - size: 369475 + md5: dade2114bb2be2769cf0648b8046f705.dir + size: 369115 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 6bb037ff29c7119576c8818b395d32f6 - size: 225 + md5: 3315792b9f7e6f55d59a39db03ee7093 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 353b62bc77f507c7b5278a62fd4b4419a788634b Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 29 Jan 2024 09:03:36 +0000 Subject: [PATCH 42/47] test model with all data, using interal cross validation, all dataset with permuation and 0, test data is just a random 10 percent sample of the training data --- .../src/pipeline/1_prepare_data.py | 3 +- .../ml-pipeline/src/pipeline/2_build_model.py | 16 +++++ .../src/pipeline/configs/build_model.yaml | 5 +- .../configs/feature_processor_logic.py | 2 +- .../pipeline/configs/post_prediction_logic.py | 6 +- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 68 +++++++++++-------- modules/ml-pipeline/src/pipeline/dvc.yaml | 1 + 8 files changed, 67 insertions(+), 36 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 354b2ca..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 800 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 2d14dc4..103168d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -54,7 +54,7 @@ def keep_non_zero_rdsap(df): # return df business_logic = { - "keep_non_zero_rdsap": keep_non_zero_rdsap, + # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_floor_height_ending": remove_floor_height_ending diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index c1b8ebd..643231a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -5,14 +5,16 @@ import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 + data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 ) -> pd.Series: series_name = predictions.name predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] + replace_index = ( + predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] + ) predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ba05d38..4327e64 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 0.9 + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index dde6078..f15978f 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 1793a35e71751d3c84f9affc67ecb9a8 + size: 4296 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -25,24 +25,24 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 800 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: ede187e9d0bffdef054f573f3c2bd222.dir + size: 3578590 + nfiles: 1 - path: data/model/ hash: md5 - md5: 7708d5705a2db2d621dae73338a641ae.dir - size: 393761847 - nfiles: 26 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: f7c3a5d39644d41cf60872baad7797b2 - size: 222 + md5: c45b84f12971a0156e4f3d85d3e725f5 + size: 218 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7708d5705a2db2d621dae73338a641ae.dir - size: 393761847 - nfiles: 26 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: dade2114bb2be2769cf0648b8046f705.dir - size: 369115 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: dade2114bb2be2769cf0648b8046f705.dir - size: 369115 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3315792b9f7e6f55d59a39db03ee7093 - size: 222 + md5: 033efa4d4044b6b6fc92dd37194727fa + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: From 7f59305e208d44d86d3681035ad1a8f2f5938d4f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 29 Jan 2024 12:37:45 +0000 Subject: [PATCH 43/47] Update Registry --- MODEL_REGISTRY.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index d82f94b..5359665 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.4.0", + "version": "v0.5.0", "stage": { "dev": "v0.4.0" }, @@ -18,7 +18,7 @@ "heat": { "version": "v0.3.0", "stage": { - "dev": "v0.2.0" + "dev": "v0.3.0" }, "registered": true, "active": true @@ -26,7 +26,7 @@ "carbon": { "version": "v0.3.0", "stage": { - "dev": "v0.2.0" + "dev": "v0.3.0" }, "registered": true, "active": true From 96eb3904e2989da2e1de6c4c359a7ef097037c04 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 29 Jan 2024 12:38:33 +0000 Subject: [PATCH 44/47] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 5359665..1bcceec 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.5.0", "stage": { - "dev": "v0.4.0" + "dev": "v0.5.0" }, "registered": true, "active": true From c0dc934be6a61e39c6aae6956c396208eb4c66d1 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 23:10:36 +0000 Subject: [PATCH 45/47] run carbon model with new data --- MODEL_REGISTRY.md | 4 +- modules/ml-pipeline/src/README.md | 2 +- modules/ml-pipeline/src/pipeline/.gitignore | 1 + .../src/pipeline/configs/build_model.yaml | 1 + .../configs/feature_processor_logic.py | 54 +++++++---- .../pipeline/configs/post_prediction_logic.py | 15 +-- .../src/pipeline/configs/settings.yaml | 15 ++- .../ml-pipeline/src/pipeline/core/MLModels.py | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 91 +++++++++++-------- .../predictions/requirements-dev.txt | 4 +- .../requirements/predictions/requirements.txt | 4 +- .../training/requirements-dev.txt | 7 +- .../requirements/training/requirements.txt | 4 +- 13 files changed, 119 insertions(+), 87 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 1bcceec..820b75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,9 +8,9 @@ "active": true }, "sap": { - "version": "v0.5.0", + "version": "v0.4.0", "stage": { - "dev": "v0.5.0" + "dev": "v0.4.0" }, "registered": true, "active": true diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md index d7afc6a..db1b8b4 100644 --- a/modules/ml-pipeline/src/README.md +++ b/modules/ml-pipeline/src/README.md @@ -1,3 +1,3 @@ -# The generic reproducible ML-pipeline +# The generic reproducible ML-pipeline! Pipeline required to build a model to produce an output, that gets hashed via DVC diff --git a/modules/ml-pipeline/src/pipeline/.gitignore b/modules/ml-pipeline/src/pipeline/.gitignore index bf035d2..ce8309f 100644 --- a/modules/ml-pipeline/src/pipeline/.gitignore +++ b/modules/ml-pipeline/src/pipeline/.gitignore @@ -1,3 +1,4 @@ # Ignore dynaconf secret files .secrets.* +example.py diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..be5ec5c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -19,3 +19,4 @@ default: excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 103168d..bcc53e5 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,30 +18,44 @@ def remove_starting_columns(df): return df -def remove_floor_height_ending(df): - # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] - # shows bottom 0.5 percentile is 1.665 - # So keep anything above this - df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True) - print("we in here") +def keep_negative_heat_change(df): + df = df[df["heat_demand_change"] < 0] return df -def remove_minimum_habitable_room_size(df): - # Need minimum of 6.5m per habitable room - df = df[ - df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5 - ].reset_index(drop=True) +def keep_non_negative_carbon_ending(df): + df = df[df["carbon_ending"] > 0] return df -def keep_flats(df): - df = df[df["property_type"] == "Flat"] +def keep_negative_carbon_change(df): + df = df[df["carbon_change"] < 0] return df -def keep_non_zero_rdsap(df): - df = df[df["rdsap_change"] != 0] +# TODO: Move to ETL pipeline +def remove_unreasonable_habitable_rooms(df): + """ + Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2 + """ + minimum_room_size_index = ( + df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5 + ) + df = df[minimum_room_size_index] + return df + + +def remove_top_1_percent_heat_demand(df): + # threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] + threshold_value = 860 + df = df[df["heat_demand_starting"] < threshold_value] + return df + + +def remove_top_1_percent_carbon(df): + # threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%'] + threshold_value = 18 + df = df[df["carbon_starting"] < threshold_value] return df @@ -54,10 +68,12 @@ def keep_non_zero_rdsap(df): # return df business_logic = { - # "keep_non_zero_rdsap": keep_non_zero_rdsap, - # "keep_flats": keep_flats, - # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, - # "remove_floor_height_ending": remove_floor_height_ending + "remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, + "keep_negative_heat_change": keep_negative_heat_change, + "keep_negative_carbon_change": keep_negative_carbon_change, + "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, + "remove_top_1_percent_carbon": remove_top_1_percent_carbon, + "keep_non_negative_carbon_ending": keep_non_negative_carbon_ending, # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 643231a..2ca8890 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -1,23 +1,24 @@ """ After predictions, we may want to apply some post processing to the predictions """ + import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 + data: pd.DataFrame, + predictions: pd.Series, ) -> pd.Series: series_name = predictions.name predictions.name = "predictions" + predictions = predictions.astype(data["carbon_starting"].dtype) predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = ( - predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] - ) - predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "sap_starting"] + minimum_value - ) + replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"] + predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[ + replace_index, "carbon_starting" + ] predictions_new = predictions_df["predictions"] predictions_new.name = series_name diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4327e64..fecdcb0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,12 +18,7 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -33,9 +28,13 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: sap_ending + target: carbon_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] + # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"] + drop_columns: [ + "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending", + 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', + 'number_habitable_rooms', 'number_heated_rooms'] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4fc572a..257261d 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel: models = { "SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(), - "AutogluonAutoML": AutogluonAutoML() + "AutogluonAutoML": AutogluonAutoML(), # ADD OTHER MODELS HERE } @@ -151,6 +151,7 @@ class AutogluonAutoML: "excluded_model_types", "infer_limit", "infer_limit_batch_size", + "ag_args_ensemble", ] def load_model(self, path: Union[Path, str]) -> None: @@ -207,6 +208,7 @@ class AutogluonAutoML: excluded_model_types=model_hyperparameters["excluded_model_types"], infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], + ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index f15978f..81224d8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -1,12 +1,23 @@ schema: '2.0' stages: + startup_cleanup: + cmd: python 0_startup_cleanup.py + deps: + - path: 0_startup_cleanup.py + hash: md5 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 + params: + configs/settings.yaml: + default.startup_cleanup.artefacts: ./data + default.startup_cleanup.metrics: ./metrics prepare_data: cmd: python 1_prepare_data.py deps: - path: 1_prepare_data.py hash: md5 - md5: 1793a35e71751d3c84f9affc67ecb9a8 - size: 4296 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -14,13 +25,22 @@ stages: - carbon_change - rdsap_change - heat_demand_ending - - carbon_ending + - sap_ending + - days_to_starting + - days_to_ending + - number_habitable_rooms_starting + - number_habitable_rooms_ending + - number_heated_rooms_starting + - number_heated_rooms_ending + - number_habitable_rooms + - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: sap_ending + default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + default.prepare_data.data_filepath: + s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/build_model.yaml: @@ -70,21 +90,23 @@ stages: - XT infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: + num_folds_parallel: 2 outs: - path: data/fit_predictions/ hash: md5 - md5: ede187e9d0bffdef054f573f3c2bd222.dir - size: 3578590 + md5: 19d033f5abfa9b064c3e52815e607ced.dir + size: 3927492 nfiles: 1 - path: data/model/ hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f159d40353b01ffdcf1b1b490c019f1f.dir + size: 787748148 + nfiles: 32 - path: metrics/fit_metrics.json hash: md5 - md5: c45b84f12971a0156e4f3d85d3e725f5 - size: 218 + md5: e69d56ab9d82f23f2aa66001bd9bebbc + size: 229 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -94,13 +116,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f159d40353b01ffdcf1b1b490c019f1f.dir + size: 787748148 + nfiles: 32 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/settings.yaml: @@ -112,8 +134,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 50d0c76fc56c6290babeff1c84750316.dir + size: 651956 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -124,13 +146,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 50d0c76fc56c6290babeff1c84750316.dir + size: 651956 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/settings.yaml: @@ -140,16 +162,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 033efa4d4044b6b6fc92dd37194727fa - size: 225 - startup_cleanup: - cmd: python 0_startup_cleanup.py - deps: - - path: 0_startup_cleanup.py - hash: md5 - md5: b1b12f6b6393fbf8b83d23684df0a3d4 - size: 1220 - params: - configs/settings.yaml: - default.startup_cleanup.artefacts: ./data - default.startup_cleanup.metrics: ./metrics + md5: 542b982d6aa9fe0fdb89611e4299cb1e + size: 228 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 0d259fb..258981d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index afad9be..2ab48e9 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index d8c5907..2024d84 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,9 +1,10 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 +ray==2.6.3 dynaconf==3.2.0 -alibi==0.9.4 +alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index bbdc2fa..84452a3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 From 78bf0a490dbf9bc281cf773f0d2317ea381ac786 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 23:43:07 +0000 Subject: [PATCH 46/47] use 0.9 training data --- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 50 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index fecdcb0..66f7089 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -19,7 +19,7 @@ default: input_dataclient_type: aws-s3 output_dataclient_type: local data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet - train_proportion: 1 + train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 81224d8..ef40a2d 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -45,12 +45,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 1 + default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: 35d7daa8144434e188ba3b1da4bcf328.dir - size: 33946500 + md5: 824541f44e6538d2ef10e9d754c79743.dir + size: 36691842 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -61,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 35d7daa8144434e188ba3b1da4bcf328.dir - size: 33946500 + md5: 824541f44e6538d2ef10e9d754c79743.dir + size: 36691842 nfiles: 2 params: configs/build_model.yaml: @@ -95,18 +95,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 19d033f5abfa9b064c3e52815e607ced.dir - size: 3927492 + md5: 5a3091120d3497fa00b994d91bc7e5eb.dir + size: 3664806 nfiles: 1 - path: data/model/ hash: md5 - md5: f159d40353b01ffdcf1b1b490c019f1f.dir - size: 787748148 - nfiles: 32 + md5: 074da8dcfa515b9f3d082b21c7d76616.dir + size: 721558897 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: e69d56ab9d82f23f2aa66001bd9bebbc - size: 229 + md5: 728a49dcef5a98182325df455f929a33 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -116,13 +116,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f159d40353b01ffdcf1b1b490c019f1f.dir - size: 787748148 - nfiles: 32 + md5: 074da8dcfa515b9f3d082b21c7d76616.dir + size: 721558897 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: 35d7daa8144434e188ba3b1da4bcf328.dir - size: 33946500 + md5: 824541f44e6538d2ef10e9d754c79743.dir + size: 36691842 nfiles: 2 params: configs/settings.yaml: @@ -134,8 +134,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 50d0c76fc56c6290babeff1c84750316.dir - size: 651956 + md5: 680f51234d214d4cab9e6a064c75fc5d.dir + size: 499546 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -146,13 +146,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 50d0c76fc56c6290babeff1c84750316.dir - size: 651956 + md5: 680f51234d214d4cab9e6a064c75fc5d.dir + size: 499546 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 35d7daa8144434e188ba3b1da4bcf328.dir - size: 33946500 + md5: 824541f44e6538d2ef10e9d754c79743.dir + size: 36691842 nfiles: 2 params: configs/settings.yaml: @@ -162,5 +162,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 542b982d6aa9fe0fdb89611e4299cb1e - size: 228 + md5: 67b7ab30a4b0839d20bc6eb0c84e4dd1 + size: 226 From bb3af26c3f77ea8d13e7679db2a366c56760d501 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 28 Mar 2024 16:06:43 +0000 Subject: [PATCH 47/47] add binary to prediction docker, change requiremnets --- deployment/Dockerfile.prediction.lambda | 2 +- .../src/pipeline/requirements/predictions/requirements-dev.txt | 2 +- .../src/pipeline/requirements/predictions/requirements.txt | 2 +- .../src/pipeline/requirements/training/requirements-dev.txt | 2 +- .../src/pipeline/requirements/training/requirements.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index a2520ba..f8000bf 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} # Install necessary build tools - required to test locally -RUN yum install -y gcc python3-devel +RUN yum install -y gcc python3-devel gcc-c++ # Install python packages COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 258981d..734419a 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -2,6 +2,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 2ab48e9..937b000 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -2,6 +2,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 2024d84..fe06a4d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -3,7 +3,7 @@ boto3==1.28.17 pandas==2.1.4 autogluon==1.0.0 ray==2.6.3 -dynaconf==3.2.0 +dynaconf==3.2.1 alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index 84452a3..a5bccd3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 pandas==2.1.4 autogluon==1.0.0 -dynaconf==3.2.0 +dynaconf==3.2.1