From f2cc32f4b4b2a8c5c7e7d461744e700815b89def Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 12 Oct 2023 08:38:55 +0000 Subject: [PATCH 1/6] using good model 4000s --- .../src/pipeline/configs/build_model.yaml | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 48 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 7e409bf..ced4159 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 - presets: medium_quality + time_limit: 4000 + presets: good_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 2056834..689bb64 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 5fd3c01804ee2994ee77fc501d178be4.dir - size: 30137355 + md5: 91407be844d5cfe428bf9d09e980fc0e.dir + size: 30051355 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 5fd3c01804ee2994ee77fc501d178be4.dir - size: 30137355 + md5: 91407be844d5cfe428bf9d09e980fc0e.dir + size: 30051355 nfiles: 2 params: configs/build_model.yaml: @@ -58,21 +58,21 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 - presets: medium_quality + time_limit: 4000 + presets: good_quality excluded_model_types: - KNN - RF outs: - path: data/model/ hash: md5 - md5: 4b49c12395a645e35e50a9de8840f08d.dir - size: 282024140 - nfiles: 24 + md5: 229de034422caa37a4e24366b572bc29.dir + size: 317288619 + nfiles: 138 - path: metrics/fit_metrics.json hash: md5 - md5: a6d139fa59f5ddf75023bb7d3364f6d2 - size: 225 + md5: 50005e7cbac69d6c888cdd0f929b9240 + size: 226 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 4b49c12395a645e35e50a9de8840f08d.dir - size: 282024140 - nfiles: 24 + md5: 229de034422caa37a4e24366b572bc29.dir + size: 317288619 + nfiles: 138 - path: data/prepared_data hash: md5 - md5: 5fd3c01804ee2994ee77fc501d178be4.dir - size: 30137355 + md5: 91407be844d5cfe428bf9d09e980fc0e.dir + size: 30051355 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 8f724261b3d17bf87067e91a1ff99077.dir - size: 441423 + md5: 40d5bf4d265018e8a181287846e4f892.dir + size: 441331 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 8f724261b3d17bf87067e91a1ff99077.dir - size: 441423 + md5: 40d5bf4d265018e8a181287846e4f892.dir + size: 441331 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 5fd3c01804ee2994ee77fc501d178be4.dir - size: 30137355 + md5: 91407be844d5cfe428bf9d09e980fc0e.dir + size: 30051355 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 38787835f838f65c6cc75654843eb311 - size: 223 + md5: d6217ba41c60e4ec452670faf07321ab + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 7589977cda6530f2c6837b6a68e9035122449ca2 Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:19:22 +0100 Subject: [PATCH 2/6] Update Makefile --- modules/ml-pipeline/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 5c5d563..6ccb4c4 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -10,15 +10,15 @@ init: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" - conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y + conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n $CONDA_ENV pip install --upgrade pip - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n $CONDA_ENV pre-commit install - conda run -vvvv -n $CONDA_ENV pip install ipykernel + conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -vvvv -n ${CONDA_ENV} pre-commit install + conda run -vvvv -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" - echo "conda activate $CONDA_ENV" + echo "conda activate ${CONDA_ENV}" .PHONY: dev-pyenv From 96153f82489c4aa09008262589840b98c770fccf Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 17 Oct 2023 03:08:01 +0100 Subject: [PATCH 3/6] Update Makefile --- modules/ml-pipeline/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 6ccb4c4..0bef7d6 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -9,14 +9,14 @@ init: dev-conda .PHONY: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" - # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" + # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n ${CONDA_ENV} pre-commit install - conda run -vvvv -n ${CONDA_ENV} pip install ipykernel + conda run -v -n ${CONDA_ENV} pip install --upgrade pip + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -v -n ${CONDA_ENV} pre-commit install + conda run -v -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" From e9417ca73d0cdb6f73132fea8231dc9a36e85177 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 27 Nov 2023 15:17:01 +0000 Subject: [PATCH 4/6] Added additional workflows for new models --- .github/workflows/Deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 48375c3..6e34d36 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -2,7 +2,7 @@ name: Sap Change Model Deploy on: push: - branches: [ sap-dev, sap-prod ] + branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod] jobs: deploy: From 217fb3dca8786b0cd43e7eba9b5c651ad20dfb26 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 27 Nov 2023 18:52:47 +0000 Subject: [PATCH 5/6] add inference speed check --- deployment/handlers/prediction_app.py | 6 -- .../src/pipeline/0_startup_cleanup.py | 10 --- .../src/pipeline/1_prepare_data.py | 20 ------ .../ml-pipeline/src/pipeline/2_build_model.py | 26 ------- .../src/pipeline/3_generate_predictions.py | 12 ---- .../src/pipeline/4_generate_metrics.py | 22 ------ .../src/pipeline/configs/build_model.yaml | 6 +- .../src/pipeline/configs/settings.yaml | 2 +- .../ml-pipeline/src/pipeline/core/Logger.py | 1 + .../ml-pipeline/src/pipeline/core/MLModels.py | 4 ++ modules/ml-pipeline/src/pipeline/dvc.lock | 68 ++++++++++--------- .../src/pipeline/generate_predictions.py | 8 --- 12 files changed, 45 insertions(+), 140 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 36a906c..ac397b9 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -69,9 +69,7 @@ def handler(event, context): storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" - logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") - logger.info("-------------------------") build_model_params = settings.build_model client_params = settings.client @@ -80,17 +78,13 @@ def handler(event, context): model = model_factory(build_model_params["model_type"]) - logger.info("----------------------------") logger.info(f"--- Initiate Input DataClient ---") - logger.info("----------------------------") input_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], ) - logger.info("----------------------------") logger.info(f"--- Initiate Output DataClient ---") - logger.info("----------------------------") output_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], diff --git a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py index 0bfa37f..32e8a1b 100644 --- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py @@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: Remove the directory where artefacts are stored """ - logger.info("---------------------") logger.info(f"--- Run Clean up ---") - logger.info("---------------------") - logger.info("-------------------------") logger.info(f"--- Delete artefacts ---") - logger.info("-------------------------") artefact_directory_path = Path(artefacts_directory) @@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: logger.info(f"Removing the directory: {artefacts_directory}") shutil.rmtree(artefact_directory_path) - logger.info("-----------------------") logger.info(f"--- Delete metrics ---") - logger.info("-----------------------") metrics_directory_path = Path(metrics_directory) @@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") run_cleanup( artefacts_directory=startup_cleanup_params["artefacts"], metrics_directory=startup_cleanup_params["metrics"], ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 32daa19..ed7e057 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory from core.FeatureProcessor import feature_processor_factory from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") input_dataclient_type = prepare_data_params["input_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"] @@ -49,9 +45,7 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) -logger.info("----------------------------------") logger.info(f"--- Initiate FeatureProcessor ---") -logger.info("----------------------------------") feature_processor = feature_processor_factory( feature_process_params["feature_processor_type"] @@ -76,15 +70,11 @@ def prepare_data( :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode """ - logger.info("--------------------") logger.info("--- Loading data ---") - logger.info("--------------------") data = input_dataclient.load_data(location=data_filepath, load_config={}) - logger.info("--------------------------") logger.info("--- Feature Processing ---") - logger.info("--------------------------") data = feature_processor.feature_process( data, @@ -93,9 +83,7 @@ def prepare_data( new_feature_funcs=new_feature_funcs, ) - logger.info("----------------------") logger.info("--- Splitting data ---") - logger.info("----------------------") if train_proportion == 1: train = data @@ -108,9 +96,7 @@ def prepare_data( train = train.reset_index(drop=True) - logger.info("-----------------------") logger.info("--- Outputting data ---") - logger.info("-----------------------") output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None @@ -126,13 +112,9 @@ def prepare_data( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("---------------------------") logger.info(f"--- Prepare Data Stage ---") - logger.info("---------------------------") prepare_data( input_dataclient=input_dataclient, @@ -147,6 +129,4 @@ if __name__ == "__main__": new_feature_funcs=new_feature_funcs, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index cae5cfd..7ca4951 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory from configs.post_prediction_logic import post_prediction_logic from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"] test_filepath = prepare_data_params["output_test_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Output of previous prepare data step, will be where the data is dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(model_type) -logger.info("-------------------------") logger.info(f"--- Initiate Metrics ---") -logger.info("-------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,9 +67,7 @@ def build_model( test_data: Union[pd.DataFrame, None] = None, pipeline_mode: bool = False, ): - logger.info("--------------------------------------") logger.info("--- Loading Data for build process ---") - logger.info("--------------------------------------") if train_data is None: if train_filepath is None: @@ -89,9 +79,7 @@ def build_model( raise ValueError(f"Need {test_filepath} if no data supplied") test_data = dataclient.load_data(location=test_filepath, load_config=None) - logger.info("----------------------") logger.info("--- Training model ---") - logger.info("----------------------") model.train_model( data=train_data.drop(columns=identifier_columns), @@ -99,32 +87,24 @@ def build_model( model_hyperparameters=model_hyperparameters, ) - logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") - logger.info("----------------------------------") fit_predictions = model.predict( data=train_data, post_prediction_logic=post_prediction_logic ) - logger.info("------------------------------") logger.info("--- Generating fit metrics ---") - logger.info("------------------------------") metrics_output = metrics.generate_metrics( target=train_data[target], predictions=pd.Series(fit_predictions), ) - logger.info("--------------------") logger.info("--- Saving model ---") - logger.info("--------------------") model.save_model(path=Path(model_save_location)) - logger.info("--------------------------") logger.info("--- Saving fit metrics ---") - logger.info("--------------------------") dataclient.save_data( obj=metrics_output, location=fit_metrics_filepath, save_config=None @@ -133,13 +113,9 @@ def build_model( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("--------------------------") logger.info(f"--- Build Model Stage ---") - logger.info("--------------------------") build_model( dataclient=dataclient, @@ -154,6 +130,4 @@ if __name__ == "__main__": fit_metrics_filepath=fit_metrics_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index 9461392..acb9e99 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -10,9 +10,7 @@ from core.Logger import logger from config import settings from generate_predictions import generate_predictions -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_column_name = generate_predictions_params["predictions_column_name"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client @@ -59,13 +53,9 @@ output_dataclient = dataclient_factory( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("----------------------------------") logger.info(f"--- Generate Predictions Stage---") - logger.info("----------------------------------") generate_predictions( input_dataclient=input_dataclient, @@ -78,6 +68,4 @@ if __name__ == "__main__": predictions_column_name=predictions_column_name, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 7b115a2..937c5be 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -36,15 +34,11 @@ predictions_column_name = generate_predictions_params["predictions_column_name"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Use data client for input and output, as we use dvc to cache later to the cloud dataclient_type = generate_metrics_params["dataclient_type"] @@ -53,9 +47,7 @@ dataclient = dataclient_factory( dataclient_config=client_params[dataclient_type], ) -logger.info("---------------------------") logger.info(f"--- Initiate MLMetrics ---") -logger.info("---------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,34 +67,26 @@ def generate_metrics( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------------") logger.info("--- Loading predictions ---") - logger.info("---------------------------") predictions = input_dataclient.load_data( location=predictions_output_filepath, load_config=None ) - logger.info("--------------------------") logger.info("--- Generating metrics ---") - logger.info("--------------------------") metrics_output = metrics.generate_metrics( target=test_data[target], predictions=pd.Series(predictions[predictions_column_name]), ) - logger.info("----------------------") logger.info("--- Saving metrics ---") - logger.info("----------------------") output_dataclient.save_data( obj=metrics_output, location=metrics_output_filepath, save_config=None @@ -111,13 +95,9 @@ def generate_metrics( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("------------------------------") logger.info(f"--- Generate Metrics Stage---") - logger.info("------------------------------") generate_metrics( input_dataclient=dataclient, @@ -131,6 +111,4 @@ if __name__ == "__main__": metrics_output_filepath=metrics_output_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index ced4159..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 - presets: good_quality + time_limit: 400 + presets: medium_quality excluded_model_types: ['KNN', 'RF'] + infer_limit: 0.05 + infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4f3ebce..5514406 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py index a0fc231..d2f6c61 100644 --- a/modules/ml-pipeline/src/pipeline/core/Logger.py +++ b/modules/ml-pipeline/src/pipeline/core/Logger.py @@ -21,6 +21,7 @@ def setup_logger(): # Add the stream handler to the logger logger.addHandler(stream_handler) + logger.propagate = False return logger diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4cf8b08..4fc572a 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -149,6 +149,8 @@ class AutogluonAutoML: "time_limit", "presets", "excluded_model_types", + "infer_limit", + "infer_limit_batch_size", ] def load_model(self, path: Union[Path, str]) -> None: @@ -203,6 +205,8 @@ class AutogluonAutoML: time_limit=model_hyperparameters["time_limit"], presets=model_hyperparameters["presets"], excluded_model_types=model_hyperparameters["excluded_model_types"], + infer_limit=model_hyperparameters["infer_limit"], + infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 689bb64..627af99 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: c9f030df733e318b80d1fa91b7732f79 - size: 5132 + md5: 896d3d88a4a9f68d174efe71dc089517 + size: 4222 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: CARBON_ENDING default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 91407be844d5cfe428bf9d09e980fc0e.dir - size: 30051355 + md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir + size: 32943109 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 84699d208874c52accaff61c6af9bb0a - size: 5359 + md5: b824822475c222521516493e68eef9c5 + size: 4149 - path: data/prepared_data hash: md5 - md5: 91407be844d5cfe428bf9d09e980fc0e.dir - size: 30051355 + md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir + size: 32943109 nfiles: 2 params: configs/build_model.yaml: @@ -58,37 +58,39 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 - presets: good_quality + time_limit: 400 + presets: medium_quality excluded_model_types: - KNN - RF + infer_limit: 0.05 + infer_limit_batch_size: 10000 outs: - path: data/model/ hash: md5 - md5: 229de034422caa37a4e24366b572bc29.dir - size: 317288619 - nfiles: 138 + md5: dee1a60e6a9f4695272da8127196f714.dir + size: 326732699 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 50005e7cbac69d6c888cdd0f929b9240 + md5: 1fefa99c7bc50d09c31bf175d5b9ee9c size: 226 generate_predictions: cmd: python 3_generate_predictions.py deps: - path: 3_generate_predictions.py hash: md5 - md5: 5ef2856a5a977304f1ec01f9b4205262 - size: 3028 + md5: 0a70ad4dfe99414a75d1261c75a177b9 + size: 2464 - path: data/model hash: md5 - md5: 229de034422caa37a4e24366b572bc29.dir - size: 317288619 - nfiles: 138 + md5: dee1a60e6a9f4695272da8127196f714.dir + size: 326732699 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: 91407be844d5cfe428bf9d09e980fc0e.dir - size: 30051355 + md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir + size: 32943109 nfiles: 2 params: configs/settings.yaml: @@ -100,25 +102,25 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 40d5bf4d265018e8a181287846e4f892.dir - size: 441331 + md5: d2da3b713811952b66e2c5f8c95f5407.dir + size: 410646 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py deps: - path: 4_generate_metrics.py hash: md5 - md5: 2c9fb78955a8c19cff0a098976f81d1b - size: 4487 + md5: d09a80dd55f1f69e2a832b1991b3c406 + size: 3485 - path: data/predictions hash: md5 - md5: 40d5bf4d265018e8a181287846e4f892.dir - size: 441331 + md5: d2da3b713811952b66e2c5f8c95f5407.dir + size: 410646 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 91407be844d5cfe428bf9d09e980fc0e.dir - size: 30051355 + md5: 73c1f7be21be8358a73c4ab5f9ec8e39.dir + size: 32943109 nfiles: 2 params: configs/settings.yaml: @@ -128,15 +130,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: d6217ba41c60e4ec452670faf07321ab + md5: 4ed2edc06b4dad3c094a2d1be374a5de size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: - path: 0_startup_cleanup.py hash: md5 - md5: fbb7e3b1b98b517c870f3e1df3e7f695 - size: 1676 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 params: configs/settings.yaml: default.startup_cleanup.artefacts: ./data diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 83ea103..59ce732 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -20,23 +20,17 @@ def generate_predictions( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------") logger.info("--- Loading model ---") - logger.info("---------------------") model.load_model(model_filepath) - logger.info("------------------------------") logger.info("--- Generating predictions ---") - logger.info("------------------------------") prediction_data = ( test_data.drop(columns=target) if target in test_data.columns else test_data @@ -46,9 +40,7 @@ def generate_predictions( data=prediction_data, post_prediction_logic=post_prediction_logic ) - logger.info("--------------------------") logger.info("--- Saving predictions ---") - logger.info("--------------------------") predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] From f29d6af6a28a7816f00a31bf3600cb0a22b0e1e3 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 27 Nov 2023 19:13:23 +0000 Subject: [PATCH 6/6] change readme --- modules/ml-pipeline/src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md index d7afc6a..db1b8b4 100644 --- a/modules/ml-pipeline/src/README.md +++ b/modules/ml-pipeline/src/README.md @@ -1,3 +1,3 @@ -# The generic reproducible ML-pipeline +# The generic reproducible ML-pipeline! Pipeline required to build a model to produce an output, that gets hashed via DVC