From bdc177baa929afd5e4475506be5080f3fd6d289d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 2 Nov 2025 22:49:36 +0000 Subject: [PATCH] roll back to autogluon 1.3.0 due to stability issue --- .../src/pipeline/configs/build_model.yaml | 3 +- .../ml-pipeline/src/pipeline/core/MLModels.py | 4 ++ modules/ml-pipeline/src/pipeline/dvc.lock | 51 ++++++++++--------- .../predictions/requirements-dev.txt | 2 +- .../requirements/predictions/requirements.txt | 2 +- .../training/requirements-dev.txt | 4 +- .../requirements/training/requirements.txt | 2 +- 7 files changed, 37 insertions(+), 31 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 38c0910..6e8845c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -19,5 +19,6 @@ default: excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.0005 infer_limit_batch_size: 10000 - "fit_strategy": "parallel" + fit_strategy: "parallel" ag_args_ensemble: {'num_folds_parallel': 2} + num_gpus: auto diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 437c69f..35f79c4 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -153,6 +153,7 @@ class AutogluonAutoML: "infer_limit_batch_size", "ag_args_ensemble", "fit_strategy", + "num_gpus", ] def load_model(self, path: Union[Path, str]) -> None: @@ -184,6 +185,8 @@ class AutogluonAutoML: """ Method to train a model """ + # Force Parallel Model fitting + os.environ["AG_FORCE_PARALLEL"] = "True" validate_dict_keys( keys_1=list(model_hyperparameters.keys()), @@ -211,6 +214,7 @@ class AutogluonAutoML: infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], 
fit_strategy=model_hyperparameters["fit_strategy"], + num_gpus=model_hyperparameters["num_gpus"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index f05f185..7ed27d7 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -61,8 +61,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 2feba8772c240b507eb900934efcb8ca.dir - size: 46064555 + md5: 7b780ea01da913d9d8cadcff73fbde0f.dir + size: 46092230 nfiles: 3 build_model: cmd: python 2_build_model.py @@ -73,8 +73,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 2feba8772c240b507eb900934efcb8ca.dir - size: 46064555 + md5: 7b780ea01da913d9d8cadcff73fbde0f.dir + size: 46092230 nfiles: 3 params: configs/build_model.yaml: @@ -104,20 +104,21 @@ stages: fit_strategy: parallel ag_args_ensemble: num_folds_parallel: 2 + num_gpus: auto outs: - path: data/fit_predictions/ hash: md5 - md5: 29036f4f42b1fdcab7f9e40a87f38a8c.dir - size: 3474783 + md5: 01328a1cc5a1ff35e701a3c44902afc6.dir + size: 3474659 nfiles: 1 - path: data/model/ hash: md5 - md5: 77cab231e3d51bbebbae5a7af310c18a.dir - size: 791390619 - nfiles: 34 + md5: 70f076a248524dfce60412f83969ae63.dir + size: 760254863 + nfiles: 33 - path: metrics/fit_metrics.json hash: md5 - md5: 4f39064fb6b31c7c879299621bcea28d + md5: 4726c52b2f27650ab1bbf97b5bf61e54 size: 224 generate_predictions: cmd: python 3_generate_predictions.py @@ -128,13 +129,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 77cab231e3d51bbebbae5a7af310c18a.dir - size: 791390619 - nfiles: 34 + md5: 70f076a248524dfce60412f83969ae63.dir + size: 760254863 + nfiles: 33 - path: data/prepared_data hash: md5 - md5: 2feba8772c240b507eb900934efcb8ca.dir - size: 46064555 + md5: 7b780ea01da913d9d8cadcff73fbde0f.dir + size: 46092230 nfiles: 3 params: configs/settings.yaml: @@ -148,8 +149,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 
8dfa69b48586da6b0ef33a6fbedb7c4a.dir - size: 484314 + md5: 312f9106eb18d34df75124f0536f0603.dir + size: 484470 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -160,13 +161,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 8dfa69b48586da6b0ef33a6fbedb7c4a.dir - size: 484314 + md5: 312f9106eb18d34df75124f0536f0603.dir + size: 484470 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 2feba8772c240b507eb900934efcb8ca.dir - size: 46064555 + md5: 7b780ea01da913d9d8cadcff73fbde0f.dir + size: 46092230 nfiles: 3 params: configs/settings.yaml: @@ -176,8 +177,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: bf980dad2dc5b97651546b0b755419ae - size: 223 + md5: 661388682aa1ca888b256e4667211379 + size: 222 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: @@ -197,9 +198,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 05e2cce8e61d5005398659e9f3465cd6 + md5: 88ebca8dccf907692675301ffe06b10d size: 356 - path: metrics/scenario_table.md hash: md5 - md5: 92446d2f3836c6f790d06e3b268b05f3 + md5: 3ec419e883b812b254b331f055999cc9 size: 872 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index bf8b14c..d4eb1fd 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.5.2 boto3==1.40.61 pandas==2.2.3 -autogluon.tabular[all]==1.4 +autogluon.tabular[all]==1.3 dynaconf==3.2.12 pyarrow==20.0.0 pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 0df33db..138a4ef 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ 
b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.5.2 boto3==1.40.61 pandas==2.2.3 -autogluon.tabular[all]==1.4 +autogluon.tabular[all]==1.3 dynaconf==3.2.12 pyarrow==20.0.0 PyYAML==6.0.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index a503ecf..6fe98f7 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,10 +1,10 @@ joblib==1.5.2 boto3==1.40.61 pandas==2.2.3 -autogluon.tabular[all]==1.4 +autogluon.tabular[all]==1.3 ray==2.44.1 dynaconf==3.2.12 alibi==0.5.5 shap==0.49.1 -pyarrow +pyarrow==20.0.0 pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index 6e85ded..84455e8 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.40.61 pandas==2.2.3 -autogluon.tabular[all]==1.4 +autogluon.tabular[all]==1.3 dynaconf==3.2.12