diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..6fbf094 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,8 +14,9 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 - presets: medium_quality + time_limit: 1800 + presets: good_quality excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 19b0a5b..4757d91 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 1 + train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4fc572a..257261d 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel: models = { "SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(), - "AutogluonAutoML": AutogluonAutoML() + "AutogluonAutoML": AutogluonAutoML(), # ADD OTHER MODELS HERE } @@ -151,6 +151,7 @@ class AutogluonAutoML: "excluded_model_types", "infer_limit", "infer_limit_batch_size", + "ag_args_ensemble", ] def load_model(self, path: Union[Path, str]) -> None: @@ -207,6 +208,7 @@ class AutogluonAutoML: excluded_model_types=model_hyperparameters["excluded_model_types"], infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], + ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 826e654..530a3c8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -27,12 +27,12 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 1 + default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -43,8 +43,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/build_model.yaml: @@ -61,8 +61,8 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 - presets: medium_quality + time_limit: 1800 + presets: good_quality excluded_model_types: - RF - FASTAI @@ -72,21 +72,23 @@ stages: - XT infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: + num_folds_parallel: 2 outs: - path: data/fit_predictions/ hash: md5 - md5: e0a11ac6e4adf69d6180c0217c639a0e.dir - size: 3680908 + md5: 346b6611afbf2070e038bf945249a86e.dir + size: 3384302 nfiles: 1 - path: data/model/ hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 8e37f21728cd092660bafa8c32dc109f.dir + size: 423840922 + nfiles: 118 - path: metrics/fit_metrics.json hash: md5 - md5: 0ed5b1141bbb8bc3156e7c056b29f3cd - size: 225 + md5: d63e1a8d31503055835ac35149554e41 + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -96,13 +98,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir - size: 805896324 - nfiles: 31 + md5: 8e37f21728cd092660bafa8c32dc109f.dir + size: 423840922 + nfiles: 118 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/settings.yaml: @@ -114,8 +116,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: d148baf508140353d62c16d6ab0fb6b7.dir + size: 469224 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -126,13 +128,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 38707d16ae1e2330cc03f524db9cdd60.dir - size: 648730 + md5: d148baf508140353d62c16d6ab0fb6b7.dir + size: 469224 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3c77fa10cd1cd503eb4d2540394629f6.dir - size: 42626894 + md5: 3d1144848fce4ce50f6abfaec5235552.dir + size: 46392840 nfiles: 2 params: configs/settings.yaml: @@ -142,8 +144,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 145e7ac84ab4a4407b23695a632b4d91 - size: 226 + md5: 196232f94b563ac525cf65ee5cc6d639 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 0d259fb..258981d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index afad9be..2ab48e9 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index d8c5907..2024d84 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,9 +1,10 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 +ray==2.6.3 dynaconf==3.2.0 -alibi==0.9.4 +alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index bbdc2fa..84452a3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0