diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 09f9f15..3e6be85 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -20,7 +20,8 @@ dev-conda: uv pip install -r src/pipeline/requirements/training/requirements-dev.txt && \ uv pip install -r src/pipeline/requirements/version_control/requirements.txt && \ pre-commit install && \ - uv pip install ipykernel + uv pip install ipykernel && \ + conda install llvm-openmp -y echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 6e8845c..50122ee 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -17,8 +17,20 @@ default: time_limit: 1800 presets: medium_quality excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] - infer_limit: 0.0005 + infer_limit: 0.001 infer_limit_batch_size: 10000 - fit_strategy: "parallel" + fit_strategy: "sequential" ag_args_ensemble: {'num_folds_parallel': 2} num_gpus: auto + hyperparameters: + { + 'NN_TORCH': [{}], + 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': 'auto'}}], + # 'GBM': [{}], + 'CAT': [{}], + 'XGB': [{}], + 'FASTAI': [{}], + 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], + 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], + 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}], + } diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 35f79c4..dabe154 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -154,6 +154,7 @@ class AutogluonAutoML: "ag_args_ensemble", "fit_strategy", "num_gpus", + "hyperparameters", ] def load_model(self, path: Union[Path, str]) -> None: @@ -215,6 +216,7 @@ class AutogluonAutoML: ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], fit_strategy=model_hyperparameters["fit_strategy"], num_gpus=model_hyperparameters["num_gpus"], + hyperparameters=model_hyperparameters["hyperparameters"].to_dict(), ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 7ed27d7..cffd1b3 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -61,8 +61,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 7b780ea01da913d9d8cadcff73fbde0f.dir - size: 46092230 + md5: ba409a8c79863ddc407786b7aa7a053a.dir + size: 46113237 nfiles: 3 build_model: cmd: python 2_build_model.py @@ -73,8 +73,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 7b780ea01da913d9d8cadcff73fbde0f.dir - size: 46092230 + md5: ba409a8c79863ddc407786b7aa7a053a.dir + size: 46113237 nfiles: 3 params: configs/build_model.yaml: @@ -99,27 +99,94 @@ stages: - NN_TORCH - KNN - XT - infer_limit: 0.0005 + infer_limit: 0.001 infer_limit_batch_size: 10000 - fit_strategy: parallel + fit_strategy: sequential ag_args_ensemble: num_folds_parallel: 2 num_gpus: auto + hyperparameters: + NN_TORCH: + - {} + GBM: + - extra_trees: true + ag_args: + name_suffix: XT + - {} + - learning_rate: 0.03 + num_leaves: 128 + feature_fraction: 0.9 + min_data_in_leaf: 3 + ag_args: + name_suffix: Large + priority: 0 + hyperparameter_tune_kwargs: auto + CAT: + - {} + XGB: + - {} + FASTAI: + - {} + RF: + - criterion: gini + ag_args: + name_suffix: Gini + problem_types: + - binary + - multiclass + - criterion: entropy + ag_args: + name_suffix: Entr + problem_types: + - binary + - multiclass + - criterion: squared_error + ag_args: + name_suffix: MSE + problem_types: + - regression + - quantile + XT: + - criterion: gini + ag_args: + name_suffix: Gini + problem_types: + - binary + - multiclass + - criterion: entropy + ag_args: + name_suffix: Entr + problem_types: + - binary + - multiclass + - criterion: squared_error + ag_args: + name_suffix: MSE + problem_types: + - regression + - quantile + KNN: + - weights: uniform + ag_args: + name_suffix: Unif + - weights: distance + ag_args: + name_suffix: Dist outs: - path: data/fit_predictions/ hash: md5 - md5: 01328a1cc5a1ff35e701a3c44902afc6.dir - size: 3474659 + md5: a9361ab31ff8fc08c3e5e3b96cec06d4.dir + size: 3474690 nfiles: 1 - path: data/model/ hash: md5 - md5: 70f076a248524dfce60412f83969ae63.dir - size: 760254863 - nfiles: 33 + md5: 19019e558886b1acd6d29442a47243d0.dir + size: 761937021 + nfiles: 34 - path: metrics/fit_metrics.json hash: md5 - md5: 4726c52b2f27650ab1bbf97b5bf61e54 - size: 224 + md5: 3af168aedf1f81a22024bb8c815f5d12 + size: 221 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -129,13 +196,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 70f076a248524dfce60412f83969ae63.dir - size: 760254863 - nfiles: 33 + md5: 19019e558886b1acd6d29442a47243d0.dir + size: 761937021 + nfiles: 34 - path: data/prepared_data hash: md5 - md5: 7b780ea01da913d9d8cadcff73fbde0f.dir - size: 46092230 + md5: ba409a8c79863ddc407786b7aa7a053a.dir + size: 46113237 nfiles: 3 params: configs/settings.yaml: @@ -149,8 +216,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 312f9106eb18d34df75124f0536f0603.dir - size: 484470 + md5: a9f32d70a4817df8092e52c5513a445f.dir + size: 484694 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -161,13 +228,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 312f9106eb18d34df75124f0536f0603.dir - size: 484470 + md5: a9f32d70a4817df8092e52c5513a445f.dir + size: 484694 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 7b780ea01da913d9d8cadcff73fbde0f.dir - size: 46092230 + md5: ba409a8c79863ddc407786b7aa7a053a.dir + size: 46113237 nfiles: 3 params: configs/settings.yaml: @@ -177,8 +244,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 661388682aa1ca888b256e4667211379 - size: 222 + md5: 736ef69da7edb94577139ae9ede5ac0d + size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: @@ -198,9 +265,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 88ebca8dccf907692675301ffe06b10d + md5: adcc78833e7a0824ecb10ad78a646ea8 size: 356 - path: metrics/scenario_table.md hash: md5 - md5: 3ec419e883b812b254b331f055999cc9 + md5: 35e704d0499e943c4110f66f1482d2ec size: 872 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index d4eb1fd..ba7aebb 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.5.2 boto3==1.40.61 -pandas==2.2.3 -autogluon.tabular[all]==1.3 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 dynaconf==3.2.12 pyarrow==20.0.0 pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 138a4ef..ccd84ab 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.5.2 boto3==1.40.61 -pandas==2.2.3 -autogluon.tabular[all]==1.3 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 dynaconf==3.2.12 pyarrow==20.0.0 PyYAML==6.0.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 6fe98f7..eaef2a7 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,10 +1,10 @@ joblib==1.5.2 boto3==1.40.61 -pandas==2.2.3 -autogluon.tabular[all]==1.3 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 ray==2.44.1 dynaconf==3.2.12 -alibi==0.5.5 +# alibi shap==0.49.1 pyarrow==20.0.0 pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index 84455e8..1d7704e 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.40.61 -pandas==2.2.3 -autogluon.tabular[all]==1.3 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 dynaconf==3.2.12