diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 0bef7d6..ff83c27 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -1,7 +1,8 @@ export PYENV_ROOT=$(HOME)/.pyenv export PATH := $(PYENV_ROOT)/bin:$(PATH) -PYTHON_VERSION ?= 3.10.12 -CONDA_ENV=dev_env_pipeline +PYTHON_VERSION ?= 3.12.12 +CONDA_ENV=dev_env_pipeline_1 +CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate .PHONY: init init: dev-conda @@ -12,11 +13,13 @@ dev-conda: # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -v -n ${CONDA_ENV} pip install --upgrade pip - conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -v -n ${CONDA_ENV} pre-commit install - conda run -v -n ${CONDA_ENV} pip install ipykernel + ${CONDA_ACTIVATE} ${CONDA_ENV} && \ + which pip && \ + pip install --upgrade pip && \ + pip install -r src/pipeline/requirements/training/requirements-dev.txt && \ + pip install -r src/pipeline/requirements/version_control/requirements.txt && \ + pre-commit install && \ + pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 6debe32..faab4a9 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -99,6 +99,12 @@ def generate_scenario_predictions( ] ) + # TEMPORARY FIX: ADD is_post_sap10_starting and is_post_sap10_ending if not present + if "is_post_sap10_starting" not in scenario_data.columns: + scenario_data["is_post_sap10_starting"] = False + if "is_post_sap10_ending" not in scenario_data.columns: + scenario_data["is_post_sap10_ending"] = False + logger.info("--- Loading Model ---") model.load_model(model_filepath) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index a36bfbc..38c0910 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -17,6 +17,7 @@ default: time_limit: 1800 presets: medium_quality excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] - infer_limit: 0.05 + infer_limit: 0.0005 infer_limit_batch_size: 10000 + "fit_strategy": "parallel" ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 34e03fb..28d5cd9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-08-21-58-03/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 257261d..437c69f 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -1,4 +1,4 @@ -"""" +""" " Implementations of MLModels, all of which will have four methods to: - Load model - Save Model @@ -152,6 +152,7 @@ class AutogluonAutoML: "infer_limit", "infer_limit_batch_size", "ag_args_ensemble", + "fit_strategy", ] def load_model(self, path: Union[Path, str]) -> None: @@ -209,6 +210,7 @@ class AutogluonAutoML: infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], + fit_strategy=model_hyperparameters["fit_strategy"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 16b7b07..5502f03 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -49,18 +49,20 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet + default.prepare_data.data_filepath: + s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local - default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet - default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet + default.prepare_data.output_test_filepath: + ./data/prepared_data/test.parquet + default.prepare_data.output_train_filepath: + ./data/prepared_data/train.parquet default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: 7cc090d55cb296ce5d360d655c06e861.dir - size: 46183314 + md5: 5c56787d9e6450e26a78c15700e104c7.dir + size: 45746089 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -71,8 +73,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 7cc090d55cb296ce5d360d655c06e861.dir - size: 46183314 + md5: 5c56787d9e6450e26a78c15700e104c7.dir + size: 45746089 nfiles: 2 params: configs/build_model.yaml: @@ -97,25 +99,26 @@ stages: - NN_TORCH - KNN - XT - infer_limit: 0.05 + infer_limit: 0.0005 infer_limit_batch_size: 10000 + fit_strategy: parallel ag_args_ensemble: num_folds_parallel: 2 outs: - path: data/fit_predictions/ hash: md5 - md5: a6196bf08607c43ba6bc637611bb32b0.dir - size: 3491001 + md5: 4fa77e3f129d2e6f9ef7222c44978c26.dir + size: 3474669 nfiles: 1 - path: data/model/ hash: md5 - md5: b225d7b01356cecefb3794a9a3cd19b5.dir - size: 790430916 - nfiles: 36 + md5: e27b9216bc7455f8245d5b49f27b2707.dir + size: 753575768 + nfiles: 30 - path: metrics/fit_metrics.json hash: md5 - md5: 33421d5e3a2d569dbe6d4486c568a2b7 - size: 225 + md5: 426a162284ca9e29c043eb1d72e547e6 + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -125,26 +128,28 @@ stages: size: 2464 - path: data/model hash: md5 - md5: b225d7b01356cecefb3794a9a3cd19b5.dir - size: 790430916 - nfiles: 36 + md5: e27b9216bc7455f8245d5b49f27b2707.dir + size: 753575768 + nfiles: 30 - path: data/prepared_data hash: md5 - md5: 7cc090d55cb296ce5d360d655c06e861.dir - size: 46183314 + md5: 5c56787d9e6450e26a78c15700e104c7.dir + size: 45746089 nfiles: 2 params: configs/settings.yaml: default.generate_predictions.input_dataclient_type: local default.generate_predictions.output_dataclient_type: local default.generate_predictions.predictions_column_name: predictions - default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet - default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet + default.generate_predictions.predictions_output_filepath: + ./data/predictions/predictions.parquet + default.generate_predictions.test_data_filepath: + ./data/prepared_data/test.parquet outs: - path: data/predictions/ hash: md5 - md5: bd6821db9abc95af8c74aa20effd7f37.dir - size: 487194 + md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir + size: 484524 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -155,13 +160,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: bd6821db9abc95af8c74aa20effd7f37.dir - size: 487194 + md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir + size: 484524 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 7cc090d55cb296ce5d360d655c06e861.dir - size: 46183314 + md5: 5c56787d9e6450e26a78c15700e104c7.dir + size: 45746089 nfiles: 2 params: configs/settings.yaml: @@ -171,15 +176,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9c2a7802554f5c2f750b2242c6003026 + md5: b9ae6d24424f2d5389697577e9076b91 size: 223 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: 40506749fefd926d47c60ff5b16db307 - size: 5337 + md5: 872b0c762ce1c8933fcbc5f54d5d4b5d + size: 5658 params: configs/scenarios.yaml: default.scenarios: @@ -192,9 +197,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: c01524a0cc2e61151c106d7049af3bf9 + md5: 32d78c20d91fedf2f5dbb4162f323e25 size: 356 - path: metrics/scenario_table.md hash: md5 - md5: a995c8ef7ffbe2ca254441150817ae21 + md5: 52cbd19566151b0c300f9673252704d2 size: 872 diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index 6427764..eaea051 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,4 +1,2 @@ -/fit_metrics.json -/metrics.json /scenario_table.md /scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/metrics/fit_metrics.json b/modules/ml-pipeline/src/pipeline/metrics/fit_metrics.json new file mode 100644 index 0000000..d0311f6 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/metrics/fit_metrics.json @@ -0,0 +1 @@ +{"mean_absolute_error": 1.2158480882644653, "median_absolute_error": 0.8539352416992188, "mean_squared_error": 3.116994857788086, "mean_absolute_percentage_error": 0.01968802697956562, "symmetric_mape": 0.019615056540152054} diff --git a/modules/ml-pipeline/src/pipeline/metrics/metrics.json b/modules/ml-pipeline/src/pipeline/metrics/metrics.json new file mode 100644 index 0000000..b824a27 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/metrics/metrics.json @@ -0,0 +1 @@ +{"mean_absolute_error": 2.121211290359497, "median_absolute_error": 1.3063621520996094, "mean_squared_error": 11.15064525604248, "mean_absolute_percentage_error": 0.03622421622276306, "symmetric_mape": 0.035541225671999285} diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 4dc4c36..a70ecf8 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 -pyarrow==13.0.0 -pre-commit==3.3.3 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.2.3 +autogluon.tabular[all]==1.3 +dynaconf==3.2.12 +pyarrow==22.0.0 +pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 35bdb05..b9aa74c 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 -pyarrow==13.0.0 -PyYAML==6.0.1 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.2.3 +autogluon.tabular[all]==1.3 +dynaconf==3.2.12 +pyarrow==22.0.0 +PyYAML==6.0.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 93a042e..1e59b59 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,10 +1,10 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -ray==2.6.3 -dynaconf==3.2.1 -alibi==0.9.5 -shap==0.42.1 -pyarrow==13.0.0 -pre-commit==3.3.3 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.2.3 +autogluon.tabular[all]==1.3 +ray==2.44.1 +dynaconf==3.2.12 +alibi==0.5.5 +shap==0.49.1 +pyarrow==22.0.0 +pre-commit==4.3.0 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index edeb764..84455e8 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ -boto3==1.28.41 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 +boto3==1.40.61 +pandas==2.2.3 +autogluon.tabular[all]==1.3 +dynaconf==3.2.12