diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index a46b68d..5c5d563 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -1,9 +1,25 @@ export PYENV_ROOT=$(HOME)/.pyenv export PATH := $(PYENV_ROOT)/bin:$(PATH) PYTHON_VERSION ?= 3.10.12 +CONDA_ENV=dev_env_pipeline .PHONY: init -init: dev-pyenv +init: dev-conda + +.PHONY: dev-conda +dev-conda: + # conda deactivate || echo "Not in conda environment" + # conda remove --name $(CONDA_ENV) --all -y || echo "No environment created previously" + conda create --name $(CONDA_ENV) python=$(PYTHON_VERSION) -y + conda init bash + conda run -vvvv -n $(CONDA_ENV) pip install --upgrade pip + conda run -vvvv -n $(CONDA_ENV) pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -vvvv -n $(CONDA_ENV) pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -vvvv -n $(CONDA_ENV) pre-commit install + conda run -vvvv -n $(CONDA_ENV) pip install ipykernel + echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" + echo "conda activate $(CONDA_ENV)" + .PHONY: dev-pyenv dev-pyenv: diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 75ae2be..5e870b8 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -11,6 +11,6 @@ AutogluonAutoML: output_filepath: ./data/model/autogluonmodel/ problem_type: regression eval_metric: mean_absolute_error - time_limit: 400 - presets: good_quality + time_limit: 60 + presets: medium_quality excluded_model_types: ['KNN'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 91a4815..180d3a9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -17,3
+17,4 @@ def SAP_ENDING(df): new_feature_funcs = {"SAP_ENDING": SAP_ENDING} +# new_feature_funcs = {} diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 501dc10..5f143c3 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: prepare_data.py hash: md5 - md5: 934d774e67f38e440b621ce71152f5f6 - size: 5031 + md5: 2648d7d407dca857a1d20a11a88d3d98 + size: 5116 params: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet @@ -15,8 +15,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 3767eec56906f5ac724a3f07433645ef.dir - size: 13442342 + md5: c183712d22ab739e0be016724f44ee1c.dir + size: 12203729 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 5134 - path: data/prepared_data hash: md5 - md5: 3767eec56906f5ac724a3f07433645ef.dir - size: 13442342 + md5: c183712d22ab739e0be016724f44ee1c.dir + size: 12203729 nfiles: 2 params: configs/build_model.yaml: @@ -36,8 +36,8 @@ stages: output_filepath: ./data/model/autogluonmodel/ problem_type: regression eval_metric: mean_absolute_error - time_limit: 400 - presets: good_quality + time_limit: 60 + presets: medium_quality excluded_model_types: - KNN SKLearnLinearRegression: @@ -49,25 +49,25 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir - size: 118227750 - nfiles: 71 + md5: cb03448b572cb167bf281ee8d43dccd9.dir + size: 99423757 + nfiles: 14 - path: metrics/fit_metrics.json hash: md5 - md5: e1c9a16617804f48e8ffac7cec6575ca - size: 185 + md5: 48d9cc86c22c1ac0da8903a32a7d10c3 + size: 183 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir - size: 118227750 - nfiles: 71 + md5: cb03448b572cb167bf281ee8d43dccd9.dir + size: 99423757 + nfiles: 14 - path: data/prepared_data hash: md5 - md5: 
3767eec56906f5ac724a3f07433645ef.dir - size: 13442342 + md5: c183712d22ab739e0be016724f44ee1c.dir + size: 12203729 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -83,21 +83,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir - size: 536774 + md5: 3d5002f0eecd2374a0ef2fd6f711503e.dir + size: 383878 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir - size: 536774 + md5: 3d5002f0eecd2374a0ef2fd6f711503e.dir + size: 383878 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3767eec56906f5ac724a3f07433645ef.dir - size: 13442342 + md5: c183712d22ab739e0be016724f44ee1c.dir + size: 12203729 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -106,14 +106,12 @@ stages: params: configs/generate_metrics.yaml: dataclient_type: local - input_datahandler_type: parquet metrics_output_filepath: ./metrics/metrics.json metrics_type: Regression - output_datahandler_type: json outs: - path: metrics/metrics.json hash: md5 - md5: 852ef4cf2ca5e7f89d70420a9df7a596 + md5: 08a81d2e5cecf360043498526bc98314 size: 183 startup_cleanup: cmd: python startup_cleanup.py diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index e34d5af..d285422 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -5,5 +5,3 @@ autogluon==0.8.2 alibi==0.9.4 pyarrow==13.0.0 pre-commit==3.3.3 -sphinx==7.2.5 -sphinx_rtd_theme==1.3.0