Update to Python 3.12 and AutoGluon 1.3

2026-06-08 11:17:25 +00:00 · 2025-11-02 11:37:08 +00:00 · 2025-11-02 11:37:08 +00:00 · 7b001f3abf
commit 7b001f3abf
parent 88c5b6c93a
13 changed files with 92 additions and 74 deletions
--- a/modules/ml-pipeline/Makefile
+++ b/modules/ml-pipeline/Makefile
@ -1,7 +1,8 @@
 export PYENV_ROOT=$(HOME)/.pyenv
 export PATH := $(PYENV_ROOT)/bin:$(PATH)
-PYTHON_VERSION ?= 3.10.12
-CONDA_ENV=dev_env_pipeline
+PYTHON_VERSION ?= 3.12.12
+CONDA_ENV=dev_env_pipeline_1
+CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate

 .PHONY: init
 init: dev-conda
@ -12,11 +13,13 @@ dev-conda:
 	# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
 	conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
 	conda init bash
-	conda run -v -n ${CONDA_ENV} pip install --upgrade pip
-	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
-	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
-	conda run -v -n ${CONDA_ENV} pre-commit install
-	conda run -v -n ${CONDA_ENV} pip install ipykernel
+	${CONDA_ACTIVATE} ${CONDA_ENV} && \
+		which pip && \
+		pip install --upgrade pip && \
+		pip install -r src/pipeline/requirements/training/requirements-dev.txt && \
+		pip install -r src/pipeline/requirements/version_control/requirements.txt && \
+		pre-commit install && \
+		pip install ipykernel
 	echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
 	echo "conda activate ${CONDA_ENV}"

--- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
+++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
@ -99,6 +99,12 @@ def generate_scenario_predictions(
            ]
        )

+    # TEMPORARY FIX: default is_post_sap10_starting and is_post_sap10_ending to False when the columns are missing
+    if "is_post_sap10_starting" not in scenario_data.columns:
+        scenario_data["is_post_sap10_starting"] = False
+    if "is_post_sap10_ending" not in scenario_data.columns:
+        scenario_data["is_post_sap10_ending"] = False
+
    logger.info("--- Loading Model ---")

    model.load_model(model_filepath)
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -17,6 +17,7 @@ default:
      time_limit: 1800
      presets: medium_quality
      excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
-      infer_limit: 0.05
+      infer_limit: 0.0005
      infer_limit_batch_size: 10000
+      "fit_strategy": "parallel"
      ag_args_ensemble: {'num_folds_parallel': 2}
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -21,7 +21,8 @@ default:
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-08-21-58-03/dataset_rooms.parquet
-    data_filepath: s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet
+    data_filepath: s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
    train_proportion: 0.9
    output_train_filepath: ./data/prepared_data/train.parquet
    output_test_filepath: ./data/prepared_data/test.parquet
--- a/modules/ml-pipeline/src/pipeline/core/MLModels.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py
@ -1,4 +1,4 @@
-""""
+""" "
 Implementations of MLModels, all of which will have four methods to:
 - Load model
 - Save Model
@ -152,6 +152,7 @@ class AutogluonAutoML:
        "infer_limit",
        "infer_limit_batch_size",
        "ag_args_ensemble",
+        "fit_strategy",
    ]

    def load_model(self, path: Union[Path, str]) -> None:
@ -209,6 +210,7 @@ class AutogluonAutoML:
            infer_limit=model_hyperparameters["infer_limit"],
            infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
            ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
+            fit_strategy=model_hyperparameters["fit_strategy"],
        )

    def predict(
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -49,18 +49,20 @@ stages:
        default.feature_processor.feature_processor_config.subsample_seed: 0
        default.feature_processor.feature_processor_config.target: sap_ending
        default.feature_processor.feature_processor_type: dataframe
-        default.prepare_data.data_filepath: 
-          s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet
+        default.prepare_data.data_filepath:
+          s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
        default.prepare_data.input_dataclient_type: aws-s3
        default.prepare_data.output_dataclient_type: local
-        default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
-        default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
+        default.prepare_data.output_test_filepath:
+          ./data/prepared_data/test.parquet
+        default.prepare_data.output_train_filepath:
+          ./data/prepared_data/train.parquet
        default.prepare_data.train_proportion: 0.9
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 7cc090d55cb296ce5d360d655c06e861.dir
-      size: 46183314
+      md5: 5c56787d9e6450e26a78c15700e104c7.dir
+      size: 45746089
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
@ -71,8 +73,8 @@ stages:
      size: 4820
    - path: data/prepared_data
      hash: md5
-      md5: 7cc090d55cb296ce5d360d655c06e861.dir
-      size: 46183314
+      md5: 5c56787d9e6450e26a78c15700e104c7.dir
+      size: 45746089
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -97,25 +99,26 @@ stages:
              - NN_TORCH
              - KNN
              - XT
-              infer_limit: 0.05
+              infer_limit: 0.0005
              infer_limit_batch_size: 10000
+              fit_strategy: parallel
              ag_args_ensemble:
                num_folds_parallel: 2
    outs:
    - path: data/fit_predictions/
      hash: md5
-      md5: a6196bf08607c43ba6bc637611bb32b0.dir
-      size: 3491001
+      md5: 4fa77e3f129d2e6f9ef7222c44978c26.dir
+      size: 3474669
      nfiles: 1
    - path: data/model/
      hash: md5
-      md5: b225d7b01356cecefb3794a9a3cd19b5.dir
-      size: 790430916
-      nfiles: 36
+      md5: e27b9216bc7455f8245d5b49f27b2707.dir
+      size: 753575768
+      nfiles: 30
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 33421d5e3a2d569dbe6d4486c568a2b7
-      size: 225
+      md5: 426a162284ca9e29c043eb1d72e547e6
+      size: 224
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
@ -125,26 +128,28 @@ stages:
      size: 2464
    - path: data/model
      hash: md5
-      md5: b225d7b01356cecefb3794a9a3cd19b5.dir
-      size: 790430916
-      nfiles: 36
+      md5: e27b9216bc7455f8245d5b49f27b2707.dir
+      size: 753575768
+      nfiles: 30
    - path: data/prepared_data
      hash: md5
-      md5: 7cc090d55cb296ce5d360d655c06e861.dir
-      size: 46183314
+      md5: 5c56787d9e6450e26a78c15700e104c7.dir
+      size: 45746089
      nfiles: 2
    params:
      configs/settings.yaml:
        default.generate_predictions.input_dataclient_type: local
        default.generate_predictions.output_dataclient_type: local
        default.generate_predictions.predictions_column_name: predictions
-        default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
-        default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
+        default.generate_predictions.predictions_output_filepath:
+          ./data/predictions/predictions.parquet
+        default.generate_predictions.test_data_filepath:
+          ./data/prepared_data/test.parquet
    outs:
    - path: data/predictions/
      hash: md5
-      md5: bd6821db9abc95af8c74aa20effd7f37.dir
-      size: 487194
+      md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir
+      size: 484524
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
@ -155,13 +160,13 @@ stages:
      size: 3484
    - path: data/predictions
      hash: md5
-      md5: bd6821db9abc95af8c74aa20effd7f37.dir
-      size: 487194
+      md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir
+      size: 484524
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 7cc090d55cb296ce5d360d655c06e861.dir
-      size: 46183314
+      md5: 5c56787d9e6450e26a78c15700e104c7.dir
+      size: 45746089
      nfiles: 2
    params:
      configs/settings.yaml:
@ -171,15 +176,15 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 9c2a7802554f5c2f750b2242c6003026
+      md5: b9ae6d24424f2d5389697577e9076b91
      size: 223
  generate_scenerio_metrics:
    cmd: python 5_generate_scenarios.py
    deps:
    - path: 5_generate_scenarios.py
      hash: md5
-      md5: 40506749fefd926d47c60ff5b16db307
-      size: 5337
+      md5: 872b0c762ce1c8933fcbc5f54d5d4b5d
+      size: 5658
    params:
      configs/scenarios.yaml:
        default.scenarios:
@ -192,9 +197,9 @@ stages:
    outs:
    - path: metrics/scenario_metrics.md
      hash: md5
-      md5: c01524a0cc2e61151c106d7049af3bf9
+      md5: 32d78c20d91fedf2f5dbb4162f323e25
      size: 356
    - path: metrics/scenario_table.md
      hash: md5
-      md5: a995c8ef7ffbe2ca254441150817ae21
+      md5: 52cbd19566151b0c300f9673252704d2
      size: 872
--- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore
+++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore
@ -1,4 +1,2 @@
-/fit_metrics.json
-/metrics.json
 /scenario_table.md
 /scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/metrics/fit_metrics.json
+++ b/modules/ml-pipeline/src/pipeline/metrics/fit_metrics.json
@ -0,0 +1 @@
+{"mean_absolute_error": 1.2158480882644653, "median_absolute_error": 0.8539352416992188, "mean_squared_error": 3.116994857788086, "mean_absolute_percentage_error": 0.01968802697956562, "symmetric_mape": 0.019615056540152054}
--- a/modules/ml-pipeline/src/pipeline/metrics/metrics.json
+++ b/modules/ml-pipeline/src/pipeline/metrics/metrics.json
@ -0,0 +1 @@
+{"mean_absolute_error": 2.121211290359497, "median_absolute_error": 1.3063621520996094, "mean_squared_error": 11.15064525604248, "mean_absolute_percentage_error": 0.03622421622276306, "symmetric_mape": 0.035541225671999285}
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
@ -1,7 +1,7 @@
-joblib==1.3.2
-boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
-pyarrow==13.0.0
-pre-commit==3.3.3
+joblib==1.5.2
+boto3==1.40.61
+pandas==2.2.3
+autogluon.tabular[all]==1.3
+dynaconf==3.2.12
+pyarrow==22.0.0
+pre-commit==4.3.0
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
@ -1,7 +1,7 @@
-joblib==1.3.2
-boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
-pyarrow==13.0.0
-PyYAML==6.0.1
+joblib==1.5.2
+boto3==1.40.61
+pandas==2.2.3
+autogluon.tabular[all]==1.3
+dynaconf==3.2.12
+pyarrow==22.0.0
+PyYAML==6.0.3
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
@ -1,10 +1,10 @@
-joblib==1.3.2
-boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-ray==2.6.3
-dynaconf==3.2.1
-alibi==0.9.5
-shap==0.42.1
-pyarrow==13.0.0
-pre-commit==3.3.3
+joblib==1.5.2
+boto3==1.40.61
+pandas==2.2.3
+autogluon.tabular[all]==1.3
+ray==2.44.1
+dynaconf==3.2.12
+alibi==0.5.5
+shap==0.49.1
+pyarrow==22.0.0
+pre-commit==4.3.0
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
@ -1,4 +1,4 @@
-boto3==1.28.41
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
+boto3==1.40.61
+pandas==2.2.3
+autogluon.tabular[all]==1.3
+dynaconf==3.2.12
				`@ -0,0 +1 @@`
				`{"mean_absolute_error": 1.2158480882644653, "median_absolute_error": 0.8539352416992188, "mean_squared_error": 3.116994857788086, "mean_absolute_percentage_error": 0.01968802697956562, "symmetric_mape": 0.019615056540152054}`
				`@ -0,0 +1 @@`
				`{"mean_absolute_error": 2.121211290359497, "median_absolute_error": 1.3063621520996094, "mean_squared_error": 11.15064525604248, "mean_absolute_percentage_error": 0.03622421622276306, "symmetric_mape": 0.035541225671999285}`