diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index d06df5f..503ea3d 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -5,7 +5,7 @@ on: # branches: # - "model-**" pull_request: - branches: ["sap-dev", "heat-dev", "carbon-dev", "hotwaterkwh-dev"] + branches: ["sap-dev", "heat-dev", "carbon-dev"] label: types: ["created", "edited"] @@ -32,6 +32,92 @@ jobs: # echo "Please choose one of these tags: 'major', 'major', 'patch'" # exit(1) + Verify-Lambda: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Install packages to retrieve artifacts + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt + + - name: Retrieve artifacts (dvc.lock) + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-pipeline/src/pipeline + dvc pull -r experiments + + - name: Set timestamp + id: set_timestamp + run: | + echo "timestamp=$(date +%Y%m%d)" >> $GITHUB_ENV + echo "Generated timestamp: ${timestamp}" + + - name: Upload sample row dataset to S3 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-pipeline/src/pipeline/data/prepared_data/ + aws s3 cp sample_test.parquet s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet + + - name: Build Lambda docker Image + run: | + docker build . --file ./deployment/Dockerfile.prediction.lambda --tag lambda_test + + - name: Run lambda docker container + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + docker run -d -p 9000:8080 \ + -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \ + -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \ + -e RUNTIME_ENVIRONMENT=dev \ + -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev lambda_test + + - name: Test Lambda endpoint + run: | + sleep 2 + curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \ + -H "Content-Type: application/json" \ + -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"warm\\\": true}\"}" + + - name: Get Lambda logs + run: | + docker logs $(docker ps -al -q) + + - name: Test Lambda endpoint again + run: | + sleep 2 + curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \ + -H "Content-Type: application/json" \ + -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"testing\\\": true}\"}" + + - name: Get Lambda logs + run: | + docker logs $(docker ps -al -q) + + - name: Stop Lambda container + run: | + docker stop lambda_test || echo "Container already stopped" + + - name: Remove uploaded sample row dataset from S3 + if: always() + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + aws s3 rm --recursive s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/ + + Verify-Model: runs-on: ubuntu-latest @@ -114,4 +200,4 @@ jobs: # metrics_location=$(find . -maxdepth 10 -name "residuals.png") # echo $metrics_location # cd $metric_location - # echo "![](./residuals.png)" >> report.md + # echo "![](./residuals.png)" >> report.md \ No newline at end of file diff --git a/README.md b/README.md index 22a6002..2d7d1e3 100644 --- a/README.md +++ b/README.md @@ -83,3 +83,13 @@ curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d ``` This will send a POST request to the running Lambda function and pass in the required data as JSON. + +For the testing of warm or testing of the lambda, use: + +```json +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"testing\": \"true\"}"}' +``` +or +```json +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"warm\": \"true\"}"}' +``` \ No newline at end of file diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index f8000bf..8ef1d11 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -1,19 +1,24 @@ -FROM public.ecr.aws/lambda/python:3.10 +FROM public.ecr.aws/lambda/python:3.12 # Set the working directory WORKDIR ${LAMBDA_TASK_ROOT} -ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}" +ENV PYTHONPATH="${PYTHONPATH}:${LAMBDA_TASK_ROOT}" +ENV MPLCONFIGDIR="/tmp/matplotlib" # Environment variables ARG RUNTIME_ENVIRONMENT ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} # Install necessary build tools - required to test locally -RUN yum install -y gcc python3-devel gcc-c++ +RUN dnf install -y gcc python3-devel gcc-c++ # Install python packages COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt -RUN pip install --no-cache-dir -r ./requirements.txt + +RUN pip install uv + +RUN uv pip install -r requirements.txt --system +# RUN pip install --no-cache-dir -r ./requirements.txt # Copy the project code COPY modules/ml-pipeline/src/pipeline ./pipeline @@ -22,4 +27,4 @@ COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py WORKDIR ${LAMBDA_TASK_ROOT}/pipeline -CMD [ "prediction_app.handler" ] +CMD [ "prediction_app.handler" ] \ No newline at end of file diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index ac397b9..f1036d3 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -47,6 +47,30 @@ def upload_dataframe_to_s3(df, bucket, s3_file_name): return False +def warming_up_invocation( + model, + model_filepath: str, +): + """ + Function to handle warm up invocations + """ + import pandas as pd + import numpy as np + + model.load_model(model_filepath) + + warmup_df = pd.DataFrame( + np.zeros((1, len(model.model.original_features))), + columns=model.model.original_features, + ) + + # model_names = model.model.model_names() + # if "NeuralNetFastAI" in model_names: + # model.model.predict(warmup_df, model="NeuralNetFastAI") + # else: + model.predict(data=warmup_df) + + def handler(event, context): """ Take in event and trigger the prediction pipeline @@ -66,9 +90,6 @@ def handler(event, context): created_at = body["created_at"] # TODO: Implement the loading of the model and prediction - - storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" - logger.info(f"--- Initiate MLModel ---") build_model_params = settings.build_model @@ -78,6 +99,32 @@ def handler(event, context): model = model_factory(build_model_params["model_type"]) + model_filepath = build_model_params["model_save_filepath"] + + if "warm" in body: + logger.info("Warm up invocation - synthetic prediction") + + warming_up_invocation(model=model, model_filepath=model_filepath) + + return { + "statusCode": 200, + "body": json.dumps( + { + "message": "Successfully warmed up invocation", + } + ), + } + + if "testing" in body: + logger.info( + "Testing invocation for CI/CD - save file to same location in S3" + ) + storage_filepath = body["file_location"].replace( + ".parquet", "_output.parquet" + ) + else: + storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" + logger.info(f"--- Initiate Input DataClient ---") input_dataclient = dataclient_factory( dataclient_type="aws-s3", @@ -95,7 +142,7 @@ def handler(event, context): output_dataclient=output_dataclient, model=model, target=feature_process_params["feature_processor_config"]["target"], - model_filepath=build_model_params["model_save_filepath"], + model_filepath=model_filepath, test_data_filepath=body["file_location"], predictions_output_filepath=storage_filepath, predictions_column_name=generate_predictions_params[ diff --git a/deployment/serverless.yml b/deployment/serverless.yml index b23158d..3596c2b 100644 --- a/deployment/serverless.yml +++ b/deployment/serverless.yml @@ -51,3 +51,4 @@ functions: path: /predict method: POST timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed + memorySize: 3008 diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 0bef7d6..4a69d1c 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -1,7 +1,8 @@ export PYENV_ROOT=$(HOME)/.pyenv export PATH := $(PYENV_ROOT)/bin:$(PATH) -PYTHON_VERSION ?= 3.10.12 +PYTHON_VERSION ?= 3.12.12 CONDA_ENV=dev_env_pipeline +CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate .PHONY: init init: dev-conda @@ -12,11 +13,15 @@ dev-conda: # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -v -n ${CONDA_ENV} pip install --upgrade pip - conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -v -n ${CONDA_ENV} pre-commit install - conda run -v -n ${CONDA_ENV} pip install ipykernel + ${CONDA_ACTIVATE} ${CONDA_ENV} && \ + which pip && \ + pip install --upgrade pip && \ + pip install uv && \ + uv pip install -r src/pipeline/requirements/training/requirements-dev.txt && \ + uv pip install -r src/pipeline/requirements/version_control/requirements.txt && \ + pre-commit install && \ + uv pip install ipykernel && \ + conda install llvm-openmp -y echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" @@ -33,4 +38,4 @@ dev-pyenv: .PHONY: dvc-init dvc-init: - . .dev_env_pipeline/bin/activate && dvc init --subdir + . .dev_env_pipeline/bin/activate && dvc init --subdir \ No newline at end of file diff --git a/modules/ml-pipeline/src/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile index e0a292c..c62ee94 100644 --- a/modules/ml-pipeline/src/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -1,16 +1,21 @@ # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) -FROM python:3.10.12-slim +FROM python:3.12.12-slim RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev COPY pipeline/requirements/predictions/requirements.txt requirements.txt RUN pip install --upgrade pip -RUN pip install -r requirements.txt + +RUN pip install uv + +RUN uv pip install -r requirements.txt --system + +# RUN pip install -r requirements.txt # Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script COPY pipeline/ /home/pipeline/ WORKDIR /home/pipeline/ -CMD [ "python", "3_generate_predictions.py"] +CMD [ "python", "3_generate_predictions.py"] \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 75d784f..6b4ab84 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -29,6 +29,7 @@ data_filepath = prepare_data_params["data_filepath"] train_proportion = prepare_data_params["train_proportion"] output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] +sample_test_filepath = prepare_data_params["sample_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] logger.info(f"--- Initiate DataClient ---") @@ -99,6 +100,10 @@ def prepare_data( logger.info("--- Outputting data ---") + output_dataclient.save_data( + obj=data.sample(1), location=sample_test_filepath, save_config=None + ) + output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None ) diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 6debe32..faab4a9 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -99,6 +99,12 @@ def generate_scenario_predictions( ] ) + # TEMPORARY FIX: ADD is_post_sap10_starting and is_post_sap10_ending if not present + if "is_post_sap10_starting" not in scenario_data.columns: + scenario_data["is_post_sap10_starting"] = False + if "is_post_sap10_ending" not in scenario_data.columns: + scenario_data["is_post_sap10_ending"] = False + logger.info("--- Loading Model ---") model.load_model(model_filepath) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index a36bfbc..d0d3806 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,9 +14,23 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 1800 + time_limit: 3600 presets: medium_quality - excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] - infer_limit: 0.05 + excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT', 'FASTAI'] + infer_limit: 1 infer_limit_batch_size: 10000 + fit_strategy: "parallel" ag_args_ensemble: {'num_folds_parallel': 2} + num_gpus: 0 + hyperparameters: + { + 'NN_TORCH': [{}], + 'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0,}}], + # 'GBM': [{}], + 'CAT': [{}], + 'XGB': [{}], + 'FASTAI': [{}], + 'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], + 'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}], + 'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}], + } \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index f623210..e7ede8e 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,6 +22,7 @@ default: train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet + sample_test_filepath: ./data/prepared_data/sample_test.parquet feature_processor: feature_processor_type: dataframe diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 257261d..2444314 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -1,4 +1,4 @@ -"""" +""" " Implementations of MLModels, all of which will have four methods to: - Load model - Save Model @@ -11,9 +11,6 @@ import joblib import pandas as pd from pathlib import Path from typing import Union, List -from sklearn import linear_model -from sklearn.svm import SVR -from autogluon.tabular import TabularDataset, TabularPredictor from core.interface.InterfaceModels import MLModel from core.Logger import logger @@ -69,6 +66,8 @@ class SKLearnLinearRegression: """ Method to train a model """ + from sklearn import linear_model + self.model = linear_model.LinearRegression() x_train = data.iloc[:, data.columns != target] @@ -117,6 +116,7 @@ class SKLearnSVMRegression: """ Method to train a model """ + from sklearn.svm import SVR validate_dict_keys( list(model_hyperparameters.keys()), @@ -152,12 +152,17 @@ class AutogluonAutoML: "infer_limit", "infer_limit_batch_size", "ag_args_ensemble", + "fit_strategy", + "num_gpus", + "hyperparameters", ] def load_model(self, path: Union[Path, str]) -> None: """ Method to load a model """ + from autogluon.tabular import TabularPredictor + filepath = str(path) self.model = TabularPredictor.load(path=filepath) @@ -183,6 +188,10 @@ class AutogluonAutoML: """ Method to train a model """ + from autogluon.tabular import TabularDataset, TabularPredictor + + # Force Parallel Model fitting + os.environ["AG_FORCE_PARALLEL"] = "True" validate_dict_keys( keys_1=list(model_hyperparameters.keys()), @@ -209,6 +218,9 @@ class AutogluonAutoML: infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], + fit_strategy=model_hyperparameters["fit_strategy"], + num_gpus=model_hyperparameters["num_gpus"], + hyperparameters=model_hyperparameters["hyperparameters"].to_dict(), ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 1061d02..224ec28 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -16,8 +16,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 11a3b8bfdfe199ab7ecc39ccc5652649 - size: 4298 + md5: a5ce162e1c402c0f811a80ef78cf4dd5 + size: 4481 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -76,15 +76,17 @@ stages: s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local - default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet - default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet + default.prepare_data.output_test_filepath: + ./data/prepared_data/test.parquet + default.prepare_data.output_train_filepath: + ./data/prepared_data/train.parquet default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: c45c73e2e25a5c9697a788cfa04f232d.dir - size: 11682246 - nfiles: 2 + md5: 836879901f44ba1d590f721aead3bb10.dir + size: 11670804 + nfiles: 3 build_model: cmd: python 2_build_model.py deps: @@ -94,9 +96,9 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: c45c73e2e25a5c9697a788cfa04f232d.dir - size: 11682246 - nfiles: 2 + md5: 836879901f44ba1d590f721aead3bb10.dir + size: 11670804 + nfiles: 3 params: configs/build_model.yaml: default: @@ -112,7 +114,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 1800 + time_limit: 3600 presets: medium_quality excluded_model_types: - RF @@ -120,25 +122,94 @@ stages: - NN_TORCH - KNN - XT - infer_limit: 0.05 + - FASTAI + infer_limit: 1 infer_limit_batch_size: 10000 + fit_strategy: parallel ag_args_ensemble: num_folds_parallel: 2 + num_gpus: 0 + hyperparameters: + NN_TORCH: + - {} + GBM: + - extra_trees: true + ag_args: + name_suffix: XT + - {} + - learning_rate: 0.03 + num_leaves: 128 + feature_fraction: 0.9 + min_data_in_leaf: 3 + ag_args: + name_suffix: Large + priority: 0 + CAT: + - {} + XGB: + - {} + FASTAI: + - {} + RF: + - criterion: gini + ag_args: + name_suffix: Gini + problem_types: + - binary + - multiclass + - criterion: entropy + ag_args: + name_suffix: Entr + problem_types: + - binary + - multiclass + - criterion: squared_error + ag_args: + name_suffix: MSE + problem_types: + - regression + - quantile + XT: + - criterion: gini + ag_args: + name_suffix: Gini + problem_types: + - binary + - multiclass + - criterion: entropy + ag_args: + name_suffix: Entr + problem_types: + - binary + - multiclass + - criterion: squared_error + ag_args: + name_suffix: MSE + problem_types: + - regression + - quantile + KNN: + - weights: uniform + ag_args: + name_suffix: Unif + - weights: distance + ag_args: + name_suffix: Dist outs: - path: data/fit_predictions/ hash: md5 - md5: 6abffc8f19e3bb14345f0504a96fd214.dir - size: 1788386 + md5: a3e59cef53439ba2b5dafda82851ce0f.dir + size: 1788338 nfiles: 1 - path: data/model/ hash: md5 - md5: aee2886545c62efbf26d49f32bd1f328.dir - size: 79940408 - nfiles: 35 + md5: a402cbf6c290ab996b4e9c9d032b9bf8.dir + size: 106034886 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: 14e5b4019f6e5cf49edf7945b71e6a66 - size: 220 + md5: 70c5522d13dea392e1351ab39f12ad25 + size: 215 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -148,26 +219,28 @@ stages: size: 2464 - path: data/model hash: md5 - md5: aee2886545c62efbf26d49f32bd1f328.dir - size: 79940408 - nfiles: 35 + md5: a402cbf6c290ab996b4e9c9d032b9bf8.dir + size: 106034886 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: c45c73e2e25a5c9697a788cfa04f232d.dir - size: 11682246 - nfiles: 2 + md5: 836879901f44ba1d590f721aead3bb10.dir + size: 11670804 + nfiles: 3 params: configs/settings.yaml: default.generate_predictions.input_dataclient_type: local default.generate_predictions.output_dataclient_type: local default.generate_predictions.predictions_column_name: predictions - default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet - default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet + default.generate_predictions.predictions_output_filepath: + ./data/predictions/predictions.parquet + default.generate_predictions.test_data_filepath: + ./data/prepared_data/test.parquet outs: - path: data/predictions/ hash: md5 - md5: efe40990a6092494363daa3284a22878.dir - size: 192442 + md5: 7f670582ae9a1fca6ac77c730af1473f.dir + size: 192392 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -178,14 +251,14 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: efe40990a6092494363daa3284a22878.dir - size: 192442 + md5: 7f670582ae9a1fca6ac77c730af1473f.dir + size: 192392 nfiles: 1 - path: data/prepared_data hash: md5 - md5: c45c73e2e25a5c9697a788cfa04f232d.dir - size: 11682246 - nfiles: 2 + md5: 836879901f44ba1d590f721aead3bb10.dir + size: 11670804 + nfiles: 3 params: configs/settings.yaml: default.generate_metrics.dataclient_type: local @@ -194,15 +267,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: c6f913d497eb2f98e801c9e030bd96e9 - size: 222 + md5: 59e19822478e595fc809d8fcba02ce39 + size: 214 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: 40506749fefd926d47c60ff5b16db307 - size: 5337 + md5: 872b0c762ce1c8933fcbc5f54d5d4b5d + size: 5658 params: configs/scenarios.yaml: default.scenarios: diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 4dc4c36..e9e7fc1 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 -pyarrow==13.0.0 -pre-commit==3.3.3 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 +dynaconf==3.2.12 +pyarrow==20.0.0 +pre-commit==4.3.0 \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index 35bdb05..449ac93 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 -pyarrow==13.0.0 -PyYAML==6.0.1 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 +dynaconf==3.2.12 +pyarrow==20.0.0 +PyYAML==6.0.3 \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index 93a042e..64fa24e 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,10 +1,10 @@ -joblib==1.3.2 -boto3==1.28.17 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -ray==2.6.3 -dynaconf==3.2.1 -alibi==0.9.5 -shap==0.42.1 -pyarrow==13.0.0 -pre-commit==3.3.3 +joblib==1.5.2 +boto3==1.40.61 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 +ray==2.44.1 +dynaconf==3.2.12 +# alibi +shap==0.49.1 +pyarrow==20.0.0 +pre-commit==4.3.0 \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index edeb764..bf73cc0 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ -boto3==1.28.41 -pandas==2.1.4 -autogluon.tabular[all]==1.0.0 -dynaconf==3.2.1 +boto3==1.40.61 +pandas==2.3.3 +autogluon.tabular[all]==1.4.0 +dynaconf==3.2.12 \ No newline at end of file diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index 173550d..fa93d82 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ dvc==3.51.0 dvc-s3==3.2.0 -gto==1.7.1 +gto==1.9.0 pyOpenSSL==23.3.0