Compare commits

..

1 commit

Author SHA1 Message Date
quandanrepo
b8dcf626b2
Merge pull request #117 from Hestia-Homes/sap-dev
Sap dev
2024-05-30 20:18:25 +01:00
19 changed files with 112 additions and 432 deletions

View file

@ -32,92 +32,6 @@ jobs:
# echo "Please choose one of these tags: 'major', 'minor', 'patch'"
# exit(1)
Verify-Lambda:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install packages to retrieve artifacts
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline
dvc pull -r experiments
- name: Set timestamp
id: set_timestamp
run: |
echo "timestamp=$(date +%Y%m%d)" >> $GITHUB_ENV
echo "Generated timestamp: ${timestamp}"
- name: Upload sample row dataset to S3
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/data/prepared_data/
aws s3 cp sample_test.parquet s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet
- name: Build Lambda docker Image
run: |
docker build . --file ./deployment/Dockerfile.prediction.lambda --tag lambda_test
- name: Run lambda docker container
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
docker run -d -p 9000:8080 \
-e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \
-e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \
-e RUNTIME_ENVIRONMENT=dev \
-e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev lambda_test
- name: Test Lambda endpoint
run: |
sleep 2
curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
-H "Content-Type: application/json" \
-d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"warm\\\": true}\"}"
- name: Get Lambda logs
run: |
docker logs $(docker ps -al -q)
- name: Test Lambda endpoint again
run: |
sleep 2
curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
-H "Content-Type: application/json" \
-d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"testing\\\": true}\"}"
- name: Get Lambda logs
run: |
docker logs $(docker ps -al -q)
- name: Stop Lambda container
run: |
docker stop lambda_test || echo "Container already stopped"
- name: Remove uploaded sample row dataset from S3
if: always()
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
aws s3 rm --recursive s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/
Verify-Model:
runs-on: ubuntu-latest

View file

@ -8,65 +8,25 @@
"active": true
},
"sap": {
"version": "v0.17.1",
"version": "v0.14.0",
"stage": {
"dev": "v0.17.1"
"dev": "v0.14.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.7.0",
"version": "v0.5.0",
"stage": {
"dev": "v0.7.0"
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.7.0",
"version": "v0.5.0",
"stage": {
"dev": "v0.7.0"
},
"registered": true,
"active": true
},
"hotwater": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"heating": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"lighting": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"hotwaterkwh": {
"version": "v1.3.0",
"stage": {
"dev": "v1.3.0"
},
"registered": true,
"active": true
},
"heatingkwh": {
"version": "v1.5.0",
"stage": {
"dev": "v1.5.0"
"dev": "v0.5.0"
},
"registered": true,
"active": true

View file

@ -83,13 +83,3 @@ curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d
```
This will send a POST request to the running Lambda function and pass in the required data as JSON.
To exercise the Lambda's CI/CD testing path or its warm-up path, send a request with the `testing` or `warm` key respectively:
```bash
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"testing\": \"true\"}"}'
```
or
```bash
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"warm\": \"true\"}"}'
```

View file

@ -1,24 +1,19 @@
FROM public.ecr.aws/lambda/python:3.12
FROM public.ecr.aws/lambda/python:3.10
# Set the working directory
WORKDIR ${LAMBDA_TASK_ROOT}
ENV PYTHONPATH="${PYTHONPATH}:${LAMBDA_TASK_ROOT}"
ENV MPLCONFIGDIR="/tmp/matplotlib"
ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}"
# Environment variables
ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally
RUN dnf install -y gcc python3-devel gcc-c++
RUN yum install -y gcc python3-devel gcc-c++
# Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install --no-cache-dir -r ./requirements.txt
RUN pip install --no-cache-dir -r ./requirements.txt
# Copy the project code
COPY modules/ml-pipeline/src/pipeline ./pipeline

View file

@ -47,30 +47,6 @@ def upload_dataframe_to_s3(df, bucket, s3_file_name):
return False
def warming_up_invocation(
model,
model_filepath: str,
):
"""
Function to handle warm up invocations
"""
import pandas as pd
import numpy as np
model.load_model(model_filepath)
warmup_df = pd.DataFrame(
np.zeros((1, len(model.model.original_features))),
columns=model.model.original_features,
)
model_names = model.model.model_names()
if "NeuralNetFastAI" in model_names:
model.model.predict(warmup_df, model="NeuralNetFastAI")
else:
model.predict(data=warmup_df)
def handler(event, context):
"""
Take in event and trigger the prediction pipeline
@ -90,6 +66,9 @@ def handler(event, context):
created_at = body["created_at"]
# TODO: Implement the loading of the model and prediction
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate MLModel ---")
build_model_params = settings.build_model
@ -99,32 +78,6 @@ def handler(event, context):
model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]
if "warm" in body:
logger.info("Warm up invocation - synthetic prediction")
warming_up_invocation(model=model, model_filepath=model_filepath)
return {
"statusCode": 200,
"body": json.dumps(
{
"message": "Successfully warmed up invocation",
}
),
}
if "testing" in body:
logger.info(
"Testing invocation for CI/CD - save file to same location in S3"
)
storage_filepath = body["file_location"].replace(
".parquet", "_output.parquet"
)
else:
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate Input DataClient ---")
input_dataclient = dataclient_factory(
dataclient_type="aws-s3",
@ -142,7 +95,7 @@ def handler(event, context):
output_dataclient=output_dataclient,
model=model,
target=feature_process_params["feature_processor_config"]["target"],
model_filepath=model_filepath,
model_filepath=build_model_params["model_save_filepath"],
test_data_filepath=body["file_location"],
predictions_output_filepath=storage_filepath,
predictions_column_name=generate_predictions_params[

View file

@ -1,8 +1,7 @@
export PYENV_ROOT=$(HOME)/.pyenv
export PATH := $(PYENV_ROOT)/bin:$(PATH)
PYTHON_VERSION ?= 3.12.12
PYTHON_VERSION ?= 3.10.12
CONDA_ENV=dev_env_pipeline
CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate
.PHONY: init
init: dev-conda
@ -13,15 +12,11 @@ dev-conda:
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
conda init bash
${CONDA_ACTIVATE} ${CONDA_ENV} && \
which pip && \
pip install --upgrade pip && \
pip install uv && \
uv pip install -r src/pipeline/requirements/training/requirements-dev.txt && \
uv pip install -r src/pipeline/requirements/version_control/requirements.txt && \
pre-commit install && \
uv pip install ipykernel && \
conda install llvm-openmp -y
conda run -v -n ${CONDA_ENV} pip install --upgrade pip
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
conda run -v -n ${CONDA_ENV} pre-commit install
conda run -v -n ${CONDA_ENV} pip install ipykernel
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
echo "conda activate ${CONDA_ENV}"

View file

@ -1,17 +1,12 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.12.12-slim
FROM python:3.10.12-slim
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
COPY pipeline/requirements/predictions/requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install -r requirements.txt
RUN pip install -r requirements.txt
# Assumes the CI/CD flow runs a `dvc pull` step beforehand to fetch the data and model, so only a single script needs to be run here
COPY pipeline/ /home/pipeline/

View file

@ -29,7 +29,6 @@ data_filepath = prepare_data_params["data_filepath"]
train_proportion = prepare_data_params["train_proportion"]
output_train_filepath = prepare_data_params["output_train_filepath"]
output_test_filepath = prepare_data_params["output_test_filepath"]
sample_test_filepath = prepare_data_params["sample_test_filepath"]
feature_processor_config = feature_process_params["feature_processor_config"]
logger.info(f"--- Initiate DataClient ---")
@ -100,10 +99,6 @@ def prepare_data(
logger.info("--- Outputting data ---")
output_dataclient.save_data(
obj=data.sample(1), location=sample_test_filepath, save_config=None
)
output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None
)

View file

@ -99,12 +99,6 @@ def generate_scenario_predictions(
]
)
# TEMPORARY FIX: ADD is_post_sap10_starting and is_post_sap10_ending if not present
if "is_post_sap10_starting" not in scenario_data.columns:
scenario_data["is_post_sap10_starting"] = False
if "is_post_sap10_ending" not in scenario_data.columns:
scenario_data["is_post_sap10_ending"] = False
logger.info("--- Loading Model ---")
model.load_model(model_filepath)

View file

@ -14,23 +14,9 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 3600
time_limit: 1800
presets: medium_quality
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 1
infer_limit: 0.05
infer_limit_batch_size: 10000
fit_strategy: "parallel"
ag_args_ensemble: {'num_folds_parallel': 2}
num_gpus: 0
hyperparameters:
{
'NN_TORCH': [{}],
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0,}}],
# 'GBM': [{}],
'CAT': [{}],
'XGB': [{}],
'FASTAI': [{}],
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}

View file

@ -3,10 +3,11 @@ default:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/07-10-2024-16-26-06/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/08-10-2024-15-07-33/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/08-10-2024-22-18-44/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/09-10-2024-18-21-08/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,15 +18,13 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-08-21-58-03/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2025-09-05-14-05-32/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
sample_test_filepath: ./data/prepared_data/sample_test.parquet
feature_processor:
feature_processor_type: dataframe
@ -39,9 +37,7 @@ default:
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms', 'lighting_cost_starting',
'lighting_cost_ending', 'heating_cost_starting', 'heating_cost_ending', 'hot_water_cost_starting', 'hot_water_cost_ending',
'floor_thermal_transmittance', 'floor_thermal_transmittance_ending', 'lodgement_date_starting', 'lodgement_date_ending',]
'number_habitable_rooms', 'number_heated_rooms']
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -1,4 +1,4 @@
""" "
""""
Implementations of MLModels, all of which will have four methods to:
- Load model
- Save Model
@ -11,6 +11,9 @@ import joblib
import pandas as pd
from pathlib import Path
from typing import Union, List
from sklearn import linear_model
from sklearn.svm import SVR
from autogluon.tabular import TabularDataset, TabularPredictor
from core.interface.InterfaceModels import MLModel
from core.Logger import logger
@ -66,8 +69,6 @@ class SKLearnLinearRegression:
"""
Method to train a model
"""
from sklearn import linear_model
self.model = linear_model.LinearRegression()
x_train = data.iloc[:, data.columns != target]
@ -116,7 +117,6 @@ class SKLearnSVMRegression:
"""
Method to train a model
"""
from sklearn.svm import SVR
validate_dict_keys(
list(model_hyperparameters.keys()),
@ -152,17 +152,12 @@ class AutogluonAutoML:
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
"fit_strategy",
"num_gpus",
"hyperparameters",
]
def load_model(self, path: Union[Path, str]) -> None:
"""
Method to load a model
"""
from autogluon.tabular import TabularPredictor
filepath = str(path)
self.model = TabularPredictor.load(path=filepath)
@ -188,10 +183,6 @@ class AutogluonAutoML:
"""
Method to train a model
"""
from autogluon.tabular import TabularDataset, TabularPredictor
# Force Parallel Model fitting
os.environ["AG_FORCE_PARALLEL"] = "True"
validate_dict_keys(
keys_1=list(model_hyperparameters.keys()),
@ -218,9 +209,6 @@ class AutogluonAutoML:
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
fit_strategy=model_hyperparameters["fit_strategy"],
num_gpus=model_hyperparameters["num_gpus"],
hyperparameters=model_hyperparameters["hyperparameters"].to_dict(),
)
def predict(

View file

@ -16,8 +16,8 @@ stages:
deps:
- path: 1_prepare_data.py
hash: md5
md5: a5ce162e1c402c0f811a80ef78cf4dd5
size: 4481
md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4298
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
@ -34,36 +34,24 @@ stages:
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
- lighting_cost_starting
- lighting_cost_ending
- heating_cost_starting
- heating_cost_ending
- hot_water_cost_starting
- hot_water_cost_ending
- floor_thermal_transmittance
- floor_thermal_transmittance_ending
- lodgement_date_starting
- lodgement_date_ending
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath:
./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath:
./data/prepared_data/train.parquet
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 0.9
outs:
- path: data/prepared_data/
hash: md5
md5: 54204b6a31ba369cfbd26b9b25bfa355.dir
size: 46095230
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
@ -73,9 +61,9 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 54204b6a31ba369cfbd26b9b25bfa355.dir
size: 46095230
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/build_model.yaml:
default:
@ -91,7 +79,7 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 3600
time_limit: 1800
presets: medium_quality
excluded_model_types:
- RF
@ -99,93 +87,25 @@ stages:
- NN_TORCH
- KNN
- XT
infer_limit: 1
infer_limit: 0.05
infer_limit_batch_size: 10000
fit_strategy: parallel
ag_args_ensemble:
num_folds_parallel: 2
num_gpus: 0
hyperparameters:
NN_TORCH:
- {}
GBM:
- extra_trees: true
ag_args:
name_suffix: XT
- {}
- learning_rate: 0.03
num_leaves: 128
feature_fraction: 0.9
min_data_in_leaf: 3
ag_args:
name_suffix: Large
priority: 0
CAT:
- {}
XGB:
- {}
FASTAI:
- {}
RF:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
XT:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
KNN:
- weights: uniform
ag_args:
name_suffix: Unif
- weights: distance
ag_args:
name_suffix: Dist
outs:
- path: data/fit_predictions/
hash: md5
md5: f29cfa6a2dadf4fbe81813b3d517fd10.dir
size: 3474971
md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3349989
nfiles: 1
- path: data/model/
hash: md5
md5: 1156f526fe9d11134e49f805c41c3781.dir
size: 763384978
nfiles: 35
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
- path: metrics/fit_metrics.json
hash: md5
md5: 24b2f7c34e5e08b66f39289afac5d795
size: 226
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 224
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -195,28 +115,26 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 1156f526fe9d11134e49f805c41c3781.dir
size: 763384978
nfiles: 35
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
- path: data/prepared_data
hash: md5
md5: 54204b6a31ba369cfbd26b9b25bfa355.dir
size: 46095230
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/settings.yaml:
default.generate_predictions.input_dataclient_type: local
default.generate_predictions.output_dataclient_type: local
default.generate_predictions.predictions_column_name: predictions
default.generate_predictions.predictions_output_filepath:
./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath:
./data/prepared_data/test.parquet
default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
outs:
- path: data/predictions/
hash: md5
md5: e9b1d9b94d1e44c999c17b7a2d096db9.dir
size: 484818
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -227,14 +145,14 @@ stages:
size: 3484
- path: data/predictions
hash: md5
md5: e9b1d9b94d1e44c999c17b7a2d096db9.dir
size: 484818
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 54204b6a31ba369cfbd26b9b25bfa355.dir
size: 46095230
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/settings.yaml:
default.generate_metrics.dataclient_type: local
@ -243,30 +161,30 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 88a4e49229cc3c329faf5bf0fcae3318
size: 226
md5: 3e08df02fd5c5d094bcf936e1338d596
size: 223
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- path: 5_generate_scenarios.py
hash: md5
md5: 872b0c762ce1c8933fcbc5f54d5d4b5d
size: 5658
md5: 40506749fefd926d47c60ff5b16db307
size: 5337
params:
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/09-10-2024-18-21-08/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: 3326cc2e59ac1671d99d3e1f27131f54
size: 356
md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 363
- path: metrics/scenario_table.md
hash: md5
md5: 0a434e055463ec9ade5de2de9bde7154
size: 872
md5: d6baf100a1623cc2467c2f8221d314c9
size: 2133

View file

@ -1,7 +1,7 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
pyarrow==20.0.0
pre-commit==4.3.0
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
pyarrow==20.0.0
PyYAML==6.0.3
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,10 +1,10 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
ray==2.44.1
dynaconf==3.2.12
# alibi
shap==0.49.1
pyarrow==20.0.0
pre-commit==4.3.0
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
ray==2.6.3
dynaconf==3.2.1
alibi==0.9.5
shap==0.42.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
boto3==1.28.41
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1

View file

@ -1,4 +1,4 @@
dvc==3.51.0
dvc-s3==3.2.0
gto==1.9.0
gto==1.7.1
pyOpenSSL==23.3.0