Compare commits

...

35 commits

Author SHA1 Message Date
quandanrepo
2205a7a91e
Merge pull request #164 from Hestia-Homes/heatingkwh-dev-update
general improvements as per sap model
2025-11-06 18:34:16 +00:00
Michael Duong
37c6824f5f alternate build 2025-11-05 14:05:06 +00:00
Michael Duong
f092af0180 general improvements as per sap model 2025-11-05 10:43:41 +00:00
Github-Bot
8d44c6874a Update Registry 2024-08-09 07:00:29 +00:00
Github-Bot
3d73ad1bee Update Registry 2024-08-09 06:59:48 +00:00
KhalimCK
64e1b57b2d
Merge pull request #143 from Hestia-Homes/heatingkwh-dev-model
remove costing columns, photo supply and main-heating-control
2024-08-09 07:59:09 +01:00
Michael Duong
c2ad73743a remove costing columns, photo supply and main-heating-control 2024-08-08 23:15:53 +01:00
Github-Bot
2fa0353ea1 Update Registry 2024-08-07 09:00:46 +00:00
Github-Bot
cfb9272a7b Update Registry 2024-08-07 09:00:06 +00:00
KhalimCK
6f5857d644
Merge pull request #141 from Hestia-Homes/heatingkwh-dev-model
remove the area-to-heated rooms feature, and env features
2024-08-07 09:59:25 +01:00
Michael Duong
bcb505084f use retain features again with remove env features 2024-08-06 22:18:27 +01:00
Michael Duong
318a51589d remove the area-to-heated rooms feature, and env features 2024-08-06 21:15:23 +01:00
Github-Bot
9a49caa0cd Update Registry 2024-08-06 19:36:12 +00:00
Github-Bot
fb9f364da3 Update Registry 2024-08-06 19:35:38 +00:00
KhalimCK
8a053fc775
Merge pull request #140 from Hestia-Homes/heatingkwh-dev-model
remove the rounding the 100 kwh
2024-08-06 20:35:05 +01:00
Michael Duong
bdb55d3ffe add estimated kwh 2024-08-06 20:22:31 +01:00
Michael Duong
d9b08b98dc remove the rounding the 100 kwh 2024-08-06 16:54:38 +01:00
Github-Bot
a6f6bc6bb5 Update Registry 2024-08-06 11:39:11 +00:00
Github-Bot
b3564e3521 Update Registry 2024-08-06 11:38:37 +00:00
KhalimCK
5c41c45516
Merge pull request #137 from Hestia-Homes/heatingkwh-dev-model
removed features for new heatingkwh model
2024-08-06 12:37:37 +01:00
Michael Duong
7af43ecbef removed features for new model 2024-08-05 22:46:03 +01:00
Github-Bot
119ce13740 Update Registry 2024-08-02 13:15:59 +00:00
Github-Bot
2f26bdd2f5 Update Registry 2024-08-02 13:14:52 +00:00
KhalimCK
0051f9cf97
Merge pull request #135 from Hestia-Homes/heatingkwh-dev-model
try new model
2024-08-02 14:14:18 +01:00
Michael Duong
97b432bac9 try new model 2024-07-28 11:31:03 +01:00
Github-Bot
64e44d0637 Update Registry 2024-07-22 13:33:29 +00:00
Github-Bot
43e5cf5370 Update Registry 2024-07-22 13:32:54 +00:00
KhalimCK
23221c87da
Merge pull request #132 from Hestia-Homes/heatingkwh-dev-model
initial heatingkwh model commit
2024-07-22 14:32:23 +01:00
Michael Duong
5cb8a8a6aa clipped extremely small heating values 2024-07-12 23:03:31 +01:00
Michael Duong
9785181e80 remove hot_water_kwh feature, lower mean squared error 2024-07-12 22:46:32 +01:00
Michael Duong
99d28e8b61 initial model commit 2024-07-12 15:13:11 +01:00
Github-Bot
732ea48cd1 Update Registry 2024-07-05 12:12:30 +00:00
Github-Bot
e4ddad7abc Update Registry 2024-07-05 12:11:49 +00:00
KhalimCK
e78b9226b8
Merge pull request #124 from Hestia-Homes/lighting-dev-model
test lighting model
2024-07-05 13:11:14 +01:00
Michael Duong
d164bff8d2 test lighting model 2024-07-04 13:47:33 +01:00
25 changed files with 672 additions and 158 deletions

View file

@ -2,7 +2,7 @@ name: Sap Change Model Deploy
on: on:
push: push:
branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod] branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod, heatingkwh-dev, heatingkwh-prod]
jobs: jobs:
deploy: deploy:

View file

@ -13,6 +13,7 @@ on:
- "sap-dev" - "sap-dev"
- "heat-dev" - "heat-dev"
- "carbon-dev" - "carbon-dev"
- "heatingkwh-dev"
permissions: write-all permissions: write-all

View file

@ -5,7 +5,7 @@ on:
# branches: # branches:
# - "model-**" # - "model-**"
pull_request: pull_request:
branches: ["sap-dev", "heat-dev", "carbon-dev"] branches: ["sap-dev", "heat-dev", "carbon-dev", "heatingkwh-dev"]
label: label:
types: ["created", "edited"] types: ["created", "edited"]
@ -31,6 +31,80 @@ jobs:
# run: | # run: |
# echo "Please choose one of these tags: 'major', 'major', 'patch'" # echo "Please choose one of these tags: 'major', 'major', 'patch'"
# exit(1) # exit(1)
Verify-Lambda:
  # Smoke-tests the prediction Lambda image: builds it, runs it locally,
  # invokes it twice (warm-up path, then CI testing path) against a sample
  # row uploaded to S3, and finally removes the uploaded sample.
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - name: Install packages to retrieve artifacts
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        pip install --upgrade pip
        pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
    - name: Retrieve artifacts (dvc.lock)
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        cd modules/ml-pipeline/src/pipeline
        dvc pull -r experiments
    - name: Set timestamp
      id: set_timestamp
      run: |
        # Values appended to GITHUB_ENV only become visible in LATER steps,
        # so keep a local shell variable for the log line in this step.
        timestamp="$(date +%Y%m%d)"
        echo "timestamp=${timestamp}" >> "$GITHUB_ENV"
        echo "Generated timestamp: ${timestamp}"
    - name: Upload sample row dataset to S3
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        cd modules/ml-pipeline/src/pipeline/data/prepared_data/
        aws s3 cp sample_test.parquet s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet
    - name: Build Lambda docker Image
      run: |
        docker build . --file ./deployment/Dockerfile.prediction.lambda --tag lambda_test
    - name: Run lambda docker container
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        # Name the container explicitly: "lambda_test" was previously only the
        # image tag, so the later "docker stop lambda_test" never matched it.
        docker run -d -p 9000:8080 \
          --name lambda_test \
          -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \
          -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \
          -e RUNTIME_ENVIRONMENT=dev \
          -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev lambda_test
    - name: Test Lambda endpoint
      run: |
        # First invocation exercises the "warm" (warm-up) path of the handler.
        sleep 2
        curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
          -H "Content-Type: application/json" \
          -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"warm\\\": true}\"}"
    - name: Get Lambda logs
      run: |
        docker logs lambda_test
    - name: Test Lambda endpoint again
      run: |
        # Second invocation exercises the "testing" path of the handler.
        sleep 2
        curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
          -H "Content-Type: application/json" \
          -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"testing\\\": true}\"}"
    - name: Get Lambda logs
      run: |
        docker logs lambda_test
    - name: Stop Lambda container
      run: |
        docker stop lambda_test || echo "Container already stopped"
    - name: Remove uploaded sample row dataset from S3
      # Always clean up the uploaded sample, even when an earlier step failed.
      if: always()
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        aws s3 rm --recursive s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/
Verify-Model: Verify-Model:

View file

@ -16,17 +16,57 @@
"active": true "active": true
}, },
"heat": { "heat": {
"version": "v0.5.0", "version": "v0.6.0",
"stage": { "stage": {
"dev": "v0.5.0" "dev": "v0.6.0"
}, },
"registered": true, "registered": true,
"active": true "active": true
}, },
"carbon": { "carbon": {
"version": "v0.5.0", "version": "v0.6.0",
"stage": { "stage": {
"dev": "v0.5.0" "dev": "v0.6.0"
},
"registered": true,
"active": true
},
"hotwater": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"heating": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"lighting": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"hotwaterkwh": {
"version": "v1.2.0",
"stage": {
"dev": "v1.2.0"
},
"registered": true,
"active": true
},
"heatingkwh": {
"version": "v1.5.0",
"stage": {
"dev": "v1.5.0"
}, },
"registered": true, "registered": true,
"active": true "active": true

View file

@ -83,3 +83,13 @@ curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d
``` ```
This will send a POST request to the running Lambda function and pass in the required data as JSON. This will send a POST request to the running Lambda function and pass in the required data as JSON.
To exercise the Lambda's `testing` (CI/CD) mode or its `warm` (warm-up) mode, use:
```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"testing\": \"true\"}"}'
```
or
```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"warm\": \"true\"}"}'
```

View file

@ -1,19 +1,24 @@
FROM public.ecr.aws/lambda/python:3.10 FROM public.ecr.aws/lambda/python:3.12
# Set the working directory # Set the working directory
WORKDIR ${LAMBDA_TASK_ROOT} WORKDIR ${LAMBDA_TASK_ROOT}
ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}" ENV PYTHONPATH="${PYTHONPATH}:${LAMBDA_TASK_ROOT}"
ENV MPLCONFIGDIR="/tmp/matplotlib"
# Environment variables # Environment variables
ARG RUNTIME_ENVIRONMENT ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally # Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel gcc-c++ RUN dnf install -y gcc python3-devel gcc-c++
# Install python packages # Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r ./requirements.txt
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install --no-cache-dir -r ./requirements.txt
# Copy the project code # Copy the project code
COPY modules/ml-pipeline/src/pipeline ./pipeline COPY modules/ml-pipeline/src/pipeline ./pipeline
@ -22,4 +27,4 @@ COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py
WORKDIR ${LAMBDA_TASK_ROOT}/pipeline WORKDIR ${LAMBDA_TASK_ROOT}/pipeline
CMD [ "prediction_app.handler" ] CMD [ "prediction_app.handler" ]

View file

@ -47,6 +47,30 @@ def upload_dataframe_to_s3(df, bucket, s3_file_name):
return False return False
def warming_up_invocation(
    model,
    model_filepath: str,
):
    """
    Serve a warm-up invocation.

    Loads the model from ``model_filepath`` and then runs a single prediction
    on a one-row, all-zeros DataFrame whose columns are
    ``model.model.original_features``. Nothing is returned.
    """
    import numpy as np
    import pandas as pd

    model.load_model(model_filepath)

    # The feature list is read only after the model has been loaded.
    feature_names = model.model.original_features
    zero_row = np.zeros((1, len(feature_names)))
    warmup_df = pd.DataFrame(zero_row, columns=feature_names)

    # A previous variant predicted with the "NeuralNetFastAI" sub-model when
    # it was listed in model.model.model_names(); that branch is disabled.
    model.predict(data=warmup_df)
def handler(event, context): def handler(event, context):
""" """
Take in event and trigger the prediction pipeline Take in event and trigger the prediction pipeline
@ -66,9 +90,6 @@ def handler(event, context):
created_at = body["created_at"] created_at = body["created_at"]
# TODO: Implement the loading of the model and prediction # TODO: Implement the loading of the model and prediction
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
build_model_params = settings.build_model build_model_params = settings.build_model
@ -78,6 +99,32 @@ def handler(event, context):
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]
if "warm" in body:
logger.info("Warm up invocation - synthetic prediction")
warming_up_invocation(model=model, model_filepath=model_filepath)
return {
"statusCode": 200,
"body": json.dumps(
{
"message": "Successfully warmed up invocation",
}
),
}
if "testing" in body:
logger.info(
"Testing invocation for CI/CD - save file to same location in S3"
)
storage_filepath = body["file_location"].replace(
".parquet", "_output.parquet"
)
else:
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate Input DataClient ---") logger.info(f"--- Initiate Input DataClient ---")
input_dataclient = dataclient_factory( input_dataclient = dataclient_factory(
dataclient_type="aws-s3", dataclient_type="aws-s3",
@ -95,7 +142,7 @@ def handler(event, context):
output_dataclient=output_dataclient, output_dataclient=output_dataclient,
model=model, model=model,
target=feature_process_params["feature_processor_config"]["target"], target=feature_process_params["feature_processor_config"]["target"],
model_filepath=build_model_params["model_save_filepath"], model_filepath=model_filepath,
test_data_filepath=body["file_location"], test_data_filepath=body["file_location"],
predictions_output_filepath=storage_filepath, predictions_output_filepath=storage_filepath,
predictions_column_name=generate_predictions_params[ predictions_column_name=generate_predictions_params[

View file

@ -51,3 +51,4 @@ functions:
path: /predict path: /predict
method: POST method: POST
timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed
memorySize: 3008

View file

@ -1,7 +1,8 @@
export PYENV_ROOT=$(HOME)/.pyenv export PYENV_ROOT=$(HOME)/.pyenv
export PATH := $(PYENV_ROOT)/bin:$(PATH) export PATH := $(PYENV_ROOT)/bin:$(PATH)
PYTHON_VERSION ?= 3.10.12 PYTHON_VERSION ?= 3.12.12
CONDA_ENV=dev_env_pipeline CONDA_ENV=dev_env_pipeline
CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate
.PHONY: init .PHONY: init
init: dev-conda init: dev-conda
@ -12,11 +13,15 @@ dev-conda:
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
conda init bash conda init bash
conda run -v -n ${CONDA_ENV} pip install --upgrade pip ${CONDA_ACTIVATE} ${CONDA_ENV} && \
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt which pip && \
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt pip install --upgrade pip && \
conda run -v -n ${CONDA_ENV} pre-commit install pip install uv && \
conda run -v -n ${CONDA_ENV} pip install ipykernel uv pip install -r src/pipeline/requirements/training/requirements-dev.txt && \
uv pip install -r src/pipeline/requirements/version_control/requirements.txt && \
pre-commit install && \
uv pip install ipykernel && \
conda install llvm-openmp -y
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
echo "conda activate ${CONDA_ENV}" echo "conda activate ${CONDA_ENV}"
@ -33,4 +38,4 @@ dev-pyenv:
.PHONY: dvc-init .PHONY: dvc-init
dvc-init: dvc-init:
. .dev_env_pipeline/bin/activate && dvc init --subdir . .dev_env_pipeline/bin/activate && dvc init --subdir

View file

@ -17,14 +17,15 @@ Within `src` folder, the structure is as follows:
# How to develop using this pipeline: # How to develop using this pipeline:
Run `make init`, which will: First, download miniconda to use conda to manage Python Environments
- Download pyenv (Python version management) Run `conda init`, to initialise your terminal
- Download Python 3.X.X as defined in the `make` file - current 3.10.12
- Create a virtual environment with this version of python Change to this directory and run `make init`, which will:
- Create a conda virtual environment with the Python version defined in the `make` file - currently 3.12.12
- Install packages in the training and version control directories in the pipeline folder (dev version if applicable) - Install packages in the training and version control directories in the pipeline folder (dev version if applicable)
- Install pre-commit to enable pre-commit hooks - Install pre-commit to enable pre-commit hooks
To use the environment, run `source .dev_env_pipeline/bin/activate`. To use the environment, run `conda activate dev_env_pipeline`
To enable the virtual environment created in vscode: To enable the virtual environment created in vscode:
- Open settings - Open settings

View file

@ -1,16 +1,21 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.10.12-slim FROM python:3.12.12-slim
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
COPY pipeline/requirements/predictions/requirements.txt requirements.txt COPY pipeline/requirements/predictions/requirements.txt requirements.txt
RUN pip install --upgrade pip RUN pip install --upgrade pip
RUN pip install -r requirements.txt
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install -r requirements.txt
# Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script # Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script
COPY pipeline/ /home/pipeline/ COPY pipeline/ /home/pipeline/
WORKDIR /home/pipeline/ WORKDIR /home/pipeline/
CMD [ "python", "3_generate_predictions.py"] CMD [ "python", "3_generate_predictions.py"]

View file

@ -29,6 +29,7 @@ data_filepath = prepare_data_params["data_filepath"]
train_proportion = prepare_data_params["train_proportion"] train_proportion = prepare_data_params["train_proportion"]
output_train_filepath = prepare_data_params["output_train_filepath"] output_train_filepath = prepare_data_params["output_train_filepath"]
output_test_filepath = prepare_data_params["output_test_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"]
sample_test_filepath = prepare_data_params["sample_test_filepath"]
feature_processor_config = feature_process_params["feature_processor_config"] feature_processor_config = feature_process_params["feature_processor_config"]
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
@ -99,6 +100,10 @@ def prepare_data(
logger.info("--- Outputting data ---") logger.info("--- Outputting data ---")
output_dataclient.save_data(
obj=data.sample(1), location=sample_test_filepath, save_config=None
)
output_dataclient.save_data( output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None obj=train, location=output_train_filepath, save_config=None
) )

View file

@ -99,6 +99,12 @@ def generate_scenario_predictions(
] ]
) )
# TEMPORARY FIX: ADD is_post_sap10_starting and is_post_sap10_ending if not present
if "is_post_sap10_starting" not in scenario_data.columns:
scenario_data["is_post_sap10_starting"] = False
if "is_post_sap10_ending" not in scenario_data.columns:
scenario_data["is_post_sap10_ending"] = False
logger.info("--- Loading Model ---") logger.info("--- Loading Model ---")
model.load_model(model_filepath) model.load_model(model_filepath)

View file

@ -14,9 +14,23 @@ default:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error eval_metric: mean_squared_error #mean_absolute_error
time_limit: 1800 time_limit: 3600
presets: medium_quality presets: medium_quality
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT', 'FASTAI']
infer_limit: 0.05 infer_limit: 1
infer_limit_batch_size: 10000 infer_limit_batch_size: 10000
fit_strategy: "parallel"
ag_args_ensemble: {'num_folds_parallel': 2} ag_args_ensemble: {'num_folds_parallel': 2}
num_gpus: 0
hyperparameters:
{
'NN_TORCH': [{}],
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0,}}],
# 'GBM': [{}],
'CAT': [{}],
'XGB': [{}, {'max_depth': 10, 'ag_args': {'name_suffix': 'Deep'}}],
'FASTAI': [{}],
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}

View file

@ -5,6 +5,18 @@ During the feature processor step, we can apply additional business logic and fe
""" """
Business Logic dict + functions Business Logic dict + functions
""" """
import pandas as pd
import numpy as np
import boto3
import msgpack
# S3 resource handle, created when this module is imported.
s3 = boto3.resource('s3')
# Get the MessagePack data from S3
# NOTE(review): this download and decode run at import time, so importing the
# module requires S3 access to this bucket/key. The object key ends in ".bson"
# but the payload is decoded with msgpack below.
obj = s3.Object("retrofit-data-dev", "cleaned_epc_data/cleaned.bson")
cleaned = obj.get()['Body'].read()
# Decode the raw bytes; `cleaned` is rebound from bytes to the unpacked object.
cleaned = msgpack.unpackb(cleaned, raw=False)
def remove_starting_columns(df): def remove_starting_columns(df):
@ -44,6 +56,111 @@ def keep_non_zero_rdsap(df):
df = df[df["rdsap_change"] != 0] df = df[df["rdsap_change"] != 0]
return df return df
def remove_heatingkwh_bottom_percentile(df, percentile=0.0001):
    """Keep only rows whose ``heating_kwh`` is strictly above the given quantile.

    ``percentile`` is passed straight to ``Series.quantile`` on the
    ``heating_kwh`` column; rows at or below that value are dropped.
    """
    heating = df["heating_kwh"]
    threshold = heating.quantile(percentile)
    return df[heating > threshold]
def add_features_from_code(df):
    """Performs feature engineering on the dataset.

    Steps, in order:
      1. Parse ``lodgement-date`` and derive ``lodgement-year`` and
         ``lodgement-month`` from it.
      2. For ``walls-description``, ``roof-description`` and
         ``floor-description``: look each description up in the module-level
         ``cleaned`` mapping to obtain its ``thermal_transmittance`` and, when
         that value falls in one of the configured ranges, replace the
         description with the range label. Descriptions with no match are
         left unchanged.
      3. Cast the numerical feature columns with ``pd.to_numeric`` and the
         categorical feature columns to ``str``.

    The input frame is modified (columns are added/overwritten) and the
    resulting frame is returned.
    """
    FEATURES = {
        "heating_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
            "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
            "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type",
            "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
            "walls-energy-eff",
            "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey",
            "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
            "low-energy-lighting", "environment-impact-current", "energy-tariff",
            "county", "construction-age-band", "co2-emissions-current",
        ],
        "hot_water_kwh": [
            "lodgement-year", "lodgement-month",
            "current-energy-efficiency",
            "energy-consumption-current",
            "hot-water-cost-current",
            "total-floor-area", "number-heated-rooms",
            "hotwater-description", "hot-water-energy-eff", "main-fuel", "property-type", "built-form",
            "co2-emissions-current",
        ]
    }

    CATEGORICAL_COLUMNS = [
        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form",
        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
        "county",
        "windows-description", "windows-energy-eff", "flat-top-storey",
        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
    ]

    # Every listed feature that is not categorical is treated as numerical.
    NUMERICAL_COLUMNS = list({
        x for x in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"]
        if x not in CATEGORICAL_COLUMNS
    })

    df["lodgement-date"] = pd.to_datetime(df["lodgement-date"])
    df["lodgement-year"] = df["lodgement-date"].dt.year
    df["lodgement-month"] = df["lodgement-date"].dt.month

    # For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories
    # we group them
    ranges = {
        "lessthan 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "morethan 0.5": (0.5, 2.5),
    }

    # Generate the lookup table: one row per 0.01 step from 0.01 to 2.50,
    # mapping the value to the label of the (low, high] range containing it.
    thermal_transmittance_lookup_table = []
    for i in range(1, 251):
        value = i / 100
        for label, (low, high) in ranges.items():
            if low < value <= high:
                thermal_transmittance_lookup_table.append({"from": value, "to": label})
                break

    # Convert to a DataFrame so it can be merged; the key is stringified to
    # match the stringified, 2-d.p. transmittance values built below.
    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)

    # Apply the lookup table to the data
    for feature in ["walls-description", "roof-description", "floor-description"]:
        cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
        # Round to 2 decimal places and convert to string
        cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
        # NOTE(review): a left merge repeats rows of df if `original_description`
        # is not unique in the cleaned data - confirm it is unique upstream.
        df = df.merge(
            cleaned_df,
            how="left",
            left_on=feature,
            right_on="original_description",
        )
        # We now have the thermal transmittance in the data, which we can use to group with the lookup table
        df = df.merge(
            thermal_transmittance_lookup_table,
            how="left",
            left_on="thermal_transmittance",
            right_on="from",
        )
        # Where "to" is populated, replace feature with to
        df[feature] = np.where(
            ~pd.isnull(df["to"]),
            df["to"],
            df[feature]
        )
        # Drop the helper columns so the next feature's merges start clean.
        df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"])

    # Convert data types
    df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric)
    df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str)

    return df
# def keep_ending_columns(df): # def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
@ -53,7 +170,42 @@ def keep_non_zero_rdsap(df):
# df = df[keep_columns] # df = df[keep_columns]
# return df # return df
def enforce_minimum_habitable_room_size(df):
    """Keep rows with more than 6.5 floor-area units per habitable room.

    The ratio is ``total-floor-area`` divided by ``number-habitable-rooms``
    (cast to float). The original note says a minimum of 6.5m per habitable
    room is needed - presumably square metres; confirm the unit. The returned
    frame has a fresh 0..n-1 index.
    """
    habitable_rooms = df["number-habitable-rooms"].astype(float)
    area_per_room = df["total-floor-area"] / habitable_rooms
    large_enough = area_per_room > 6.5
    return df[large_enough].reset_index(drop=True)
def round_to_100s(df):
    """Round the ``heating_kwh`` column to the nearest multiple of 100.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    in_hundreds = df['heating_kwh'] / 100
    df['heating_kwh'] = in_hundreds.round() * 100
    return df
def remove_high_ratio_of_area_to_rooms(df):
    """Drop rows whose floor-area-per-heated-room ratio is extreme.

    A temporary ``area-to-heated-rooms`` column (``total-floor-area`` divided
    by ``number-heated-rooms`` cast to float) is written onto the input frame.
    Rows where it is NaN are removed, positive-infinite ratios are set to 0,
    and only rows strictly below the 0.9995 quantile of the ratio are kept.
    The temporary column is absent from the returned frame, which has a fresh
    0..n-1 index.
    """
    ratio_col = 'area-to-heated-rooms'

    heated_rooms = df['number-heated-rooms'].astype(float)
    df[ratio_col] = df['total-floor-area'] / heated_rooms

    # Remove rows where the ratio could not be computed
    has_ratio = df[ratio_col].notna()
    df = df[has_ratio].reset_index(drop=True)

    # Infinite ratios become 0
    df[ratio_col] = df[ratio_col].replace([np.inf], 0)

    # Remove top 0.05% of ratios
    cutoff = df[ratio_col].quantile(0.9995)
    below_cutoff = df[ratio_col] < cutoff
    df = df[below_cutoff].reset_index(drop=True)

    return df.drop(columns=[ratio_col])
def add_estimate_annual_kwh(df):
    """Add ``estimate_annual_kwh`` = ``energy-consumption-current`` * ``total-floor-area``.

    The new column is written onto the frame that was passed in, and that
    same frame is returned.
    """
    energy_consumption = df['energy-consumption-current']
    floor_area = df['total-floor-area']
    df['estimate_annual_kwh'] = energy_consumption * floor_area
    return df
business_logic = { business_logic = {
"add_features_from_code": add_features_from_code,
"remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile,
# "round_to_100s": round_to_100s,
"enforce_minimum_habitable_room_size": enforce_minimum_habitable_room_size,
"remove_high_ratio_of_area_to_rooms": remove_high_ratio_of_area_to_rooms,
"add_estimate_annual_kwh": add_estimate_annual_kwh,
# "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats, # "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,

View file

@ -30,6 +30,6 @@ def clip_predictions_to_minimum_value(
post_prediction_logic = { post_prediction_logic = {
"clip_predictions_to_minimum_value": clip_predictions_to_minimum_value, # "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value,
# "round_predictions": round_predictions # "round_predictions": round_predictions
} }

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -21,47 +21,75 @@ default:
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet
data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet
train_proportion: 0.9 train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
sample_test_filepath: ./data/prepared_data/sample_test.parquet
feature_processor: feature_processor:
feature_processor_type: dataframe feature_processor_type: dataframe
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: sap_ending target: heating_kwh
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] drop_columns: ["hot_water_kwh"]
drop_columns: [ retain_features: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending", 'uprn',
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', # 'heating-cost-current',
'number_habitable_rooms', 'number_heated_rooms'] 'co2-emissions-current',
retain_features: null # 'hot-water-cost-current',
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', 'total-floor-area',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', 'secondheat-description',
# 'walls_energy_eff_ending', 'secondheat_description_ending', 'floor-description',
# 'property_type', 'mainheatc_energy_eff_ending', 'built_form', 'mainheat-energy-eff',
# 'walls_insulation_thickness_ending', 'potential_energy_efficiency', 'current-energy-efficiency',
# 'transaction_type_ending', 'walls-energy-eff',
# 'floor_thermal_transmittance_ending', 'roof-energy-eff',
# 'low_energy_lighting_ending', 'heat_demand_starting', 'property-type',
# 'photo_supply_ending', 'carbon_starting', 'mainheat-description',
# 'walls_thermal_transmittance_ending', 'mechanical-ventilation',
# 'roof_insulation_thickness_ending', 'floor-level',
# 'total_floor_area_ending', 'number_open_fireplaces_ending', 'built-form',
# 'windows_energy_eff_ending', 'walls-description',
# 'floor_height_ending', 'mainheatcont-description',
# 'extension_count_ending', 'roof-description',
# 'has_air_source_heat_pump_ending', 'energy-consumption-current',
# 'charging_system_ending', 'construction_age_band', 'glazed_type_ending', 'construction-age-band',
# 'roof_thermal_transmittance_ending', 'hotwater-description',
# 'floor_insulation_thickness_ending', 'has_mains_gas_ending', 'main-fuel',
# 'estimated_perimeter_starting', 'energy_consumption_potential', 'hot-water-energy-eff',
# 'environment_impact_potential', 'heater_type_ending', 'co2-emiss-curr-per-floor-area',
# 'multi_glaze_proportion_ending', 'windows-energy-eff',
# 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count'] 'current-energy-rating',
'lodgement-year',
'extension-count',
'number-open-fireplaces',
'number-heated-rooms',
'windows-description',
# 'photo-supply',
'heat-loss-corridor',
'flat-top-storey',
'unheated-corridor-length',
'fixed-lighting-outlets-count',
'tenure',
'multi-glaze-proportion',
'solar-water-heating-flag',
'energy-tariff',
'floor-height',
'constituency',
'transaction-type',
'floor-energy-eff',
'lodgement-month',
# 'lighting-cost-current',
'glazed-area',
# 'main-heating-controls',
'estimate_annual_kwh',
]
generate_predictions: generate_predictions:
input_dataclient_type: local input_dataclient_type: local

View file

@ -1,4 +1,4 @@
"""" """ "
Implementations of MLModels, all of which will have four methods to: Implementations of MLModels, all of which will have four methods to:
- Load model - Load model
- Save Model - Save Model
@ -11,9 +11,6 @@ import joblib
import pandas as pd import pandas as pd
from pathlib import Path from pathlib import Path
from typing import Union, List from typing import Union, List
from sklearn import linear_model
from sklearn.svm import SVR
from autogluon.tabular import TabularDataset, TabularPredictor
from core.interface.InterfaceModels import MLModel from core.interface.InterfaceModels import MLModel
from core.Logger import logger from core.Logger import logger
@ -69,6 +66,8 @@ class SKLearnLinearRegression:
""" """
Method to train a model Method to train a model
""" """
from sklearn import linear_model
self.model = linear_model.LinearRegression() self.model = linear_model.LinearRegression()
x_train = data.iloc[:, data.columns != target] x_train = data.iloc[:, data.columns != target]
@ -117,6 +116,7 @@ class SKLearnSVMRegression:
""" """
Method to train a model Method to train a model
""" """
from sklearn.svm import SVR
validate_dict_keys( validate_dict_keys(
list(model_hyperparameters.keys()), list(model_hyperparameters.keys()),
@ -152,12 +152,17 @@ class AutogluonAutoML:
"infer_limit", "infer_limit",
"infer_limit_batch_size", "infer_limit_batch_size",
"ag_args_ensemble", "ag_args_ensemble",
"fit_strategy",
"num_gpus",
"hyperparameters",
] ]
def load_model(self, path: Union[Path, str]) -> None: def load_model(self, path: Union[Path, str]) -> None:
""" """
Method to load a model Method to load a model
""" """
from autogluon.tabular import TabularPredictor
filepath = str(path) filepath = str(path)
self.model = TabularPredictor.load(path=filepath) self.model = TabularPredictor.load(path=filepath)
@ -183,6 +188,10 @@ class AutogluonAutoML:
""" """
Method to train a model Method to train a model
""" """
from autogluon.tabular import TabularDataset, TabularPredictor
# Force Parallel Model fitting
os.environ["AG_FORCE_PARALLEL"] = "True"
validate_dict_keys( validate_dict_keys(
keys_1=list(model_hyperparameters.keys()), keys_1=list(model_hyperparameters.keys()),
@ -209,6 +218,9 @@ class AutogluonAutoML:
infer_limit=model_hyperparameters["infer_limit"], infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
fit_strategy=model_hyperparameters["fit_strategy"],
num_gpus=model_hyperparameters["num_gpus"],
hyperparameters=model_hyperparameters["hyperparameters"].to_dict(),
) )
def predict( def predict(

View file

@ -16,42 +16,77 @@ stages:
deps: deps:
- path: 1_prepare_data.py - path: 1_prepare_data.py
hash: md5 hash: md5
md5: 11a3b8bfdfe199ab7ecc39ccc5652649 md5: a5ce162e1c402c0f811a80ef78cf4dd5
size: 4298 size: 4481
params: params:
configs/settings.yaml: configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns: default.feature_processor.feature_processor_config.drop_columns:
- heat_demand_change - hot_water_kwh
- carbon_change
- rdsap_change
- heat_demand_ending
- carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
- uprn
- co2-emissions-current
- total-floor-area
- secondheat-description
- floor-description
- mainheat-energy-eff
- current-energy-efficiency
- walls-energy-eff
- roof-energy-eff
- property-type
- mainheat-description
- mechanical-ventilation
- floor-level
- built-form
- walls-description
- mainheatcont-description
- roof-description
- energy-consumption-current
- construction-age-band
- hotwater-description
- main-fuel
- hot-water-energy-eff
- co2-emiss-curr-per-floor-area
- windows-energy-eff
- current-energy-rating
- lodgement-year
- extension-count
- number-open-fireplaces
- number-heated-rooms
- windows-description
- heat-loss-corridor
- flat-top-storey
- unheated-corridor-length
- fixed-lighting-outlets-count
- tenure
- multi-glaze-proportion
- solar-water-heating-flag
- energy-tariff
- floor-height
- constituency
- transaction-type
- floor-energy-eff
- lodgement-month
- glazed-area
- estimate_annual_kwh
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_config.target: heating_kwh
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath:
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath:
./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 0.9 default.prepare_data.train_proportion: 0.9
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: d74d92498c1641cffe971f6b0634ccb0.dir
size: 45056059 size: 9623332
nfiles: 2 nfiles: 3
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
deps: deps:
@ -61,9 +96,9 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: d74d92498c1641cffe971f6b0634ccb0.dir
size: 45056059 size: 9623332
nfiles: 2 nfiles: 3
params: params:
configs/build_model.yaml: configs/build_model.yaml:
default: default:
@ -79,7 +114,7 @@ stages:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error eval_metric: mean_squared_error
time_limit: 1800 time_limit: 3600
presets: medium_quality presets: medium_quality
excluded_model_types: excluded_model_types:
- RF - RF
@ -87,25 +122,97 @@ stages:
- NN_TORCH - NN_TORCH
- KNN - KNN
- XT - XT
infer_limit: 0.05 - FASTAI
infer_limit: 1
infer_limit_batch_size: 10000 infer_limit_batch_size: 10000
fit_strategy: parallel
ag_args_ensemble: ag_args_ensemble:
num_folds_parallel: 2 num_folds_parallel: 2
num_gpus: 0
hyperparameters:
NN_TORCH:
- {}
GBM:
- extra_trees: true
ag_args:
name_suffix: XT
- {}
- learning_rate: 0.03
num_leaves: 128
feature_fraction: 0.9
min_data_in_leaf: 3
ag_args:
name_suffix: Large
priority: 0
CAT:
- {}
XGB:
- {}
- max_depth: 10
ag_args:
name_suffix: Deep
FASTAI:
- {}
RF:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
XT:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
KNN:
- weights: uniform
ag_args:
name_suffix: Unif
- weights: distance
ag_args:
name_suffix: Dist
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: d9c9afc05e8780db47c0548b19bf7d19.dir md5: c9c8140e5a9fe111e5670810a36cd2ef.dir
size: 3349989 size: 1545780
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir md5: d9f63a57f146409734cd8f84f707b3d9.dir
size: 773523079 size: 233231379
nfiles: 36 nfiles: 34
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a md5: a3d0eefbd5bd873fa0cd42390ac9575a
size: 224 size: 214
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -115,26 +222,28 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir md5: d9f63a57f146409734cd8f84f707b3d9.dir
size: 773523079 size: 233231379
nfiles: 36 nfiles: 34
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: d74d92498c1641cffe971f6b0634ccb0.dir
size: 45056059 size: 9623332
nfiles: 2 nfiles: 3
params: params:
configs/settings.yaml: configs/settings.yaml:
default.generate_predictions.input_dataclient_type: local default.generate_predictions.input_dataclient_type: local
default.generate_predictions.output_dataclient_type: local default.generate_predictions.output_dataclient_type: local
default.generate_predictions.predictions_column_name: predictions default.generate_predictions.predictions_column_name: predictions
default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet default.generate_predictions.predictions_output_filepath:
default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet ./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath:
./data/prepared_data/test.parquet
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir md5: 95172b679bf045e30fde8b6326780e15.dir
size: 463197 size: 163474
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
@ -145,14 +254,14 @@ stages:
size: 3484 size: 3484
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir md5: 95172b679bf045e30fde8b6326780e15.dir
size: 463197 size: 163474
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: d74d92498c1641cffe971f6b0634ccb0.dir
size: 45056059 size: 9623332
nfiles: 2 nfiles: 3
params: params:
configs/settings.yaml: configs/settings.yaml:
default.generate_metrics.dataclient_type: local default.generate_metrics.dataclient_type: local
@ -161,30 +270,29 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 3e08df02fd5c5d094bcf936e1338d596 md5: c079b41b1a0033b666f27f99be4e12ef
size: 223 size: 212
generate_scenerio_metrics: generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py cmd: python 5_generate_scenarios.py
deps: deps:
- path: 5_generate_scenarios.py - path: 5_generate_scenarios.py
hash: md5 hash: md5
md5: 40506749fefd926d47c60ff5b16db307 md5: 872b0c762ce1c8933fcbc5f54d5d4b5d
size: 5337 size: 5658
params: params:
configs/scenarios.yaml: configs/scenarios.yaml:
default.scenarios: default.scenarios:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
scenario_data_filepaths: scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md
outs: outs:
- path: metrics/scenario_metrics.md - path: metrics/scenario_metrics.md
hash: md5 hash: md5
md5: fa4d6d7bbd7818613800da5f8f37ea96 md5: d41d8cd98f00b204e9800998ecf8427e
size: 363 size: 0
- path: metrics/scenario_table.md - path: metrics/scenario_table.md
hash: md5 hash: md5
md5: d6baf100a1623cc2467c2f8221d314c9 md5: d41d8cd98f00b204e9800998ecf8427e
size: 2133 size: 0

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.5.2
boto3==1.28.17 boto3==1.40.61
pandas==2.1.4 pandas==2.3.3
autogluon.tabular[all]==1.0.0 autogluon.tabular[all]==1.4.0
dynaconf==3.2.1 dynaconf==3.2.12
pyarrow==13.0.0 pyarrow==20.0.0
pre-commit==3.3.3 pre-commit==4.3.0

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.5.2
boto3==1.28.17 boto3==1.40.61
pandas==2.1.4 pandas==2.3.3
autogluon.tabular[all]==1.0.0 autogluon.tabular[all]==1.4.0
dynaconf==3.2.1 dynaconf==3.2.12
pyarrow==13.0.0 pyarrow==20.0.0
PyYAML==6.0.1 PyYAML==6.0.3

View file

@ -1,10 +1,10 @@
joblib==1.3.2 joblib==1.5.2
boto3==1.28.17 boto3==1.40.61
pandas==2.1.4 pandas==2.3.3
autogluon.tabular[all]==1.0.0 autogluon.tabular[all]==1.4.0
ray==2.6.3 ray==2.44.1
dynaconf==3.2.1 dynaconf==3.2.12
alibi==0.9.5 # alibi
shap==0.42.1 shap==0.49.1
pyarrow==13.0.0 pyarrow==20.0.0
pre-commit==3.3.3 pre-commit==4.3.0

View file

@ -1,4 +1,4 @@
boto3==1.28.41 boto3==1.40.61
pandas==2.1.4 pandas==2.3.3
autogluon.tabular[all]==1.0.0 autogluon.tabular[all]==1.4.0
dynaconf==3.2.1 dynaconf==3.2.12

View file

@ -1,4 +1,4 @@
dvc==3.51.0 dvc==3.51.0
dvc-s3==3.2.0 dvc-s3==3.2.0
gto==1.7.1 gto==1.9.0
pyOpenSSL==23.3.0 pyOpenSSL==23.3.0