Compare commits

..

No commits in common. "carbon@v0.8.0" and "master" have entirely different histories.

24 changed files with 143 additions and 470 deletions

View file

@ -31,80 +31,6 @@ jobs:
# run: |
# echo "Please choose one of these tags: 'major', 'minor', 'patch'"
# exit(1)
Verify-Lambda:
  # Builds the prediction Lambda image, runs it locally and invokes it twice
  # (a "warm" call, then a "testing" call) against a sample row uploaded to S3.
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - name: Install packages to retrieve artifacts
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        pip install --upgrade pip
        pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
    - name: Retrieve artifacts (dvc.lock)
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        cd modules/ml-pipeline/src/pipeline
        dvc pull -r experiments
    - name: Set timestamp
      id: set_timestamp
      run: |
        # Values appended to GITHUB_ENV are only visible to *later* steps, so
        # keep a local shell copy for the log line in this step.
        timestamp="$(date +%Y%m%d)"
        echo "timestamp=${timestamp}" >> "$GITHUB_ENV"
        echo "Generated timestamp: ${timestamp}"
    - name: Upload sample row dataset to S3
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        cd modules/ml-pipeline/src/pipeline/data/prepared_data/
        aws s3 cp sample_test.parquet s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet
    - name: Build Lambda docker Image
      run: |
        docker build . --file ./deployment/Dockerfile.prediction.lambda --tag lambda_test
    - name: Run lambda docker container
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        # Name the container explicitly: "lambda_test" was previously only the
        # image tag, so the later `docker stop lambda_test` never matched a
        # running container.
        docker run -d --name lambda_test -p 9000:8080 \
          -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \
          -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \
          -e RUNTIME_ENVIRONMENT=dev \
          -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev lambda_test
    - name: Test Lambda endpoint
      run: |
        sleep 2
        curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
          -H "Content-Type: application/json" \
          -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"warm\\\": true}\"}"
    - name: Get Lambda logs
      run: |
        docker logs lambda_test
    - name: Test Lambda endpoint again
      run: |
        sleep 2
        curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
          -H "Content-Type: application/json" \
          -d "{\"body\": \"{\\\"file_location\\\": \\\"s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/sample_test.parquet\\\", \\\"property_id\\\": 1, \\\"portfolio_id\\\": 4, \\\"created_at\\\": \\\"now\\\", \\\"testing\\\": true}\"}"
    - name: Get Lambda logs
      run: |
        docker logs lambda_test
    - name: Stop Lambda container
      run: |
        docker stop lambda_test || echo "Container already stopped"
    - name: Remove uploaded sample row dataset from S3
      # Runs even when an earlier step failed so the sample upload is not left behind.
      if: always()
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
      run: |
        aws s3 rm --recursive s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}/
Verify-Model:

View file

@ -8,65 +8,25 @@
"active": true
},
"sap": {
"version": "v0.15.0",
"version": "v0.14.0",
"stage": {
"dev": "v0.15.0"
"dev": "v0.14.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.7.0",
"version": "v0.5.0",
"stage": {
"dev": "v0.6.0"
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.7.0",
"version": "v0.5.0",
"stage": {
"dev": "v0.7.0"
},
"registered": true,
"active": true
},
"hotwater": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"heating": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"lighting": {
"version": "v1.0.0",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"hotwaterkwh": {
"version": "v1.3.0",
"stage": {
"dev": "v1.3.0"
},
"registered": true,
"active": true
},
"heatingkwh": {
"version": "v1.5.0",
"stage": {
"dev": "v1.5.0"
"dev": "v0.5.0"
},
"registered": true,
"active": true

View file

@ -83,13 +83,3 @@ curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d
```
This will send a POST request to the running Lambda function and pass in the required data as JSON.
To send a testing invocation or a warm-up invocation to the Lambda, use:
```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"testing\": \"true\"}"}'
```
or
```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\", \"warm\": \"true\"}"}'
```

View file

@ -1,24 +1,19 @@
FROM public.ecr.aws/lambda/python:3.12
FROM public.ecr.aws/lambda/python:3.10
# Set the working directory
WORKDIR ${LAMBDA_TASK_ROOT}
ENV PYTHONPATH="${PYTHONPATH}:${LAMBDA_TASK_ROOT}"
ENV MPLCONFIGDIR="/tmp/matplotlib"
ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}"
# Environment variables
ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally
RUN dnf install -y gcc python3-devel gcc-c++
RUN yum install -y gcc python3-devel gcc-c++
# Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install --no-cache-dir -r ./requirements.txt
RUN pip install --no-cache-dir -r ./requirements.txt
# Copy the project code
COPY modules/ml-pipeline/src/pipeline ./pipeline

View file

@ -47,30 +47,6 @@ def upload_dataframe_to_s3(df, bucket, s3_file_name):
return False
def warming_up_invocation(model, model_filepath: str):
    """
    Handle a warm-up invocation: load the model and run one synthetic prediction.

    Args:
        model: object exposing ``load_model(path)``, ``predict(data=...)`` and a
            ``model.original_features`` sequence of column names.
        model_filepath: location handed straight to ``model.load_model``.

    Returns:
        None. The prediction result is discarded.
    """
    # Imported inside the function, as in the original implementation.
    import numpy as np
    import pandas as pd

    model.load_model(model_filepath)

    # One all-zero row with a column for every feature the loaded model lists.
    feature_names = model.model.original_features
    zero_row = np.zeros((1, len(feature_names)))
    synthetic_frame = pd.DataFrame(zero_row, columns=feature_names)

    # An earlier variant predicted with the "NeuralNetFastAI" sub-model when it
    # was present; the model's default predict path is used instead.
    model.predict(data=synthetic_frame)
def handler(event, context):
"""
Take in event and trigger the prediction pipeline
@ -90,6 +66,9 @@ def handler(event, context):
created_at = body["created_at"]
# TODO: Implement the loading of the model and prediction
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate MLModel ---")
build_model_params = settings.build_model
@ -99,32 +78,6 @@ def handler(event, context):
model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]
if "warm" in body:
logger.info("Warm up invocation - synthetic prediction")
warming_up_invocation(model=model, model_filepath=model_filepath)
return {
"statusCode": 200,
"body": json.dumps(
{
"message": "Successfully warmed up invocation",
}
),
}
if "testing" in body:
logger.info(
"Testing invocation for CI/CD - save file to same location in S3"
)
storage_filepath = body["file_location"].replace(
".parquet", "_output.parquet"
)
else:
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info(f"--- Initiate Input DataClient ---")
input_dataclient = dataclient_factory(
dataclient_type="aws-s3",
@ -142,7 +95,7 @@ def handler(event, context):
output_dataclient=output_dataclient,
model=model,
target=feature_process_params["feature_processor_config"]["target"],
model_filepath=model_filepath,
model_filepath=build_model_params["model_save_filepath"],
test_data_filepath=body["file_location"],
predictions_output_filepath=storage_filepath,
predictions_column_name=generate_predictions_params[

View file

@ -51,4 +51,3 @@ functions:
path: /predict
method: POST
timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed
memorySize: 3008

View file

@ -1,8 +1,7 @@
export PYENV_ROOT=$(HOME)/.pyenv
export PATH := $(PYENV_ROOT)/bin:$(PATH)
PYTHON_VERSION ?= 3.12.12
PYTHON_VERSION ?= 3.10.12
CONDA_ENV=dev_env_pipeline
CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda activate
.PHONY: init
init: dev-conda
@ -13,15 +12,11 @@ dev-conda:
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
conda init bash
${CONDA_ACTIVATE} ${CONDA_ENV} && \
which pip && \
pip install --upgrade pip && \
pip install uv && \
uv pip install -r src/pipeline/requirements/training/requirements-dev.txt && \
uv pip install -r src/pipeline/requirements/version_control/requirements.txt && \
pre-commit install && \
uv pip install ipykernel && \
conda install llvm-openmp -y
conda run -v -n ${CONDA_ENV} pip install --upgrade pip
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
conda run -v -n ${CONDA_ENV} pre-commit install
conda run -v -n ${CONDA_ENV} pip install ipykernel
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
echo "conda activate ${CONDA_ENV}"
@ -38,4 +33,4 @@ dev-pyenv:
.PHONY: dvc-init
dvc-init:
. .dev_env_pipeline/bin/activate && dvc init --subdir
. .dev_env_pipeline/bin/activate && dvc init --subdir

View file

@ -1,21 +1,16 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.12.12-slim
FROM python:3.10.12-slim
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
COPY pipeline/requirements/predictions/requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install -r requirements.txt
RUN pip install -r requirements.txt
# Assumes the CI/CD flow has already run a `dvc pull` step to fetch the data and model, so this image only needs to run a single script
COPY pipeline/ /home/pipeline/
WORKDIR /home/pipeline/
CMD [ "python", "3_generate_predictions.py"]
CMD [ "python", "3_generate_predictions.py"]

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline!
# The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output, that gets hashed via DVC

View file

@ -1,4 +1,3 @@
# Ignore dynaconf secret files
.secrets.*
example.py

View file

@ -29,7 +29,6 @@ data_filepath = prepare_data_params["data_filepath"]
train_proportion = prepare_data_params["train_proportion"]
output_train_filepath = prepare_data_params["output_train_filepath"]
output_test_filepath = prepare_data_params["output_test_filepath"]
sample_test_filepath = prepare_data_params["sample_test_filepath"]
feature_processor_config = feature_process_params["feature_processor_config"]
logger.info(f"--- Initiate DataClient ---")
@ -100,10 +99,6 @@ def prepare_data(
logger.info("--- Outputting data ---")
output_dataclient.save_data(
obj=data.sample(1), location=sample_test_filepath, save_config=None
)
output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None
)

View file

@ -99,12 +99,6 @@ def generate_scenario_predictions(
]
)
# TEMPORARY FIX: ADD is_post_sap10_starting and is_post_sap10_ending if not present
if "is_post_sap10_starting" not in scenario_data.columns:
scenario_data["is_post_sap10_starting"] = False
if "is_post_sap10_ending" not in scenario_data.columns:
scenario_data["is_post_sap10_ending"] = False
logger.info("--- Loading Model ---")
model.load_model(model_filepath)

View file

@ -14,23 +14,9 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 3600
time_limit: 1800
presets: medium_quality
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT', 'FASTAI']
infer_limit: 1
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 0.05
infer_limit_batch_size: 10000
fit_strategy: "parallel"
ag_args_ensemble: {'num_folds_parallel': 2}
num_gpus: 0
hyperparameters:
{
'NN_TORCH': [{}],
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0,}}],
# 'GBM': [{}],
'CAT': [{}],
'XGB': [{}],
'FASTAI': [{}],
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}

View file

@ -18,44 +18,30 @@ def remove_starting_columns(df):
return df
def keep_negative_heat_change(df):
df = df[df["heat_demand_change"] < 0]
def remove_floor_height_ending(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df
def keep_non_negative_carbon_ending(df):
df = df[df["carbon_ending"] > 0]
def remove_minimum_habitable_room_size(df):
# Need minimum of 6.5m per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df
def keep_negative_carbon_change(df):
df = df[df["carbon_change"] < 0]
def keep_flats(df):
df = df[df["property_type"] == "Flat"]
return df
# TODO: Move to ETL pipeline
def remove_unreasonable_habitable_rooms(df):
    """
    Keep only rows where floor area per habitable room is at least 6.5 (m2).

    The ratio is ``total_floor_area_ending / number_habitable_rooms``; rows
    below the 6.5 threshold are dropped. The original index is preserved.
    """
    area_per_room = df["total_floor_area_ending"] / df["number_habitable_rooms"]
    return df[area_per_room >= 6.5]
def remove_top_1_percent_heat_demand(df):
    """
    Drop rows whose ``heat_demand_starting`` is 860 or higher.

    The cutoff is hard-coded. It was presumably taken once from the 99th
    percentile of the starting heat demand (see the ``describe`` call that
    used to compute it) - confirm before reusing on a new dataset.
    """
    below_cutoff = df["heat_demand_starting"] < 860
    return df[below_cutoff]
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
def keep_non_zero_rdsap(df):
df = df[df["rdsap_change"] != 0]
return df
@ -68,12 +54,10 @@ def remove_top_1_percent_carbon(df):
# return df
business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
"keep_negative_heat_change": keep_negative_heat_change,
"keep_negative_carbon_change": keep_negative_carbon_change,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
"keep_non_negative_carbon_ending": keep_non_negative_carbon_ending,
# "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -1,24 +1,23 @@
"""
After predictions, we may want to apply some post processing to the predictions
"""
import pandas as pd
def clip_predictions_to_minimum_value(
data: pd.DataFrame,
predictions: pd.Series,
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
) -> pd.Series:
series_name = predictions.name
predictions.name = "predictions"
predictions = predictions.astype(data["carbon_starting"].dtype)
predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one-point improvement
replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"]
predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[
replace_index, "carbon_starting"
]
replace_index = (
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
)
predictions_new = predictions_df["predictions"]
predictions_new.name = series_name

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,29 +18,26 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
sample_test_filepath: ./data/prepared_data/sample_test.parquet
feature_processor:
feature_processor_type: dataframe
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: carbon_ending
target: sap_ending
identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending",
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms', 'lighting_cost_starting',
'lighting_cost_ending', 'heating_cost_starting', 'heating_cost_ending', 'hot_water_cost_starting', 'hot_water_cost_ending',
'floor_thermal_transmittance', 'floor_thermal_transmittance_ending', 'lodgement_date_starting', 'lodgement_date_ending',]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
'number_habitable_rooms', 'number_heated_rooms']
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -1,4 +1,4 @@
""" "
""""
Implementations of MLModels, all of which will have four methods to:
- Load model
- Save Model
@ -11,6 +11,9 @@ import joblib
import pandas as pd
from pathlib import Path
from typing import Union, List
from sklearn import linear_model
from sklearn.svm import SVR
from autogluon.tabular import TabularDataset, TabularPredictor
from core.interface.InterfaceModels import MLModel
from core.Logger import logger
@ -66,8 +69,6 @@ class SKLearnLinearRegression:
"""
Method to train a model
"""
from sklearn import linear_model
self.model = linear_model.LinearRegression()
x_train = data.iloc[:, data.columns != target]
@ -116,7 +117,6 @@ class SKLearnSVMRegression:
"""
Method to train a model
"""
from sklearn.svm import SVR
validate_dict_keys(
list(model_hyperparameters.keys()),
@ -152,17 +152,12 @@ class AutogluonAutoML:
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
"fit_strategy",
"num_gpus",
"hyperparameters",
]
def load_model(self, path: Union[Path, str]) -> None:
"""
Method to load a model
"""
from autogluon.tabular import TabularPredictor
filepath = str(path)
self.model = TabularPredictor.load(path=filepath)
@ -188,10 +183,6 @@ class AutogluonAutoML:
"""
Method to train a model
"""
from autogluon.tabular import TabularDataset, TabularPredictor
# Force Parallel Model fitting
os.environ["AG_FORCE_PARALLEL"] = "True"
validate_dict_keys(
keys_1=list(model_hyperparameters.keys()),
@ -218,9 +209,6 @@ class AutogluonAutoML:
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
fit_strategy=model_hyperparameters["fit_strategy"],
num_gpus=model_hyperparameters["num_gpus"],
hyperparameters=model_hyperparameters["hyperparameters"].to_dict(),
)
def predict(

View file

@ -16,8 +16,8 @@ stages:
deps:
- path: 1_prepare_data.py
hash: md5
md5: a5ce162e1c402c0f811a80ef78cf4dd5
size: 4481
md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4298
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
@ -25,7 +25,7 @@ stages:
- carbon_change
- rdsap_change
- heat_demand_ending
- sap_ending
- carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
@ -34,36 +34,24 @@ stages:
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
- lighting_cost_starting
- lighting_cost_ending
- heating_cost_starting
- heating_cost_ending
- hot_water_cost_starting
- hot_water_cost_ending
- floor_thermal_transmittance
- floor_thermal_transmittance_ending
- lodgement_date_starting
- lodgement_date_ending
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: carbon_ending
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath:
./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath:
./data/prepared_data/train.parquet
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 0.9
outs:
- path: data/prepared_data/
hash: md5
md5: 219cd47a478057c6473e390611c46ba6.dir
size: 37781342
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
@ -73,9 +61,9 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 219cd47a478057c6473e390611c46ba6.dir
size: 37781342
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/build_model.yaml:
default:
@ -91,7 +79,7 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 3600
time_limit: 1800
presets: medium_quality
excluded_model_types:
- RF
@ -99,94 +87,25 @@ stages:
- NN_TORCH
- KNN
- XT
- FASTAI
infer_limit: 1
infer_limit: 0.05
infer_limit_batch_size: 10000
fit_strategy: parallel
ag_args_ensemble:
num_folds_parallel: 2
num_gpus: 0
hyperparameters:
NN_TORCH:
- {}
GBM:
- extra_trees: true
ag_args:
name_suffix: XT
- {}
- learning_rate: 0.03
num_leaves: 128
feature_fraction: 0.9
min_data_in_leaf: 3
ag_args:
name_suffix: Large
priority: 0
CAT:
- {}
XGB:
- {}
FASTAI:
- {}
RF:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
XT:
- criterion: gini
ag_args:
name_suffix: Gini
problem_types:
- binary
- multiclass
- criterion: entropy
ag_args:
name_suffix: Entr
problem_types:
- binary
- multiclass
- criterion: squared_error
ag_args:
name_suffix: MSE
problem_types:
- regression
- quantile
KNN:
- weights: uniform
ag_args:
name_suffix: Unif
- weights: distance
ag_args:
name_suffix: Dist
outs:
- path: data/fit_predictions/
hash: md5
md5: 5411b43b1a372e77f90de28b60913ae6.dir
size: 3833319
md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3349989
nfiles: 1
- path: data/model/
hash: md5
md5: 354a0b7ea4268f1316c77257e57116fd.dir
size: 745138026
nfiles: 31
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
- path: metrics/fit_metrics.json
hash: md5
md5: 56bbb666b56aeca7da9436138c881948
size: 225
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 224
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -196,28 +115,26 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 354a0b7ea4268f1316c77257e57116fd.dir
size: 745138026
nfiles: 31
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
- path: data/prepared_data
hash: md5
md5: 219cd47a478057c6473e390611c46ba6.dir
size: 37781342
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/settings.yaml:
default.generate_predictions.input_dataclient_type: local
default.generate_predictions.output_dataclient_type: local
default.generate_predictions.predictions_column_name: predictions
default.generate_predictions.predictions_output_filepath:
./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath:
./data/prepared_data/test.parquet
default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
outs:
- path: data/predictions/
hash: md5
md5: 8ddd3dbe13df261dbbcb57d01f75c3ba.dir
size: 532652
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -228,14 +145,14 @@ stages:
size: 3484
- path: data/predictions
hash: md5
md5: 8ddd3dbe13df261dbbcb57d01f75c3ba.dir
size: 532652
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 219cd47a478057c6473e390611c46ba6.dir
size: 37781342
nfiles: 3
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
nfiles: 2
params:
configs/settings.yaml:
default.generate_metrics.dataclient_type: local
@ -244,29 +161,30 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 2c860657417421c757146f2dce76f444
size: 225
md5: 3e08df02fd5c5d094bcf936e1338d596
size: 223
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- path: 5_generate_scenarios.py
hash: md5
md5: 872b0c762ce1c8933fcbc5f54d5d4b5d
size: 5658
md5: 40506749fefd926d47c60ff5b16db307
size: 5337
params:
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e
size: 0
md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 363
- path: metrics/scenario_table.md
hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e
size: 0
md5: d6baf100a1623cc2467c2f8221d314c9
size: 2133

View file

@ -1,7 +1,7 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
pyarrow==20.0.0
pre-commit==4.3.0
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
pyarrow==20.0.0
PyYAML==6.0.3
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,10 +1,10 @@
joblib==1.5.2
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
ray==2.44.1
dynaconf==3.2.12
# alibi
shap==0.49.1
pyarrow==20.0.0
pre-commit==4.3.0
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
ray==2.6.3
dynaconf==3.2.1
alibi==0.9.5
shap==0.42.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.40.61
pandas==2.3.3
autogluon.tabular[all]==1.4.0
dynaconf==3.2.12
boto3==1.28.41
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1

View file

@ -1,4 +1,4 @@
dvc==3.51.0
dvc-s3==3.2.0
gto==1.9.0
gto==1.7.1
pyOpenSSL==23.3.0