Compare commits

..

No commits in common. "master" and "model@v12.10.12" have entirely different histories.

46 changed files with 253 additions and 595 deletions

View file

@ -1,9 +0,0 @@
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/pipeline/__pycache__
modules/ml-pipeline/src/pipeline/.dvc
modules/ml-pipeline/src/pipeline/analysis
modules/ml-pipeline/src/pipeline/metrics

View file

@ -2,7 +2,7 @@ name: Sap Change Model Deploy
on: on:
push: push:
branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod] branches: [ sap_change-dev, sap_change-prod ]
jobs: jobs:
deploy: deploy:
@ -19,8 +19,8 @@ jobs:
- name: Install Serverless and plugins - name: Install Serverless and plugins
run: | run: |
npm install -g serverless@^3.38.0 npm install -g serverless
npm install -g serverless-domain-manager@^7.3.8 npm install -g serverless-domain-manager
- name: Install DVC - name: Install DVC
run: | run: |
@ -54,12 +54,10 @@ jobs:
- name: Set stack_name - name: Set stack_name
id: set_stack_name id: set_stack_name
run: | run: |
# Take branch prefix and add "model" for stack name if [[ "${{ github.ref_name }}" == "sap_change-dev" || "${{ github.ref_name }}" == "sap_change-prod" ]]; then
stack_name=$( echo ${{ github.ref_name }} | awk -F"-" '{print $1}' | sed 's/$/model/g') echo "::set-output name=stack_name::sapmodel"
if [ -z "${stack_name}" ]; then
echo "::set-output name=stack_name::"
else else
echo "::set-output name=stack_name::${stack_name}" echo "::set-output name=stack_name::"
fi fi
- name: Set runtime_environment - name: Set runtime_environment

View file

@ -10,9 +10,10 @@ on:
types: types:
- closed - closed
branches: branches:
- "sap-dev" - "master"
- "heat-dev" - "sap_change-dev"
- "carbon-dev" - "heat_change-dev"
- "carbon_change-dev"
permissions: write-all permissions: write-all
@ -42,14 +43,7 @@ jobs:
if [ -z "${latest_version}" ]; then if [ -z "${latest_version}" ]; then
increment_version="1.0.0" increment_version="1.0.0"
else else
increment_version=$(echo ${latest_version} | awk 'BEGIN { increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}')
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
major = $1 + 1 # Increment the major version
print major, "0", "0" # Print the new version
}')
fi fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version} new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -87,14 +81,7 @@ jobs:
if [ -z "${latest_version}" ]; then if [ -z "${latest_version}" ]; then
increment_version="0.1.0" increment_version="0.1.0"
else else
increment_version=$(echo ${latest_version} | awk 'BEGIN { increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}')
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
minor = $2 + 1 # Increment the minor version
print $1, minor, "0" # Print the new version
}')
fi fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version} new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -132,14 +119,7 @@ jobs:
if [ -z "${latest_version}" ]; then if [ -z "${latest_version}" ]; then
increment_version="0.0.1" increment_version="0.0.1"
else else
increment_version=$(echo ${latest_version} | awk 'BEGIN { increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}')
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
patch = $3 + 1 # Increment the patch version
print $1, $2, patch # Print the new version
}')
fi fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version} new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -199,8 +179,6 @@ jobs:
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model - name: Register Model
env:
TARGET_BRANCH: ${{ github.base_ref }}
run: | run: |
REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
@ -209,7 +187,7 @@ jobs:
git config user.name "Github-Bot" git config user.name "Github-Bot"
git config user.email "Github-Bot@no-reply.com" git config user.email "Github-Bot@no-reply.com"
latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/')
if [ -z "${latest_dev_version}" ]; then if [ -z "${latest_dev_version}" ]; then
increment_version="1" increment_version="1"
else else
@ -217,7 +195,7 @@ jobs:
fi fi
new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version}
latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}')
echo ${new_tag} echo ${new_tag}
@ -228,11 +206,11 @@ jobs:
git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}" git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}"
git push origin ${new_tag} git push origin ${new_tag}
git checkout ${TARGET_BRANCH} git checkout master
git fetch --all git fetch --all
git pull git pull
gto show --json > MODEL_REGISTRY.md gto show --json > MODEL_REGISTRY.md
git add . git add .
git commit -m "Update Registry" git commit -m "Update Registry"
git push origin ${TARGET_BRANCH} git push origin master

View file

@ -5,7 +5,7 @@ on:
# branches: # branches:
# - "model-**" # - "model-**"
pull_request: pull_request:
branches: ["sap-dev", "heat-dev", "carbon-dev"] branches: [ "master", "sap_change-dev", "heat_change-dev", "carbon_change-dev"]
label: label:
types: ["created", "edited"] types: ["created", "edited"]
@ -89,24 +89,13 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ github.base_ref }}
run: | run: |
cd modules/ml-pipeline/src/pipeline cd modules/ml-pipeline/src/pipeline
echo "## Model metrics" > report.md echo "## Model metrics" > report.md
# Compare metrics to master # Compare metrics to master
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} git fetch --depth=1 origin master:master
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md dvc metrics diff --md --all master >> report.md
echo "## Scenario comparison" >> report.md
cat metrics/scenario_table.md >> report.md
echo "" >> report.md
echo "## Scenario metrics" >> report.md
cat metrics/scenario_metrics.md >> report.md
cml comment create report.md cml comment create report.md

View file

@ -1,32 +1,16 @@
{ {
"model": { "model": {
"version": "v12.10.12", "version": "v11.10.12",
"stage": { "stage": {
"dev": "v11.10.12" "dev": "v11.10.12"
}, },
"registered": true, "registered": true,
"active": true "active": true
}, },
"sap": { "migrate": {
"version": "v0.14.0", "version": null,
"stage": { "stage": {
"dev": "v0.14.0" "dev": "f320b9e0e9f3ea7735aed1abee07b1fb498c39c3"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.5.0",
"stage": {
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.5.0",
"stage": {
"dev": "v0.5.0"
}, },
"registered": true, "registered": true,
"active": true "active": true

View file

@ -10,9 +10,9 @@ tracking and a model registry
- A bolt-on service that can implement model monitoring - A bolt-on service that can implement model monitoring
There are multiple protected branches which adapt the generic pipeline to produce different models: There are multiple protected branches which adapt the generic pipeline to produce different models:
- sap-{dev/staging/prod}-** - sap_change-**
- heat-{dev/staging/prod}-** - heat_change-**
- carbon-{dev/staging/prod}-** - carbon_change-**
These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline
- There can be different additional logic for each branch but the pipeline will be the same. - There can be different additional logic for each branch but the pipeline will be the same.
@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser
secrets. Each different model and protected branch has its own set of secrets which allows for flexibility secrets. Each different model and protected branch has its own set of secrets which allows for flexibility
between different pipelines. between different pipelines.
For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are: For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are:
- {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the - {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the
sap change model this is the lambda-sap-prediction-dev repository. sap change model this is the lambda-sap-prediction-dev repository.
@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe
2. command to build the Docker image: 2. command to build the Docker image:
```bash ```bash
docker build -t sap -f deployment/Dockerfile.prediction.lambda . docker build -t sap_change -f deployment/Dockerfile.prediction.lambda .
``` ```
This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located
@ -68,7 +68,7 @@ in the deployment directory.
Once the image is built, you can run it using the following command: Once the image is built, you can run it using the following command:
```bash ```bash
docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change
``` ```
This command does the following: This command does the following:
@ -79,7 +79,6 @@ Sets the RUNTIME_ENVIRONMENT variable to dev.
To test the Lambda function, use the following curl command: To test the Lambda function, use the following curl command:
```json ```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"'
``` ```
This will send a POST request to the running Lambda function and pass in the required data as JSON. This will send a POST request to the running Lambda function and pass in the required data as JSON.

View file

@ -1,9 +0,0 @@
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/__pycache__
modules/ml-pipeline/src/.dvc
modules/ml-pipeline/src/analysis
modules/ml-pipeline/src/metrics

View file

@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally # Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel gcc-c++ RUN yum install -y gcc python3-devel
# Install python packages # Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt

View file

@ -69,7 +69,9 @@ def handler(event, context):
storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
build_model_params = settings.build_model build_model_params = settings.build_model
client_params = settings.client client_params = settings.client
@ -78,13 +80,17 @@ def handler(event, context):
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate Input DataClient ---") logger.info(f"--- Initiate Input DataClient ---")
logger.info("----------------------------")
input_dataclient = dataclient_factory( input_dataclient = dataclient_factory(
dataclient_type="aws-s3", dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"], dataclient_config=client_params["aws-s3"],
) )
logger.info("----------------------------")
logger.info(f"--- Initiate Output DataClient ---") logger.info(f"--- Initiate Output DataClient ---")
logger.info("----------------------------")
output_dataclient = dataclient_factory( output_dataclient = dataclient_factory(
dataclient_type="aws-s3", dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"], dataclient_config=client_params["aws-s3"],
@ -101,7 +107,6 @@ def handler(event, context):
predictions_column_name=generate_predictions_params[ predictions_column_name=generate_predictions_params[
"predictions_column_name" "predictions_column_name"
], ],
identifier_column=generate_predictions_params["identifier_column"],
) )
return { return {

3
modules/ml-pipeline/.dvc/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

View file

@ -0,0 +1,2 @@
['remote "myremote"']
url = /tmp/dvcstore

View file

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

2
modules/ml-pipeline/.gto Normal file
View file

@ -0,0 +1,2 @@
# .gto config file
stages: [dev, stage, prod] # list of allowed Stages

View file

@ -9,16 +9,16 @@ init: dev-conda
.PHONY: dev-conda .PHONY: dev-conda
dev-conda: dev-conda:
# conda deactivate || echo "Not in conda environment" # conda deactivate || echo "Not in conda environment"
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously"
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y
conda init bash conda init bash
conda run -v -n ${CONDA_ENV} pip install --upgrade pip conda run -vvvv -n $CONDA_ENV pip install --upgrade pip
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt
conda run -v -n ${CONDA_ENV} pre-commit install conda run -vvvv -n $CONDA_ENV pre-commit install
conda run -v -n ${CONDA_ENV} pip install ipykernel conda run -vvvv -n $CONDA_ENV pip install ipykernel
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
echo "conda activate ${CONDA_ENV}" echo "conda activate $CONDA_ENV"
.PHONY: dev-pyenv .PHONY: dev-pyenv

View file

@ -1,8 +0,0 @@
pipeline/data/predictions
pipeline/data/fit_predictions
pipeline/data/prepared_data/train.parquet
pipeline/data/fit_predictions
pipeline/data/model/allmodels
pipeline/metrics
pipeline/.dvc
pipeline/analysis

View file

@ -1,7 +1,7 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.10.12-slim FROM python:3.10.12-slim
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev RUN apt-get update && apt-get install -y libgomp1
COPY pipeline/requirements/predictions/requirements.txt requirements.txt COPY pipeline/requirements/predictions/requirements.txt requirements.txt

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline # The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output, that gets hashed via DVC Pipeline required to build a model to produce an output

View file

@ -16,9 +16,13 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
Remove the directory where artefacts are stored Remove the directory where artefacts are stored
""" """
logger.info("---------------------")
logger.info(f"--- Run Clean up ---") logger.info(f"--- Run Clean up ---")
logger.info("---------------------")
logger.info("-------------------------")
logger.info(f"--- Delete artefacts ---") logger.info(f"--- Delete artefacts ---")
logger.info("-------------------------")
artefact_directory_path = Path(artefacts_directory) artefact_directory_path = Path(artefacts_directory)
@ -27,7 +31,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
logger.info(f"Removing the directory: {artefacts_directory}") logger.info(f"Removing the directory: {artefacts_directory}")
shutil.rmtree(artefact_directory_path) shutil.rmtree(artefact_directory_path)
logger.info("-----------------------")
logger.info(f"--- Delete metrics ---") logger.info(f"--- Delete metrics ---")
logger.info("-----------------------")
metrics_directory_path = Path(metrics_directory) metrics_directory_path = Path(metrics_directory)
@ -39,11 +45,15 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
run_cleanup( run_cleanup(
artefacts_directory=startup_cleanup_params["artefacts"], artefacts_directory=startup_cleanup_params["artefacts"],
metrics_directory=startup_cleanup_params["metrics"], metrics_directory=startup_cleanup_params["metrics"],
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -17,7 +17,9 @@ from core.DataClient import dataclient_factory
from core.FeatureProcessor import feature_processor_factory from core.FeatureProcessor import feature_processor_factory
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -31,7 +33,9 @@ output_train_filepath = prepare_data_params["output_train_filepath"]
output_test_filepath = prepare_data_params["output_test_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"]
feature_processor_config = feature_process_params["feature_processor_config"] feature_processor_config = feature_process_params["feature_processor_config"]
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
input_dataclient_type = prepare_data_params["input_dataclient_type"] input_dataclient_type = prepare_data_params["input_dataclient_type"]
output_dataclient_type = prepare_data_params["output_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"]
@ -45,7 +49,9 @@ output_dataclient = dataclient_factory(
dataclient_config=client_params[output_dataclient_type], dataclient_config=client_params[output_dataclient_type],
) )
logger.info("----------------------------------")
logger.info(f"--- Initiate FeatureProcessor ---") logger.info(f"--- Initiate FeatureProcessor ---")
logger.info("----------------------------------")
feature_processor = feature_processor_factory( feature_processor = feature_processor_factory(
feature_process_params["feature_processor_type"] feature_process_params["feature_processor_type"]
@ -70,11 +76,15 @@ def prepare_data(
:param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
""" """
logger.info("--------------------")
logger.info("--- Loading data ---") logger.info("--- Loading data ---")
logger.info("--------------------")
data = input_dataclient.load_data(location=data_filepath, load_config={}) data = input_dataclient.load_data(location=data_filepath, load_config={})
logger.info("--------------------------")
logger.info("--- Feature Processing ---") logger.info("--- Feature Processing ---")
logger.info("--------------------------")
data = feature_processor.feature_process( data = feature_processor.feature_process(
data, data,
@ -83,12 +93,13 @@ def prepare_data(
new_feature_funcs=new_feature_funcs, new_feature_funcs=new_feature_funcs,
) )
logger.info("----------------------")
logger.info("--- Splitting data ---") logger.info("--- Splitting data ---")
logger.info("----------------------")
if train_proportion == 1: if train_proportion == 1:
train = data train = data
# Sample 10% of the data for testing test = None
test = data.sample(round(len(data) * 0.1))
else: else:
train, test = train_test_split( train, test = train_test_split(
data, train_size=train_proportion, test_size=(1 - train_proportion) data, train_size=train_proportion, test_size=(1 - train_proportion)
@ -97,7 +108,9 @@ def prepare_data(
train = train.reset_index(drop=True) train = train.reset_index(drop=True)
logger.info("-----------------------")
logger.info("--- Outputting data ---") logger.info("--- Outputting data ---")
logger.info("-----------------------")
output_dataclient.save_data( output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None obj=train, location=output_train_filepath, save_config=None
@ -113,9 +126,13 @@ def prepare_data(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("---------------------------")
logger.info(f"--- Prepare Data Stage ---") logger.info(f"--- Prepare Data Stage ---")
logger.info("---------------------------")
prepare_data( prepare_data(
input_dataclient=input_dataclient, input_dataclient=input_dataclient,
@ -130,4 +147,6 @@ if __name__ == "__main__":
new_feature_funcs=new_feature_funcs, new_feature_funcs=new_feature_funcs,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -18,7 +18,9 @@ from core.MLMetrics import metrics_factory
from configs.post_prediction_logic import post_prediction_logic from configs.post_prediction_logic import post_prediction_logic
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -26,12 +28,9 @@ prepare_data_params = settings.prepare_data
build_model_params = settings.build_model build_model_params = settings.build_model
feature_process_params = settings.feature_processor feature_process_params = settings.feature_processor
generate_metrics_params = settings.generate_metrics generate_metrics_params = settings.generate_metrics
generate_predictions_params = settings.generate_predictions
model_type = build_model_params["model_type"] model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"] target = feature_process_params["feature_processor_config"]["target"]
fit_predictions_filepath = build_model_params["fit_predictions_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
identifier_columns = feature_process_params["feature_processor_config"][ identifier_columns = feature_process_params["feature_processor_config"][
"identifier_columns" "identifier_columns"
] ]
@ -41,16 +40,22 @@ train_filepath = prepare_data_params["output_train_filepath"]
test_filepath = prepare_data_params["output_test_filepath"] test_filepath = prepare_data_params["output_test_filepath"]
fit_metrics_filepath = build_model_params["fit_metrics_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"]
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# Output of previous prepare data step, will be where the data is # Output of previous prepare data step, will be where the data is
dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(model_type) model = model_factory(model_type)
logger.info("-------------------------")
logger.info(f"--- Initiate Metrics ---") logger.info(f"--- Initiate Metrics ---")
logger.info("-------------------------")
metrics = metrics_factory(generate_metrics_params["metrics_type"]) metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -63,8 +68,6 @@ def build_model(
identifier_columns: List[str], identifier_columns: List[str],
model_save_location: str, model_save_location: str,
model_hyperparameters: dict, model_hyperparameters: dict,
fit_predictions_filepath: str,
predictions_column_name: str,
fit_metrics_filepath: str, fit_metrics_filepath: str,
train_filepath: Union[str, None] = None, train_filepath: Union[str, None] = None,
test_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None,
@ -72,7 +75,9 @@ def build_model(
test_data: Union[pd.DataFrame, None] = None, test_data: Union[pd.DataFrame, None] = None,
pipeline_mode: bool = False, pipeline_mode: bool = False,
): ):
logger.info("--------------------------------------")
logger.info("--- Loading Data for build process ---") logger.info("--- Loading Data for build process ---")
logger.info("--------------------------------------")
if train_data is None: if train_data is None:
if train_filepath is None: if train_filepath is None:
@ -84,7 +89,9 @@ def build_model(
raise ValueError(f"Need {test_filepath} if no data supplied") raise ValueError(f"Need {test_filepath} if no data supplied")
test_data = dataclient.load_data(location=test_filepath, load_config=None) test_data = dataclient.load_data(location=test_filepath, load_config=None)
logger.info("----------------------")
logger.info("--- Training model ---") logger.info("--- Training model ---")
logger.info("----------------------")
model.train_model( model.train_model(
data=train_data.drop(columns=identifier_columns), data=train_data.drop(columns=identifier_columns),
@ -92,33 +99,32 @@ def build_model(
model_hyperparameters=model_hyperparameters, model_hyperparameters=model_hyperparameters,
) )
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---") logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
fit_predictions = model.predict( fit_predictions = model.predict(
data=train_data, post_prediction_logic=post_prediction_logic data=train_data, post_prediction_logic=post_prediction_logic
) )
logger.info("--- Saving fit predictions ---") logger.info("------------------------------")
predictions_df = pd.DataFrame(fit_predictions)
predictions_df.columns = [predictions_column_name]
dataclient.save_data(
obj=predictions_df, location=fit_predictions_filepath, save_config=None
)
logger.info("--- Generating fit metrics ---") logger.info("--- Generating fit metrics ---")
logger.info("------------------------------")
metrics_output = metrics.generate_metrics( metrics_output = metrics.generate_metrics(
target=train_data[target], target=train_data[target],
predictions=pd.Series(fit_predictions), predictions=pd.Series(fit_predictions),
) )
logger.info("--------------------")
logger.info("--- Saving model ---") logger.info("--- Saving model ---")
logger.info("--------------------")
model.save_model(path=Path(model_save_location)) model.save_model(path=Path(model_save_location))
logger.info("--------------------------")
logger.info("--- Saving fit metrics ---") logger.info("--- Saving fit metrics ---")
logger.info("--------------------------")
dataclient.save_data( dataclient.save_data(
obj=metrics_output, location=fit_metrics_filepath, save_config=None obj=metrics_output, location=fit_metrics_filepath, save_config=None
@ -127,9 +133,13 @@ def build_model(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("--------------------------")
logger.info(f"--- Build Model Stage ---") logger.info(f"--- Build Model Stage ---")
logger.info("--------------------------")
build_model( build_model(
dataclient=dataclient, dataclient=dataclient,
@ -142,8 +152,8 @@ if __name__ == "__main__":
train_filepath=train_filepath, train_filepath=train_filepath,
test_filepath=test_filepath, test_filepath=test_filepath,
fit_metrics_filepath=fit_metrics_filepath, fit_metrics_filepath=fit_metrics_filepath,
fit_predictions_filepath=fit_predictions_filepath,
predictions_column_name=predictions_column_name,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -10,7 +10,9 @@ from core.Logger import logger
from config import settings from config import settings
from generate_predictions import generate_predictions from generate_predictions import generate_predictions
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -31,11 +33,15 @@ model_filepath = build_model_params["model_save_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"] predictions_column_name = generate_predictions_params["predictions_column_name"]
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# We may have different locations of loading hence why we use one specified in generate_predictions.yaml # We may have different locations of loading hence why we use one specified in generate_predictions.yaml
# I.e. for metric runs, this will be a local data client # I.e. for metric runs, this will be a local data client
@ -53,9 +59,13 @@ output_dataclient = dataclient_factory(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("----------------------------------")
logger.info(f"--- Generate Predictions Stage---") logger.info(f"--- Generate Predictions Stage---")
logger.info("----------------------------------")
generate_predictions( generate_predictions(
input_dataclient=input_dataclient, input_dataclient=input_dataclient,
@ -68,4 +78,6 @@ if __name__ == "__main__":
predictions_column_name=predictions_column_name, predictions_column_name=predictions_column_name,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -16,7 +16,9 @@ from core.MLMetrics import metrics_factory
from core.Logger import logger from core.Logger import logger
from config import settings from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---") logger.info(f"--- Initiate Parameters ---")
logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -33,11 +35,16 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi
predictions_column_name = generate_predictions_params["predictions_column_name"] predictions_column_name = generate_predictions_params["predictions_column_name"]
metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---") logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
logger.info("----------------------------")
logger.info(f"--- Initiate DataClient ---") logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
# Use data client for input and output, as we use dvc to cache later to the cloud # Use data client for input and output, as we use dvc to cache later to the cloud
dataclient_type = generate_metrics_params["dataclient_type"] dataclient_type = generate_metrics_params["dataclient_type"]
@ -46,7 +53,9 @@ dataclient = dataclient_factory(
dataclient_config=client_params[dataclient_type], dataclient_config=client_params[dataclient_type],
) )
logger.info("---------------------------")
logger.info(f"--- Initiate MLMetrics ---") logger.info(f"--- Initiate MLMetrics ---")
logger.info("---------------------------")
metrics = metrics_factory(generate_metrics_params["metrics_type"]) metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -66,26 +75,34 @@ def generate_metrics(
For a given model, we generate prediction and evaluate this against the true target For a given model, we generate prediction and evaluate this against the true target
""" """
logger.info("-------------------------")
logger.info("--- Loading test data ---") logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data( test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None location=test_data_filepath, load_config=None
) )
logger.info("---------------------------")
logger.info("--- Loading predictions ---") logger.info("--- Loading predictions ---")
logger.info("---------------------------")
predictions = input_dataclient.load_data( predictions = input_dataclient.load_data(
location=predictions_output_filepath, load_config=None location=predictions_output_filepath, load_config=None
) )
logger.info("--------------------------")
logger.info("--- Generating metrics ---") logger.info("--- Generating metrics ---")
logger.info("--------------------------")
metrics_output = metrics.generate_metrics( metrics_output = metrics.generate_metrics(
target=test_data[target], target=test_data[target],
predictions=pd.Series(predictions[predictions_column_name]), predictions=pd.Series(predictions[predictions_column_name]),
) )
logger.info("----------------------")
logger.info("--- Saving metrics ---") logger.info("--- Saving metrics ---")
logger.info("----------------------")
output_dataclient.save_data( output_dataclient.save_data(
obj=metrics_output, location=metrics_output_filepath, save_config=None obj=metrics_output, location=metrics_output_filepath, save_config=None
@ -94,9 +111,13 @@ def generate_metrics(
if __name__ == "__main__": if __name__ == "__main__":
logger.info("----------------------------")
logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- {__file__} - Start! ---")
logger.info("----------------------------")
logger.info("------------------------------")
logger.info(f"--- Generate Metrics Stage---") logger.info(f"--- Generate Metrics Stage---")
logger.info("------------------------------")
generate_metrics( generate_metrics(
input_dataclient=dataclient, input_dataclient=dataclient,
@ -110,4 +131,6 @@ if __name__ == "__main__":
metrics_output_filepath=metrics_output_filepath, metrics_output_filepath=metrics_output_filepath,
) )
logger.info("-------------------------------")
logger.info(f"--- {__file__} - Complete! ---") logger.info(f"--- {__file__} - Complete! ---")
logger.info("-------------------------------")

View file

@ -1,162 +0,0 @@
"""
Fourth part of the pipeline:
After the model is built and metrics are generated,
we want to test this model against known scenarios
"""
import os
import pandas as pd
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.interface.InterfaceMetrics import MLMetrics
from configs.post_prediction_logic import post_prediction_logic
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
from config import settings
# Module-level setup: everything below runs at import time, not only when the
# file is executed as a script.
logger.info(f"--- Initiate Parameters ---")
# NOTE(review): RUNTIME_ENVIRONMENT is read here but not referenced again in the
# visible part of this script - confirm whether it is still needed.
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
# Per-stage sections of the shared `settings` object imported from `config`.
client_params = settings.client
# NOTE(review): prepare_data_params is not used below in the visible code.
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
scenarios_params = settings.scenarios
# Individual values pulled out of the sections above; these are the arguments
# handed to generate_scenario_predictions in the __main__ block.
model_filepath = build_model_params["model_save_filepath"]
# NOTE(review): `target` is looked up but is not passed to
# generate_scenario_predictions in the __main__ block.
target = feature_process_params["feature_processor_config"]["target"]
scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
comparison_output_filepath = scenarios_params["comparison_output_filepath"]
metrics_output_filepath = scenarios_params["metrics_output_filepath"]
logger.info(f"--- Initiate MLModel ---")
# The model implementation is selected by the `model_type` name in the build_model settings.
model = model_factory(build_model_params["model_type"])
logger.info(f"--- Initiate DataClient ---")
# Use data client for input and output, as we use dvc to cache later to the cloud
# Input and output clients are configured independently: each type name selects
# both the client implementation and its matching entry in the client settings.
input_dataclient_type = scenarios_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
dataclient_type=input_dataclient_type,
dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = scenarios_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
dataclient_type=output_dataclient_type,
dataclient_config=client_params[output_dataclient_type],
)
logger.info(f"--- Initiate MLMetrics ---")
# Reuses the metrics implementation configured for the generate_metrics stage.
metrics = metrics_factory(generate_metrics_params["metrics_type"])
def generate_scenario_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    metrics: MLMetrics,
    model_filepath: str,
    scenario_data_filepaths: list,
    predictions_column_name: str,
    comparison_output_filepath: str,
    metrics_output_filepath: str,
):
    """
    Given the new model, we generate prediction for expected scenarios

    Loads every scenario file, predicts with the saved model, derives the
    predicted impact as |prediction - sap_starting|, scores it against the
    "impact" column and saves both a per-row comparison table and the metrics.

    Args:
        input_dataclient: client used to load the scenario data files.
        output_dataclient: client used to save the comparison and metrics tables.
        model: model wrapper; its saved state is loaded from `model_filepath`.
        metrics: metrics implementation used to score predicted vs. known impact.
        model_filepath: location of the saved model.
        scenario_data_filepaths: scenario data locations; None or an empty list
            means "no scenarios", in which case empty tables are saved.
        predictions_column_name: name given to the predictions column.
        comparison_output_filepath: where the per-row comparison table is saved.
        metrics_output_filepath: where the metrics table is saved.

    The scenario data must contain the columns "sap_starting", "impact",
    "uprn" and "id".
    """

    logger.info("--- Loading Scenario Data ---")

    # If we have no scenario data, we can save empty dataframes.
    # An empty list is treated the same as None: there is nothing to predict on.
    if not scenario_data_filepaths:
        logger.info("No scenario data filepaths provided")
        empty_table = pd.DataFrame()
        output_dataclient.save_data(
            obj=empty_table, location=comparison_output_filepath, save_config=None
        )
        output_dataclient.save_data(
            obj=empty_table, location=metrics_output_filepath, save_config=None
        )
        return

    # Can have multiple scenario data files.
    # ignore_index gives the combined frame unique row labels; without it every
    # file restarts its labels, and the column-wise concat with the predictions
    # below would fail or misalign rows on the duplicated labels.
    scenario_frames = [
        input_dataclient.load_data(scenario_data_filepath, load_config=None)
        for scenario_data_filepath in scenario_data_filepaths
    ]
    scenario_data = pd.concat(scenario_frames, ignore_index=True)

    logger.info("--- Loading Model ---")
    model.load_model(model_filepath)

    logger.info("--- Generating Predictions ---")
    predictions = model.predict(
        data=scenario_data, post_prediction_logic=post_prediction_logic
    )

    logger.info("--- Generate Scenario Predicted Impact ---")
    # Assumes model.predict returns one prediction per input row, in input
    # order - TODO confirm. Resetting the index makes the join positional.
    predictions_df = pd.DataFrame(predictions).reset_index(drop=True)
    predictions_df.columns = [predictions_column_name]
    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
    scenario_data["predicted_impact"] = (
        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
    ).abs()

    logger.info("--- Generate Metrics ---")
    metrics_dict = metrics.generate_metrics(
        scenario_data["impact"], scenario_data["predicted_impact"]
    )
    # One row per metric: the metric name in "metric", its score in "value".
    metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
    metrics_df.columns = ["metric", "value"]

    logger.info("--- Save prediction into metrics ---")
    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
    output_dataclient.save_data(
        obj=output_df, location=comparison_output_filepath, save_config=None
    )
    output_dataclient.save_data(
        obj=metrics_df, location=metrics_output_filepath, save_config=None
    )
if __name__ == "__main__":
logger.info(f"--- {__file__} - Start! ---")
logger.info(f"--- Generate Scenario Predictions ---")
generate_scenario_predictions(
input_dataclient=input_dataclient,
output_dataclient=output_dataclient,
model=model,
metrics=metrics,
model_filepath=model_filepath,
scenario_data_filepaths=scenario_data_filepaths,
predictions_column_name=predictions_column_name,
comparison_output_filepath=comparison_output_filepath,
metrics_output_filepath=metrics_output_filepath,
)
logger.info(f"--- {__file__} - Complete! ---")

View file

@ -37,4 +37,3 @@ Workflow:
- This experiment will have the corresponding .dvc files for the hashed model and data - This experiment will have the corresponding .dvc files for the hashed model and data
- Use version control as normal - Use version control as normal
- git add, git commit etc - git add, git commit etc
- To revert a change, use `git checkout {COMMIT_HASH}`, followed by `git switch -c {NEW_BRANCH_NAME}`

View file

@ -7,7 +7,6 @@ settings = Dynaconf(
"./configs/settings.yaml", "./configs/settings.yaml",
"./configs/build_model.yaml", "./configs/build_model.yaml",
"./configs/analysis.yaml", "./configs/analysis.yaml",
"./configs/scenarios.yaml",
], ],
) )

View file

@ -13,4 +13,4 @@ default:
dataclient_type: local dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [20695, 50243, 7653] # index of an example datapoint row_index: [0, 10, 20] # index of an example datapoint

View file

@ -3,7 +3,6 @@ default:
model_type: AutogluonAutoML model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/ model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression: null SKLearnLinearRegression: null
@ -14,9 +13,6 @@ default:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error eval_metric: mean_squared_error #mean_absolute_error
time_limit: 1800 time_limit: 4000
presets: medium_quality presets: medium_quality
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] excluded_model_types: ['KNN', 'RF']
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble: {'num_folds_parallel': 2}

View file

@ -9,42 +9,15 @@ Business Logic dict + functions
def remove_starting_columns(df): def remove_starting_columns(df):
keep_column_index = [ keep_column_index = [
False if col_name.endswith("_starting") else True False if col_name.endswith("_STARTING") else True
for col_name in list(df.columns) for col_name in list(df.columns)
] ]
keep_columns = df.columns[keep_column_index].to_list() keep_columns = df.columns[keep_column_index].to_list()
keep_columns.append("sap_starting") keep_columns.append("SAP_STARTING")
df = df[keep_columns] df = df[keep_columns]
return df return df
def remove_floor_height_ending(df, min_floor_height=1.665):
    """
    Drop rows whose "floor_height_ending" is not above `min_floor_height`.

    Args:
        df: dataframe containing a "floor_height_ending" column.
        min_floor_height: exclusive lower bound; rows at or below it are removed.

    Returns:
        The filtered dataframe with a fresh 0..n-1 index.
    """
    # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
    # shows bottom 0.5 percentile is 1.665
    # So keep anything above this
    df = df[df["floor_height_ending"] > min_floor_height].reset_index(drop=True)
    return df
def remove_minimum_habitable_room_size(df):
    """
    Keep only rows with more than 6.5 of floor area per habitable room.

    The ratio is "total_floor_area_ending" / "number_habitable_rooms";
    the result is re-indexed from 0.
    """
    # Need minimum of 6.5m per habitable room
    area_per_room = df["total_floor_area_ending"] / df["number_habitable_rooms"]
    roomy_enough = area_per_room > 6.5
    return df[roomy_enough].reset_index(drop=True)
def keep_flats(df):
    """
    Keep only the rows whose "property_type" is exactly "Flat".

    The original row labels are preserved (the index is not reset).
    """
    is_flat = df["property_type"] == "Flat"
    return df[is_flat]
def keep_non_zero_rdsap(df):
    """
    Discard rows whose "rdsap_change" equals 0.

    The original row labels are preserved (the index is not reset).
    """
    has_change = df["rdsap_change"].ne(0)
    return df[has_change]
# def keep_ending_columns(df): # def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list() # keep_columns = df.columns[ending_column_index].to_list()
@ -54,10 +27,6 @@ def keep_non_zero_rdsap(df):
# return df # return df
business_logic = { business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -5,18 +5,16 @@ import pandas as pd
def clip_predictions_to_minimum_value( def clip_predictions_to_minimum_value(
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
) -> pd.Series: ) -> pd.Series:
series_name = predictions.name series_name = predictions.name
predictions.name = "predictions" predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all prediction to be atleast one point improvement # We expect all prediction to be atleast one point improvement
replace_index = ( replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
)
predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
) )
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]

View file

@ -1,13 +0,0 @@
default:
scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
# - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,10 +18,10 @@ default:
prepare_data: prepare_data:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
train_proportion: 0.9 train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
@ -31,37 +31,11 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: sap_ending target: SAP_ENDING
identifier_columns: ["uprn"] identifier_columns: ["UPRN"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
drop_columns: [ # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
retain_features: null retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
# 'walls_energy_eff_ending', 'secondheat_description_ending',
# 'property_type', 'mainheatc_energy_eff_ending', 'built_form',
# 'walls_insulation_thickness_ending', 'potential_energy_efficiency',
# 'transaction_type_ending',
# 'floor_thermal_transmittance_ending',
# 'low_energy_lighting_ending', 'heat_demand_starting',
# 'photo_supply_ending', 'carbon_starting',
# 'walls_thermal_transmittance_ending',
# 'roof_insulation_thickness_ending',
# 'total_floor_area_ending', 'number_open_fireplaces_ending',
# 'windows_energy_eff_ending',
# 'floor_height_ending',
# 'extension_count_ending',
# 'has_air_source_heat_pump_ending',
# 'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
# 'roof_thermal_transmittance_ending',
# 'floor_insulation_thickness_ending', 'has_mains_gas_ending',
# 'estimated_perimeter_starting', 'energy_consumption_potential',
# 'environment_impact_potential', 'heater_type_ending',
# 'multi_glaze_proportion_ending',
# 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']
generate_predictions: generate_predictions:
input_dataclient_type: local input_dataclient_type: local
@ -69,7 +43,6 @@ default:
test_data_filepath: ./data/prepared_data/test.parquet test_data_filepath: ./data/prepared_data/test.parquet
predictions_output_filepath: ./data/predictions/predictions.parquet predictions_output_filepath: ./data/predictions/predictions.parquet
predictions_column_name: predictions predictions_column_name: predictions
identifier_column: id
generate_metrics: generate_metrics:
dataclient_type: local dataclient_type: local

View file

@ -142,15 +142,9 @@ class AWSS3Client:
buffer = BytesIO() buffer = BytesIO()
obj.to_parquet(buffer, index=False) obj.to_parquet(buffer, index=False)
# Reset the buffer position to the beginning
buffer.seek(0)
bucket, key = location.strip("s3://").split("/", 1) bucket, key = location.strip("s3://").split("/", 1)
self.client.upload_fileobj(buffer, bucket, key) self.client.upload_fileobj(buffer, bucket, key)
# Close the buffer
buffer.close()
def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame:
""" """
Load a parquet file Load a parquet file
@ -245,8 +239,7 @@ class LocalClient:
save_methods = { save_methods = {
".parquet": self._save_parquet, ".parquet": self._save_parquet,
".json": self._save_json, ".json": self._save_json
".md": self._save_md,
# "": _save_directory(**save_config), # "": _save_directory(**save_config),
# ADD MORE save_methods HERE # ADD MORE save_methods HERE
} }
@ -295,10 +288,3 @@ class LocalClient:
# Write the contents of the buffer to the local file # Write the contents of the buffer to the local file
with open(location, "wb") as f: with open(location, "wb") as f:
f.write(buffer.getvalue()) f.write(buffer.getvalue())
def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
"""
Save object as markdown
"""
obj.to_markdown(location, **save_config)

View file

@ -21,7 +21,6 @@ def setup_logger():
# Add the stream handler to the logger # Add the stream handler to the logger
logger.addHandler(stream_handler) logger.addHandler(stream_handler)
logger.propagate = False
return logger return logger

View file

@ -4,7 +4,6 @@ Implementation of MLMetrics, all of which will have two methods:
- Generate Plot Suite - Generate Plot Suite
""" """
import numpy as np
import pandas as pd import pandas as pd
from typing import Union from typing import Union
from sklearn.metrics import ( from sklearn.metrics import (
@ -15,18 +14,6 @@ from sklearn.metrics import (
) )
from core.interface.InterfaceMetrics import MLMetrics from core.interface.InterfaceMetrics import MLMetrics
def symmetric_mape(actual, predicted) -> float:
    """
    Symmetric mean absolute percentage error (SMAPE), returned as a fraction.

    Each term is |predicted - actual| / ((|predicted| + |actual|) / 2) and the
    result is the mean over all terms.

    Args:
        actual: array-like of true values.
        predicted: array-like of predicted values, same length as `actual`.

    Returns:
        The mean symmetric percentage error. A term where both the actual and
        the predicted value are zero counts as zero error rather than 0/0.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2
    # scale is zero only when both values are zero, i.e. a perfect prediction;
    # leave those terms at 0 so a single 0/0 does not turn the mean into NaN.
    ratios = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return float(np.mean(ratios))
def metrics_factory(metrics_type: str) -> MLMetrics: def metrics_factory(metrics_type: str) -> MLMetrics:
metrics = { metrics = {
@ -47,7 +34,7 @@ class RegressionMetrics:
median_absolute_error, median_absolute_error,
mean_squared_error, mean_squared_error,
mean_absolute_percentage_error, mean_absolute_percentage_error,
symmetric_mape, # max_error
] ]
def generate_metrics( def generate_metrics(

View file

@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = { models = {
"SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML(), "AutogluonAutoML": AutogluonAutoML()
# ADD OTHER MODELS HERE # ADD OTHER MODELS HERE
} }
@ -149,9 +149,6 @@ class AutogluonAutoML:
"time_limit", "time_limit",
"presets", "presets",
"excluded_model_types", "excluded_model_types",
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
] ]
def load_model(self, path: Union[Path, str]) -> None: def load_model(self, path: Union[Path, str]) -> None:
@ -206,9 +203,6 @@ class AutogluonAutoML:
time_limit=model_hyperparameters["time_limit"], time_limit=model_hyperparameters["time_limit"],
presets=model_hyperparameters["presets"], presets=model_hyperparameters["presets"],
excluded_model_types=model_hyperparameters["excluded_model_types"], excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
) )
def predict( def predict(

View file

@ -1,46 +1,26 @@
schema: '2.0' schema: '2.0'
stages: stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data: prepare_data:
cmd: python 1_prepare_data.py cmd: python 1_prepare_data.py
deps: deps:
- path: 1_prepare_data.py - path: 1_prepare_data.py
hash: md5 hash: md5
md5: 11a3b8bfdfe199ab7ecc39ccc5652649 md5: c9f030df733e318b80d1fa91b7732f79
size: 4298 size: 5132
params: params:
configs/settings.yaml: configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns: default.feature_processor.feature_processor_config.drop_columns:
- heat_demand_change - HEAT_DEMAND_CHANGE
- carbon_change - CARBON_CHANGE
- rdsap_change - RDSAP_CHANGE
- heat_demand_ending - HEAT_DEMAND_ENDING
- carbon_ending - CARBON_ENDING
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -49,20 +29,20 @@ stages:
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 45056059 size: 33881619
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
deps: deps:
- path: 2_build_model.py - path: 2_build_model.py
hash: md5 hash: md5
md5: 7231450b78920b0c5e7c6bada496b24a md5: 84699d208874c52accaff61c6af9bb0a
size: 4820 size: 5359
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 45056059 size: 33881619
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -71,7 +51,6 @@ stages:
model_type: AutogluonAutoML model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/ model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression: SKLearnLinearRegression:
SKLearnSVMRegression: SKLearnSVMRegression:
kernel: linear kernel: linear
@ -79,49 +58,37 @@ stages:
output_filepath: ./data/model/allmodels/ output_filepath: ./data/model/allmodels/
problem_type: regression problem_type: regression
eval_metric: mean_squared_error eval_metric: mean_squared_error
time_limit: 1800 time_limit: 4000
presets: medium_quality presets: medium_quality
excluded_model_types: excluded_model_types:
- RF
- CAT
- NN_TORCH
- KNN - KNN
- XT - RF
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble:
num_folds_parallel: 2
outs: outs:
- path: data/fit_predictions/
hash: md5
md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3349989
nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 773523079 size: 473398662
nfiles: 36 nfiles: 27
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a md5: 2bb16ac67de8778fbc08171d562b34d5
size: 224 size: 184
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
- path: 3_generate_predictions.py - path: 3_generate_predictions.py
hash: md5 hash: md5
md5: 0a70ad4dfe99414a75d1261c75a177b9 md5: 5ef2856a5a977304f1ec01f9b4205262
size: 2464 size: 3028
- path: data/model - path: data/model
hash: md5 hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 773523079 size: 473398662
nfiles: 36 nfiles: 27
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 45056059 size: 33881619
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -133,25 +100,25 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 463197 size: 374532
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
deps: deps:
- path: 4_generate_metrics.py - path: 4_generate_metrics.py
hash: md5 hash: md5
md5: 4fedb86d89d528f0a6597934ba3890a0 md5: 2c9fb78955a8c19cff0a098976f81d1b
size: 3484 size: 4487
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 463197 size: 374532
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 45056059 size: 33881619
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -161,30 +128,16 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 3e08df02fd5c5d094bcf936e1338d596 md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 223 size: 185
generate_scenerio_metrics: startup_cleanup:
cmd: python 5_generate_scenarios.py cmd: python 0_startup_cleanup.py
deps: deps:
- path: 5_generate_scenarios.py - path: 0_startup_cleanup.py
hash: md5 hash: md5
md5: 40506749fefd926d47c60ff5b16db307 md5: fbb7e3b1b98b517c870f3e1df3e7f695
size: 5337 size: 1676
params: params:
configs/scenarios.yaml: configs/settings.yaml:
default.scenarios: default.startup_cleanup.artefacts: ./data
input_dataclient_type: aws-s3 default.startup_cleanup.metrics: ./metrics
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 363
- path: metrics/scenario_table.md
hash: md5
md5: d6baf100a1623cc2467c2f8221d314c9
size: 2133

View file

@ -38,7 +38,6 @@ stages:
- configs/build_model.yaml: - configs/build_model.yaml:
outs: outs:
- data/model/ - data/model/
- data/fit_predictions/
- metrics/fit_metrics.json - metrics/fit_metrics.json
always_changed: true always_changed: true
generate_predictions: generate_predictions:
@ -71,17 +70,6 @@ stages:
outs: outs:
- metrics/metrics.json - metrics/metrics.json
always_changed: true always_changed: true
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- 5_generate_scenarios.py
params:
- configs/scenarios.yaml:
- default.scenarios
outs:
- metrics/scenario_table.md
- metrics/scenario_metrics.md
always_changed: true
metrics: metrics:
- metrics/metrics.json - metrics/metrics.json
- metrics/fit_metrics.json - metrics/fit_metrics.json

View file

@ -190,35 +190,28 @@ prediction_analysis_params = settings.prediction_analysis
model = model_factory(build_model_params["model_type"]) model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"]) model.load_model(build_model_params["model_save_filepath"])
dataclient_type = prediction_analysis_params["dataclient_type"] dataclient_type = prediction_analysis_params["dataclient_type"]
# dataclient_type = 'aws-s3' dataclient = dataclient_factory(
# dataclient = dataclient_factory( dataclient_type=dataclient_type,
# dataclient_type=dataclient_type, dataclient_config=client_params[dataclient_type],
# dataclient_config=client_params[dataclient_type], )
# )
# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet")
target = feature_process_params["feature_processor_config"]["target"] target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"] predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet") test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data(predictions_output_filepath)
local_dataclient = dataclient_factory(
dataclient_type="local",
dataclient_config=client_params["local"],
)
test_df = local_dataclient.load_data(output_test_filepath)
predictions = local_dataclient.load_data(predictions_output_filepath)
mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False) mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
row_index = 0 row_index = 58199
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
@ -232,17 +225,7 @@ feature_vector = cosine_similarity_df.loc[[row_index]]
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
similar_index = ( similar_index = (
cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
) )
check_df = mix_df.loc[similar_index] check_df = mix_df.loc[similar_index]
columns_to_check = [
"LOW_ENERGY_LIGHTING_ENDING",
"walls_thermal_transmittance_ENDING",
"floor_thermal_transmittance_ENDING",
"roof_thermal_transmittance_ENDING",
"roof_insulation_thickness_ENDING",
]
cosine_similarity_df = mix_df[columns_to_check]

View file

@ -14,23 +14,28 @@ def generate_predictions(
test_data_filepath: str, test_data_filepath: str,
predictions_output_filepath: str, predictions_output_filepath: str,
predictions_column_name: str, predictions_column_name: str,
identifier_column: str = "id",
): ):
""" """
For a given model, we generate prediction and evaluate this against the true target For a given model, we generate prediction and evaluate this against the true target
""" """
logger.info("-------------------------")
logger.info("--- Loading test data ---") logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data( test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None location=test_data_filepath, load_config=None
) )
logger.info("---------------------")
logger.info("--- Loading model ---") logger.info("--- Loading model ---")
logger.info("---------------------")
model.load_model(model_filepath) model.load_model(model_filepath)
logger.info("------------------------------")
logger.info("--- Generating predictions ---") logger.info("--- Generating predictions ---")
logger.info("------------------------------")
prediction_data = ( prediction_data = (
test_data.drop(columns=target) if target in test_data.columns else test_data test_data.drop(columns=target) if target in test_data.columns else test_data
@ -40,17 +45,13 @@ def generate_predictions(
data=prediction_data, post_prediction_logic=post_prediction_logic data=prediction_data, post_prediction_logic=post_prediction_logic
) )
logger.info("--------------------------")
logger.info("--- Saving predictions ---") logger.info("--- Saving predictions ---")
logger.info("--------------------------")
predictions_df = pd.DataFrame(predictions) predictions_df = pd.DataFrame(predictions)
predictions_df.columns = [predictions_column_name] predictions_df.columns = [predictions_column_name]
output_df = (
pd.concat([test_data[identifier_column], predictions_df], axis=1)
if identifier_column in test_data.columns
else predictions_df
)
output_dataclient.save_data( output_dataclient.save_data(
obj=output_df, location=predictions_output_filepath, save_config=None obj=predictions_df, location=predictions_output_filepath, save_config=None
) )

View file

@ -1,4 +1,2 @@
/fit_metrics.json /fit_metrics.json
/metrics.json /metrics.json
/scenario_table.md
/scenario_metrics.md

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==2.1.4 pandas==1.5.3
autogluon.tabular[all]==1.0.0 autogluon==0.8.2
dynaconf==3.2.1 dynaconf==3.2.0
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==2.1.4 pandas==1.5.3
autogluon.tabular[all]==1.0.0 autogluon==0.8.2
dynaconf==3.2.1 dynaconf==3.2.0
pyarrow==13.0.0 pyarrow==13.0.0
PyYAML==6.0.1 PyYAML==6.0.1

View file

@ -1,10 +1,9 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==2.1.4 pandas==1.5.3
autogluon.tabular[all]==1.0.0 autogluon==0.8.2
ray==2.6.3 dynaconf==3.2.0
dynaconf==3.2.1 alibi==0.9.4
alibi==0.9.5
shap==0.42.1 shap==0.42.1
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41 boto3==1.28.41
pandas==2.1.4 pandas==1.5.3
autogluon.tabular[all]==1.0.0 autogluon==0.8.2
dynaconf==3.2.1 dynaconf==3.2.0

View file

@ -1,4 +1,4 @@
dvc==3.51.0 dvc==3.18.0
dvc-s3==3.2.0 dvc-s3==2.23.0
gto==1.7.1 gto==1.0.4
pyOpenSSL==23.3.0 pyOpenSSL==23.2.0