2026-06-08 11:17:25 +00:00
46 changed files with 253 additions and 595 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,9 +0,0 @@
-modules/ml-pipeline/src/pipeline/data/predictions
-modules/ml-pipeline/src/pipeline/data/fit_predictions
-modules/ml-pipeline/src/pipeline/data/prepared_data
-modules/ml-pipeline/src/pipeline/data/model/allmodels
-modules/ml-pipeline/src/pipeline/metrics
-modules/ml-pipeline/src/pipeline/__pycache__
-modules/ml-pipeline/src/pipeline/.dvc
-modules/ml-pipeline/src/pipeline/analysis
-modules/ml-pipeline/src/pipeline/metrics
--- a/.github/workflows/Deploy.yml
+++ b/.github/workflows/Deploy.yml
@ -2,7 +2,7 @@ name: Sap Change Model Deploy

 on:
  push:
-    branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod]
+    branches: [ sap_change-dev, sap_change-prod ]

 jobs:
  deploy:
@ -19,8 +19,8 @@ jobs:

      - name: Install Serverless and plugins
        run: |
-          npm install -g serverless@^3.38.0
-          npm install -g serverless-domain-manager@^7.3.8
+          npm install -g serverless@3
+          npm install -g serverless-domain-manager@7

      - name: Install DVC
        run: |
@ -54,12 +54,10 @@ jobs:
      - name: Set stack_name
        id: set_stack_name
        run: |
-          # Take branch prefix and add "model" for stack name
-          stack_name=$( echo ${{ github.ref_name }} | awk -F"-" '{print $1}' | sed 's/$/model/g')
-          if [ -z "${stack_name}" ]; then
-            echo "::set-output name=stack_name::"
+          if [[ "${{ github.ref_name }}" == "sap_change-dev" || "${{ github.ref_name }}" == "sap_change-prod" ]]; then
+            echo "::set-output name=stack_name::sapmodel"
          else
-            echo "::set-output name=stack_name::${stack_name}"
+            echo "::set-output name=stack_name::"
          fi

      - name: Set runtime_environment
--- a/.github/workflows/MLPipelinePostMerge.yml
+++ b/.github/workflows/MLPipelinePostMerge.yml
@ -10,9 +10,10 @@ on:
    types:
      - closed
    branches:
-      - "sap-dev"
-      - "heat-dev"
-      - "carbon-dev"
+      - "master"
+      - "sap_change-dev"
+      - "heat_change-dev"
+      - "carbon_change-dev"

 permissions: write-all

@ -42,14 +43,7 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="1.0.0"
        else
-          increment_version=$(echo ${latest_version} | awk 'BEGIN {
-              FS="\\."   # Set the field separator to a period
-              OFS="."    # Set the output field separator to a period
-          }
-          {
-              major = $1 + 1   # Increment the major version
-              print major, "0", "0"   # Print the new version
-          }')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {print $1+1, 0, 0}')
        fi

        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -87,14 +81,7 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="0.1.0"
        else
-          increment_version=$(echo ${latest_version} | awk 'BEGIN {
-              FS="\\."   # Set the field separator to a period
-              OFS="."    # Set the output field separator to a period
-          }
-          {
-              minor = $2 + 1   # Increment the minor version
-              print $1, minor, "0"   # Print the new version
-          }')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {print $1, $2+1, 0}')
        fi

        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -132,14 +119,7 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="0.0.1"
        else
-          increment_version=$(echo ${latest_version} | awk 'BEGIN {
-              FS="\\."   # Set the field separator to a period
-              OFS="."    # Set the output field separator to a period
-          }
-          {
-              patch = $3 + 1   # Increment the patch version
-              print $1, $2, patch   # Print the new version
-          }')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}')
        fi

        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -199,8 +179,6 @@ jobs:
        pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt

    - name: Register Model
-      env:
-        TARGET_BRANCH: ${{ github.base_ref }}
      run: |

        REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
@ -209,7 +187,7 @@ jobs:
        git config user.name "Github-Bot"
        git config user.email "Github-Bot@no-reply.com"

-        latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}')
+        latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '$NF ~ /dev/ {v=$NF} END {print v}')
        if [ -z "${latest_dev_version}" ]; then
          increment_version="1"
        else
@ -217,7 +195,7 @@ jobs:
        fi

        new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version}
-        latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}')
+        latest_version=$(gto show "${REGISTER_MODEL_NAME}@latest" --ref | awk -F"@" '{print $2}')

        echo ${new_tag}

@ -228,11 +206,11 @@ jobs:
        git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}"
        git push origin ${new_tag}

-        git checkout ${TARGET_BRANCH}
+        git checkout master
        git fetch --all
        git pull

        gto show --json > MODEL_REGISTRY.md
        git add .
        git commit -m "Update Registry"
-        git push origin ${TARGET_BRANCH}
+        git push origin master
--- a/.github/workflows/MLPipelinePullRequest.yml
+++ b/.github/workflows/MLPipelinePullRequest.yml
@ -5,7 +5,7 @@ on:
  #   branches:
  #     - "model-**"
  pull_request:
-    branches: ["sap-dev", "heat-dev", "carbon-dev"]
+    branches: [ "master", "sap_change-dev", "heat_change-dev", "carbon_change-dev"]
  label:
    types: ["created", "edited"]

@ -89,24 +89,13 @@ jobs:
        AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
        REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        TARGET_BRANCH: ${{ github.base_ref }}
      run: |
        cd modules/ml-pipeline/src/pipeline
        echo "## Model metrics" > report.md

        # Compare metrics to master
-        git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
-        dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
-
-        echo "## Scenario comparison" >> report.md
-
-        cat metrics/scenario_table.md >> report.md
-
-        echo "" >> report.md
-
-        echo "## Scenario metrics" >> report.md
-
-        cat metrics/scenario_metrics.md >> report.md
+        git fetch --depth=1 origin master:master
+        dvc metrics diff --md --all master >> report.md

        cml comment create report.md

--- a/MODEL_REGISTRY.md
+++ b/MODEL_REGISTRY.md
@ -1,32 +1,16 @@
 {
    "model": {
-        "version": "v12.10.12",
+        "version": "v11.10.12",
        "stage": {
            "dev": "v11.10.12"
        },
        "registered": true,
        "active": true
    },
-    "sap": {
-        "version": "v0.14.0",
+    "migrate": {
+        "version": null,
        "stage": {
-            "dev": "v0.14.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "heat": {
-        "version": "v0.5.0",
-        "stage": {
-            "dev": "v0.5.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "carbon": {
-        "version": "v0.5.0",
-        "stage": {
-            "dev": "v0.5.0"
+            "dev": "f320b9e0e9f3ea7735aed1abee07b1fb498c39c3"
        },
        "registered": true,
        "active": true
--- a/README.md
+++ b/README.md
@ -10,9 +10,9 @@ tracking and a model registry
 	- A bolt-on service that can implement model monitoring

 There are multiple protected branches which adapt the generic pipeline to produce different models:
- sap-{dev/staging/prod}-**
- heat-{dev/staging/prod}-**
- carbon-{dev/staging/prod}-**
+- sap_change-**
+- heat_change-**
+- carbon_change-**

 These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline
 - There can be different additional logic for each branch but the pipeline will be the same.
@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser
 secrets. Each different model and protected branch has its own set of secrets which allows for flexibility
 between different pipelines.

-For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are:
+For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are:

 - {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the
   sap change model this is the lambda-sap-prediction-dev repository.
@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe
 2. command to build the Docker image:

 ```bash
-docker build -t sap -f deployment/Dockerfile.prediction.lambda .
+docker build -t sap_change -f deployment/Dockerfile.prediction.lambda .
 ```

 This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located
@ -68,7 +68,7 @@ in the deployment directory.
 Once the image is built, you can run it using the following command:

 ```bash
-docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap
+docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change
 ```
 This command does the following:

@ -79,7 +79,6 @@ Sets the RUNTIME_ENVIRONMENT variable to dev.
 To test the Lambda function, use the following curl command:

 ```json
-curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}'
+curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}'
 ```
-
 This will send a POST request to the running Lambda function and pass in the required data as JSON.
--- a/deployment/.dockerignore
+++ b/deployment/.dockerignore
@ -1,9 +0,0 @@
-modules/ml-pipeline/src/pipeline/data/predictions
-modules/ml-pipeline/src/pipeline/data/fit_predictions
-modules/ml-pipeline/src/pipeline/data/prepared_data
-modules/ml-pipeline/src/pipeline/data/model/allmodels
-modules/ml-pipeline/src/pipeline/metrics
-modules/ml-pipeline/src/__pycache__
-modules/ml-pipeline/src/.dvc
-modules/ml-pipeline/src/analysis
-modules/ml-pipeline/src/metrics
--- a/deployment/Dockerfile.prediction.lambda
+++ b/deployment/Dockerfile.prediction.lambda
@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
 ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}

 # Install necessary build tools - required to test locally
-RUN yum install -y gcc python3-devel gcc-c++
+RUN yum install -y gcc python3-devel

 # Install python packages
 COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
--- a/deployment/handlers/prediction_app.py
+++ b/deployment/handlers/prediction_app.py
@ -69,7 +69,9 @@ def handler(event, context):

        storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"

+        logger.info("-------------------------")
        logger.info(f"--- Initiate MLModel ---")
+        logger.info("-------------------------")

        build_model_params = settings.build_model
        client_params = settings.client
@ -78,13 +80,17 @@ def handler(event, context):

        model = model_factory(build_model_params["model_type"])

+        logger.info("----------------------------")
        logger.info(f"--- Initiate Input DataClient ---")
+        logger.info("----------------------------")
        input_dataclient = dataclient_factory(
            dataclient_type="aws-s3",
            dataclient_config=client_params["aws-s3"],
        )

+        logger.info("----------------------------")
        logger.info(f"--- Initiate Output DataClient ---")
+        logger.info("----------------------------")
        output_dataclient = dataclient_factory(
            dataclient_type="aws-s3",
            dataclient_config=client_params["aws-s3"],
@ -101,7 +107,6 @@ def handler(event, context):
            predictions_column_name=generate_predictions_params[
                "predictions_column_name"
            ],
-            identifier_column=generate_predictions_params["identifier_column"],
        )

        return {
--- a/modules/ml-pipeline/.dvc/.gitignore
+++ b/modules/ml-pipeline/.dvc/.gitignore
@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
--- a/modules/ml-pipeline/.dvc/config
+++ b/modules/ml-pipeline/.dvc/config
@ -0,0 +1,2 @@
+['remote "myremote"']
+    url = /tmp/dvcstore
--- a/modules/ml-pipeline/.dvcignore
+++ b/modules/ml-pipeline/.dvcignore
@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
--- a/modules/ml-pipeline/.gto
+++ b/modules/ml-pipeline/.gto
@ -0,0 +1,2 @@
+# .gto config file
+stages: [dev, stage, prod] # list of allowed Stages
--- a/modules/ml-pipeline/Makefile
+++ b/modules/ml-pipeline/Makefile
@ -9,16 +9,16 @@ init: dev-conda
 .PHONY: dev-conda
 dev-conda:
 	# conda deactivate || echo "Not in conda environment"
-	# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
-	conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
+	# conda remove --name $(CONDA_ENV) --all -y || echo "No environment created previously"
+	conda create --name $(CONDA_ENV) python=$(PYTHON_VERSION) -y
 	conda init bash
-	conda run -v -n ${CONDA_ENV} pip install --upgrade pip
-	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
-	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
-	conda run -v -n ${CONDA_ENV} pre-commit install
-	conda run -v -n ${CONDA_ENV} pip install ipykernel
+	conda run -vvvv -n $(CONDA_ENV) pip install --upgrade pip
+	conda run -vvvv -n $(CONDA_ENV) pip install -r src/pipeline/requirements/training/requirements-dev.txt
+	conda run -vvvv -n $(CONDA_ENV) pip install -r src/pipeline/requirements/version_control/requirements.txt
+	conda run -vvvv -n $(CONDA_ENV) pre-commit install
+	conda run -vvvv -n $(CONDA_ENV) pip install ipykernel
 	echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
-	echo "conda activate ${CONDA_ENV}"
+	echo "conda activate $(CONDA_ENV)"


 .PHONY: dev-pyenv
--- a/modules/ml-pipeline/src/.dockerignore
+++ b/modules/ml-pipeline/src/.dockerignore
@ -1,8 +0,0 @@
-pipeline/data/predictions
-pipeline/data/fit_predictions
-pipeline/data/prepared_data/train.parquet
-pipeline/data/fit_predictions
-pipeline/data/model/allmodels
-pipeline/metrics
-pipeline/.dvc
-pipeline/analysis
--- a/modules/ml-pipeline/src/Prediction.Dockerfile
+++ b/modules/ml-pipeline/src/Prediction.Dockerfile
@ -1,7 +1,7 @@
 # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
 FROM python:3.10.12-slim

-RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
+RUN apt-get update && apt-get install -y libgomp1

 COPY pipeline/requirements/predictions/requirements.txt requirements.txt

--- a/modules/ml-pipeline/src/README.md
+++ b/modules/ml-pipeline/src/README.md
@ -1,3 +1,3 @@
 # The generic reproducible ML-pipeline

-Pipeline required to build a model to produce an output, that gets hashed via DVC
+Pipeline required to build a model to produce an output
--- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
+++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
@ -16,9 +16,13 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
    Remove the directory where artefacts are stored
    """

+    logger.info("---------------------")
    logger.info(f"--- Run Clean up ---")
+    logger.info("---------------------")

+    logger.info("-------------------------")
    logger.info(f"--- Delete artefacts ---")
+    logger.info("-------------------------")

    artefact_directory_path = Path(artefacts_directory)

@ -27,7 +31,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
        logger.info(f"Removing the directory: {artefacts_directory}")
        shutil.rmtree(artefact_directory_path)

+    logger.info("-----------------------")
    logger.info(f"--- Delete metrics ---")
+    logger.info("-----------------------")

    metrics_directory_path = Path(metrics_directory)

@ -39,11 +45,15 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:

 if __name__ == "__main__":

+    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")

    run_cleanup(
        artefacts_directory=startup_cleanup_params["artefacts"],
        metrics_directory=startup_cleanup_params["metrics"],
    )

+    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py
@ -17,7 +17,9 @@ from core.DataClient import dataclient_factory
 from core.FeatureProcessor import feature_processor_factory
 from config import settings

+logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
+logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

@ -31,7 +33,9 @@ output_train_filepath = prepare_data_params["output_train_filepath"]
 output_test_filepath = prepare_data_params["output_test_filepath"]
 feature_processor_config = feature_process_params["feature_processor_config"]

+logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
+logger.info("----------------------------")

 input_dataclient_type = prepare_data_params["input_dataclient_type"]
 output_dataclient_type = prepare_data_params["output_dataclient_type"]
@ -45,7 +49,9 @@ output_dataclient = dataclient_factory(
    dataclient_config=client_params[output_dataclient_type],
 )

+logger.info("----------------------------------")
 logger.info(f"--- Initiate FeatureProcessor ---")
+logger.info("----------------------------------")

 feature_processor = feature_processor_factory(
    feature_process_params["feature_processor_type"]
@ -70,11 +76,15 @@ def prepare_data(
    :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
    """

+    logger.info("--------------------")
    logger.info("--- Loading data ---")
+    logger.info("--------------------")

    data = input_dataclient.load_data(location=data_filepath, load_config={})

+    logger.info("--------------------------")
    logger.info("--- Feature Processing ---")
+    logger.info("--------------------------")

    data = feature_processor.feature_process(
        data,
@ -83,12 +93,13 @@ def prepare_data(
        new_feature_funcs=new_feature_funcs,
    )

+    logger.info("----------------------")
    logger.info("--- Splitting data ---")
+    logger.info("----------------------")

    if train_proportion == 1:
        train = data
-        # Sample 10% of the data for testing
-        test = data.sample(round(len(data) * 0.1))
+        test = None
    else:
        train, test = train_test_split(
            data, train_size=train_proportion, test_size=(1 - train_proportion)
@ -97,7 +108,9 @@ def prepare_data(

    train = train.reset_index(drop=True)

+    logger.info("-----------------------")
    logger.info("--- Outputting data ---")
+    logger.info("-----------------------")

    output_dataclient.save_data(
        obj=train, location=output_train_filepath, save_config=None
@ -113,9 +126,13 @@ def prepare_data(

 if __name__ == "__main__":

+    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")

+    logger.info("---------------------------")
    logger.info(f"--- Prepare Data Stage ---")
+    logger.info("---------------------------")

    prepare_data(
        input_dataclient=input_dataclient,
@ -130,4 +147,6 @@ if __name__ == "__main__":
        new_feature_funcs=new_feature_funcs,
    )

+    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/2_build_model.py
+++ b/modules/ml-pipeline/src/pipeline/2_build_model.py
@ -18,7 +18,9 @@ from core.MLMetrics import metrics_factory
 from configs.post_prediction_logic import post_prediction_logic
 from config import settings

+logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
+logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

@ -26,12 +28,9 @@ prepare_data_params = settings.prepare_data
 build_model_params = settings.build_model
 feature_process_params = settings.feature_processor
 generate_metrics_params = settings.generate_metrics
-generate_predictions_params = settings.generate_predictions

 model_type = build_model_params["model_type"]
 target = feature_process_params["feature_processor_config"]["target"]
-fit_predictions_filepath = build_model_params["fit_predictions_filepath"]
-predictions_column_name = generate_predictions_params["predictions_column_name"]
 identifier_columns = feature_process_params["feature_processor_config"][
    "identifier_columns"
 ]
@ -41,16 +40,22 @@ train_filepath = prepare_data_params["output_train_filepath"]
 test_filepath = prepare_data_params["output_test_filepath"]
 fit_metrics_filepath = build_model_params["fit_metrics_filepath"]

+logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
+logger.info("----------------------------")

 # Output of previous prepare data step, will be where the data is
 dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])

+logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
+logger.info("-------------------------")

 model = model_factory(model_type)

+logger.info("-------------------------")
 logger.info(f"--- Initiate Metrics ---")
+logger.info("-------------------------")

 metrics = metrics_factory(generate_metrics_params["metrics_type"])

@ -63,8 +68,6 @@ def build_model(
    identifier_columns: List[str],
    model_save_location: str,
    model_hyperparameters: dict,
-    fit_predictions_filepath: str,
-    predictions_column_name: str,
    fit_metrics_filepath: str,
    train_filepath: Union[str, None] = None,
    test_filepath: Union[str, None] = None,
@ -72,7 +75,9 @@ def build_model(
    test_data: Union[pd.DataFrame, None] = None,
    pipeline_mode: bool = False,
 ):
+    logger.info("--------------------------------------")
    logger.info("--- Loading Data for build process ---")
+    logger.info("--------------------------------------")

    if train_data is None:
        if train_filepath is None:
@ -84,7 +89,9 @@ def build_model(
            raise ValueError(f"Need {test_filepath} if no data supplied")
        test_data = dataclient.load_data(location=test_filepath, load_config=None)

+    logger.info("----------------------")
    logger.info("--- Training model ---")
+    logger.info("----------------------")

    model.train_model(
        data=train_data.drop(columns=identifier_columns),
@ -92,33 +99,32 @@ def build_model(
        model_hyperparameters=model_hyperparameters,
    )

+    logger.info("----------------------------------")
    logger.info("--- Generating fit predictions ---")
+    logger.info("----------------------------------")

    fit_predictions = model.predict(
        data=train_data, post_prediction_logic=post_prediction_logic
    )

-    logger.info("--- Saving fit predictions ---")
-
-    predictions_df = pd.DataFrame(fit_predictions)
-    predictions_df.columns = [predictions_column_name]
-
-    dataclient.save_data(
-        obj=predictions_df, location=fit_predictions_filepath, save_config=None
-    )
-
+    logger.info("------------------------------")
    logger.info("--- Generating fit metrics ---")
+    logger.info("------------------------------")

    metrics_output = metrics.generate_metrics(
        target=train_data[target],
        predictions=pd.Series(fit_predictions),
    )

+    logger.info("--------------------")
    logger.info("--- Saving model ---")
+    logger.info("--------------------")

    model.save_model(path=Path(model_save_location))

+    logger.info("--------------------------")
    logger.info("--- Saving fit metrics ---")
+    logger.info("--------------------------")

    dataclient.save_data(
        obj=metrics_output, location=fit_metrics_filepath, save_config=None
@ -127,9 +133,13 @@ def build_model(

 if __name__ == "__main__":

+    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")

+    logger.info("--------------------------")
    logger.info(f"--- Build Model Stage ---")
+    logger.info("--------------------------")

    build_model(
        dataclient=dataclient,
@ -142,8 +152,8 @@ if __name__ == "__main__":
        train_filepath=train_filepath,
        test_filepath=test_filepath,
        fit_metrics_filepath=fit_metrics_filepath,
-        fit_predictions_filepath=fit_predictions_filepath,
-        predictions_column_name=predictions_column_name,
    )

+    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
@ -10,7 +10,9 @@ from core.Logger import logger
 from config import settings
 from generate_predictions import generate_predictions

+logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
+logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

@ -31,11 +33,15 @@ model_filepath = build_model_params["model_save_filepath"]
 predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]

+logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
+logger.info("-------------------------")

 model = model_factory(build_model_params["model_type"])

+logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
+logger.info("----------------------------")

 # We may have different locations of loading hence why we use one specified in generate_predictions.yaml
 # I.e. for metric runs, this will be a local data client
@ -53,9 +59,13 @@ output_dataclient = dataclient_factory(

 if __name__ == "__main__":

+    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")

+    logger.info("----------------------------------")
    logger.info(f"--- Generate Predictions Stage---")
+    logger.info("----------------------------------")

    generate_predictions(
        input_dataclient=input_dataclient,
@ -68,4 +78,6 @@ if __name__ == "__main__":
        predictions_column_name=predictions_column_name,
    )

+    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
@ -16,7 +16,9 @@ from core.MLMetrics import metrics_factory
 from core.Logger import logger
 from config import settings

+logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
+logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

@ -33,11 +35,16 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]

+
+logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
+logger.info("-------------------------")

 model = model_factory(build_model_params["model_type"])

+logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
+logger.info("----------------------------")

 # Use data client for input and output, as we use dvc to cache later to the cloud
 dataclient_type = generate_metrics_params["dataclient_type"]
@ -46,7 +53,9 @@ dataclient = dataclient_factory(
    dataclient_config=client_params[dataclient_type],
 )

+logger.info("---------------------------")
 logger.info(f"--- Initiate MLMetrics ---")
+logger.info("---------------------------")

 metrics = metrics_factory(generate_metrics_params["metrics_type"])

@ -66,26 +75,34 @@ def generate_metrics(
    For a given model, we generate prediction and evaluate this against the true target
    """

+    logger.info("-------------------------")
    logger.info("--- Loading test data ---")
+    logger.info("-------------------------")

    test_data = input_dataclient.load_data(
        location=test_data_filepath, load_config=None
    )

+    logger.info("---------------------------")
    logger.info("--- Loading predictions ---")
+    logger.info("---------------------------")

    predictions = input_dataclient.load_data(
        location=predictions_output_filepath, load_config=None
    )

+    logger.info("--------------------------")
    logger.info("--- Generating metrics ---")
+    logger.info("--------------------------")

    metrics_output = metrics.generate_metrics(
        target=test_data[target],
        predictions=pd.Series(predictions[predictions_column_name]),
    )

+    logger.info("----------------------")
    logger.info("--- Saving metrics ---")
+    logger.info("----------------------")

    output_dataclient.save_data(
        obj=metrics_output, location=metrics_output_filepath, save_config=None
@ -94,9 +111,13 @@ def generate_metrics(

 if __name__ == "__main__":

+    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")

+    logger.info("------------------------------")
    logger.info(f"--- Generate Metrics Stage---")
+    logger.info("------------------------------")

    generate_metrics(
        input_dataclient=dataclient,
@ -110,4 +131,6 @@ if __name__ == "__main__":
        metrics_output_filepath=metrics_output_filepath,
    )

+    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
+++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
@ -1,162 +0,0 @@
-"""
-Fourth part of the pipeline:
-After the model is built and metrics are generated,
-we want to test this model against known scenarios
-"""
-
-import os
-import pandas as pd
-from core.interface.InterfaceModels import MLModel
-from core.interface.InterfaceDataClient import DataClient
-from core.interface.InterfaceMetrics import MLMetrics
-from configs.post_prediction_logic import post_prediction_logic
-from core.DataClient import dataclient_factory
-from core.MLModels import model_factory
-from core.MLMetrics import metrics_factory
-from core.Logger import logger
-from config import settings
-
-logger.info(f"--- Initiate Parameters ---")
-
-RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
-
-client_params = settings.client
-prepare_data_params = settings.prepare_data
-build_model_params = settings.build_model
-generate_predictions_params = settings.generate_predictions
-generate_metrics_params = settings.generate_metrics
-feature_process_params = settings.feature_processor
-scenarios_params = settings.scenarios
-
-model_filepath = build_model_params["model_save_filepath"]
-target = feature_process_params["feature_processor_config"]["target"]
-scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
-predictions_column_name = generate_predictions_params["predictions_column_name"]
-comparison_output_filepath = scenarios_params["comparison_output_filepath"]
-metrics_output_filepath = scenarios_params["metrics_output_filepath"]
-
-logger.info(f"--- Initiate MLModel ---")
-
-model = model_factory(build_model_params["model_type"])
-
-logger.info(f"--- Initiate DataClient ---")
-
-# Use data client for input and output, as we use dvc to cache later to the cloud
-input_dataclient_type = scenarios_params["input_dataclient_type"]
-input_dataclient = dataclient_factory(
-    dataclient_type=input_dataclient_type,
-    dataclient_config=client_params[input_dataclient_type],
-)
-
-output_dataclient_type = scenarios_params["output_dataclient_type"]
-output_dataclient = dataclient_factory(
-    dataclient_type=output_dataclient_type,
-    dataclient_config=client_params[output_dataclient_type],
-)
-
-logger.info(f"--- Initiate MLMetrics ---")
-
-metrics = metrics_factory(generate_metrics_params["metrics_type"])
-
-
-def generate_scenario_predictions(
-    input_dataclient: DataClient,
-    output_dataclient: DataClient,
-    model: MLModel,
-    metrics: MLMetrics,
-    model_filepath: str,
-    scenario_data_filepaths: list,
-    predictions_column_name: str,
-    comparison_output_filepath: str,
-    metrics_output_filepath: str,
-):
-    """
-    Given the new model, we generate prediction for expected scenarios
-    """
-
-    logger.info("--- Loading Scenario Data ---")
-
-    scenario_data = pd.DataFrame()
-
-    # If we have no scenario data, we can save empty dataframes
-    if scenario_data_filepaths is None:
-        logger.info("No scenario data filepaths provided")
-        output_dataclient.save_data(
-            obj=scenario_data, location=comparison_output_filepath, save_config=None
-        )
-
-        output_dataclient.save_data(
-            obj=scenario_data, location=metrics_output_filepath, save_config=None
-        )
-        return
-
-    # Can have multiple scenario data files
-    for scenario_data_filepath in scenario_data_filepaths:
-        scenario_data = pd.concat(
-            [
-                scenario_data,
-                input_dataclient.load_data(scenario_data_filepath, load_config=None),
-            ]
-        )
-
-    logger.info("--- Loading Model ---")
-
-    model.load_model(model_filepath)
-
-    logger.info("--- Generating Predictions ---")
-
-    predictions = model.predict(
-        data=scenario_data, post_prediction_logic=post_prediction_logic
-    )
-
-    logger.info("--- Generate Scenario Predicted Impact ---")
-
-    predictions_df = pd.DataFrame(predictions)
-    predictions_df.columns = [predictions_column_name]
-
-    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
-    scenario_data["predicted_impact"] = abs(
-        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
-    )
-
-    logger.info("--- Generate Metrics ---")
-
-    metrics_dict = metrics.generate_metrics(
-        scenario_data["impact"], scenario_data["predicted_impact"]
-    )
-
-    metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
-    metrics_df.columns = ["metric", "value"]
-
-    logger.info("--- Save prediction into metrics ---")
-
-    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
-
-    output_dataclient.save_data(
-        obj=output_df, location=comparison_output_filepath, save_config=None
-    )
-
-    output_dataclient.save_data(
-        obj=metrics_df, location=metrics_output_filepath, save_config=None
-    )
-
-
-if __name__ == "__main__":
-
-    logger.info(f"--- {__file__} - Start! ---")
-
-    logger.info(f"--- Generate Scenario Predictions ---")
-
-    generate_scenario_predictions(
-        input_dataclient=input_dataclient,
-        output_dataclient=output_dataclient,
-        model=model,
-        metrics=metrics,
-        model_filepath=model_filepath,
-        scenario_data_filepaths=scenario_data_filepaths,
-        predictions_column_name=predictions_column_name,
-        comparison_output_filepath=comparison_output_filepath,
-        metrics_output_filepath=metrics_output_filepath,
-    )
-
-    logger.info(f"--- {__file__} - Complete! ---")
--- a/modules/ml-pipeline/src/pipeline/README.md
+++ b/modules/ml-pipeline/src/pipeline/README.md
@ -37,4 +37,3 @@ Workflow:
    - This experiment will have the corresponding .dvc files for the hashed model and data
 - Use version control as normal
    - git add, git commit etc
- To revert change, use `git checkout {COMMIT_HASH}`, followed by `git switch -c {NEW_BRANCH_NAME}`
--- a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
+++ b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
--- a/modules/ml-pipeline/src/pipeline/config.py
+++ b/modules/ml-pipeline/src/pipeline/config.py
@ -7,7 +7,6 @@ settings = Dynaconf(
        "./configs/settings.yaml",
        "./configs/build_model.yaml",
        "./configs/analysis.yaml",
-        "./configs/scenarios.yaml",
    ],
 )

--- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
@ -13,4 +13,4 @@ default:
    dataclient_type: local
    nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
    n_val: 30  # how many datapoints from validation data should we interpret predictions for, larger values will be slower
-    row_index: [20695, 50243, 7653] # index of an example datapoint
+    row_index: [0, 10, 20] # indices of example datapoints
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -3,7 +3,6 @@ default:
    model_type: AutogluonAutoML
    model_save_filepath: ./data/model/optimised/
    fit_metrics_filepath: ./metrics/fit_metrics.json
-    fit_predictions_filepath: ./data/fit_predictions/predictions.parquet

    SKLearnLinearRegression: null

@ -14,9 +13,6 @@ default:
      output_filepath: ./data/model/allmodels/
      problem_type: regression
      eval_metric: mean_squared_error #mean_absolute_error
-      time_limit: 1800
+      time_limit: 4000
      presets: medium_quality
-      excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
-      infer_limit: 0.05
-      infer_limit_batch_size: 10000
-      ag_args_ensemble: {'num_folds_parallel': 2}
+      excluded_model_types: ['KNN', 'RF']
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -9,42 +9,15 @@ Business Logic dict + functions

 def remove_starting_columns(df):
    keep_column_index = [
-        False if col_name.endswith("_starting") else True
+        False if col_name.endswith("_STARTING") else True
        for col_name in list(df.columns)
    ]
    keep_columns = df.columns[keep_column_index].to_list()
-    keep_columns.append("sap_starting")
+    keep_columns.append("SAP_STARTING")
    df = df[keep_columns]
    return df


-def remove_floor_height_ending(df):
-    # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
-    # shows bottom 0.5 percentile is 1.665
-    # So keep anything above this
-    df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
-    print("we in here")
-    return df
-
-
-def remove_minimum_habitable_room_size(df):
-    # Need minimum of 6.5m per habitable room
-    df = df[
-        df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
-    ].reset_index(drop=True)
-    return df
-
-
-def keep_flats(df):
-    df = df[df["property_type"] == "Flat"]
-    return df
-
-
-def keep_non_zero_rdsap(df):
-    df = df[df["rdsap_change"] != 0]
-    return df
-
-
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@ -54,10 +27,6 @@ def keep_non_zero_rdsap(df):
 #     return df

 business_logic = {
-    # "keep_non_zero_rdsap": keep_non_zero_rdsap,
-    # "keep_flats": keep_flats,
-    # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
-    # "remove_floor_height_ending": remove_floor_height_ending
    # "remove_starting_columns": remove_starting_columns
    # "keep_ENDING_COLUMNS": keep_ending_columns
 }
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@ -5,18 +5,16 @@ import pandas as pd


 def clip_predictions_to_minimum_value(
-    data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
+    data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
 ) -> pd.Series:

    series_name = predictions.name
    predictions.name = "predictions"
    predictions_df = pd.concat([data, predictions], axis=1)
    # We expect every prediction to be at least a one-point improvement
-    replace_index = (
-        predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
-    )
+    replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
    predictions_df.loc[replace_index, "predictions"] = (
-        predictions_df.loc[replace_index, "sap_starting"] + minimum_value
+        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
    )

    predictions_new = predictions_df["predictions"]
--- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
@ -1,13 +0,0 @@
-default:
-  scenarios:
-    input_dataclient_type: aws-s3
-    output_dataclient_type: local
-    scenario_data_filepaths:
-      # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
-      # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
-      # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
-      # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
-      # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
-      - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
-    comparison_output_filepath: ./metrics/scenario_table.md
-    metrics_output_filepath: ./metrics/scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -18,10 +18,10 @@ default:
  prepare_data:
    input_dataclient_type: aws-s3
    output_dataclient_type: local
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
-    data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
+    data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
    train_proportion: 0.9
    output_train_filepath: ./data/prepared_data/train.parquet
    output_test_filepath: ./data/prepared_data/test.parquet
@ -31,37 +31,11 @@ default:
    feature_processor_config:
      subsample_amount: null
      subsample_seed: 0
-      target: sap_ending
-      identifier_columns: ["uprn"]
-      # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
-      drop_columns: [
-        "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
-        'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
-        'number_habitable_rooms', 'number_heated_rooms']
+      target: SAP_ENDING
+      identifier_columns: ["UPRN"]
+      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
+      # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
      retain_features: null
-      # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
-      #  'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
-      #  'walls_energy_eff_ending', 'secondheat_description_ending',
-      #  'property_type', 'mainheatc_energy_eff_ending', 'built_form',
-      #  'walls_insulation_thickness_ending', 'potential_energy_efficiency',
-      #  'transaction_type_ending',
-      #  'floor_thermal_transmittance_ending',
-      #  'low_energy_lighting_ending', 'heat_demand_starting',
-      #  'photo_supply_ending', 'carbon_starting',
-      #  'walls_thermal_transmittance_ending',
-      #  'roof_insulation_thickness_ending',
-      #  'total_floor_area_ending', 'number_open_fireplaces_ending',
-      #  'windows_energy_eff_ending',
-      #  'floor_height_ending',
-      #  'extension_count_ending',
-      #  'has_air_source_heat_pump_ending',
-      #  'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
-      #  'roof_thermal_transmittance_ending',
-      #  'floor_insulation_thickness_ending', 'has_mains_gas_ending',
-      #  'estimated_perimeter_starting', 'energy_consumption_potential',
-      #  'environment_impact_potential', 'heater_type_ending',
-      #  'multi_glaze_proportion_ending',
-      #  'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']

  generate_predictions:
    input_dataclient_type: local
@ -69,7 +43,6 @@ default:
    test_data_filepath: ./data/prepared_data/test.parquet
    predictions_output_filepath: ./data/predictions/predictions.parquet
    predictions_column_name: predictions
-    identifier_column: id

  generate_metrics:
    dataclient_type: local
--- a/modules/ml-pipeline/src/pipeline/core/DataClient.py
+++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py
@ -142,15 +142,9 @@ class AWSS3Client:
        buffer = BytesIO()
        obj.to_parquet(buffer, index=False)

-        # Reset the buffer position to the beginning
-        buffer.seek(0)
-
        bucket, key = location.strip("s3://").split("/", 1)
        self.client.upload_fileobj(buffer, bucket, key)

-        # Close the buffer
-        buffer.close()
-
    def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame:
        """
        Load a parquet file
@ -245,8 +239,7 @@ class LocalClient:

        save_methods = {
            ".parquet": self._save_parquet,
-            ".json": self._save_json,
-            ".md": self._save_md,
+            ".json": self._save_json
            # "": _save_directory(**save_config),
            # ADD MORE save_methods HERE
        }
@ -295,10 +288,3 @@ class LocalClient:
        # Write the contents of the buffer to the local file
        with open(location, "wb") as f:
            f.write(buffer.getvalue())
-
-    def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
-        """
-        Save object as markdown
-        """
-
-        obj.to_markdown(location, **save_config)
--- a/modules/ml-pipeline/src/pipeline/core/Logger.py
+++ b/modules/ml-pipeline/src/pipeline/core/Logger.py
@ -21,7 +21,6 @@ def setup_logger():

    # Add the stream handler to the logger
    logger.addHandler(stream_handler)
-    logger.propagate = False

    return logger

--- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
@ -4,7 +4,6 @@ Implementation of MLMetrics, all of which will have two methods:
 - Generate Plot Suite
 """

-import numpy as np
 import pandas as pd
 from typing import Union
 from sklearn.metrics import (
@ -15,18 +14,6 @@ from sklearn.metrics import (
 )
 from core.interface.InterfaceMetrics import MLMetrics

-# Define the function to return the SMAPE value
-def symmetric_mape(actual, predicted) -> float:
-
-    # Convert actual and predicted to numpy
-    # array data type if not already
-    if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]):
-        actual, predicted = np.array(actual), np.array(predicted)
-
-    return np.mean(
-        np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2)
-    )
-

 def metrics_factory(metrics_type: str) -> MLMetrics:
    metrics = {
@ -47,7 +34,7 @@ class RegressionMetrics:
        median_absolute_error,
        mean_squared_error,
        mean_absolute_percentage_error,
-        symmetric_mape,
+        # max_error
    ]

    def generate_metrics(
--- a/modules/ml-pipeline/src/pipeline/core/MLModels.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py
@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
    models = {
        "SKLearnLinearRegression": SKLearnLinearRegression(),
        "SKLearnSVMRegression": SKLearnSVMRegression(),
-        "AutogluonAutoML": AutogluonAutoML(),
+        "AutogluonAutoML": AutogluonAutoML()
        # ADD OTHER MODELS HERE
    }

@ -149,9 +149,6 @@ class AutogluonAutoML:
        "time_limit",
        "presets",
        "excluded_model_types",
-        "infer_limit",
-        "infer_limit_batch_size",
-        "ag_args_ensemble",
    ]

    def load_model(self, path: Union[Path, str]) -> None:
@ -206,9 +203,6 @@ class AutogluonAutoML:
            time_limit=model_hyperparameters["time_limit"],
            presets=model_hyperparameters["presets"],
            excluded_model_types=model_hyperparameters["excluded_model_types"],
-            infer_limit=model_hyperparameters["infer_limit"],
-            infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
-            ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
        )

    def predict(
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -1,46 +1,26 @@
 schema: '2.0'
 stages:
-  startup_cleanup:
-    cmd: python 0_startup_cleanup.py
-    deps:
-    - path: 0_startup_cleanup.py
-      hash: md5
-      md5: b1b12f6b6393fbf8b83d23684df0a3d4
-      size: 1220
-    params:
-      configs/settings.yaml:
-        default.startup_cleanup.artefacts: ./data
-        default.startup_cleanup.metrics: ./metrics
  prepare_data:
    cmd: python 1_prepare_data.py
    deps:
    - path: 1_prepare_data.py
      hash: md5
-      md5: 11a3b8bfdfe199ab7ecc39ccc5652649
-      size: 4298
+      md5: c9f030df733e318b80d1fa91b7732f79
+      size: 5132
    params:
      configs/settings.yaml:
        default.feature_processor.feature_processor_config.drop_columns:
-        - heat_demand_change
-        - carbon_change
-        - rdsap_change
-        - heat_demand_ending
-        - carbon_ending
-        - days_to_starting
-        - days_to_ending
-        - number_habitable_rooms_starting
-        - number_habitable_rooms_ending
-        - number_heated_rooms_starting
-        - number_heated_rooms_ending
-        - number_habitable_rooms
-        - number_heated_rooms
+        - HEAT_DEMAND_CHANGE
+        - CARBON_CHANGE
+        - RDSAP_CHANGE
+        - HEAT_DEMAND_ENDING
+        - CARBON_ENDING
        default.feature_processor.feature_processor_config.retain_features:
        default.feature_processor.feature_processor_config.subsample_amount:
        default.feature_processor.feature_processor_config.subsample_seed: 0
-        default.feature_processor.feature_processor_config.target: sap_ending
+        default.feature_processor.feature_processor_config.target: SAP_ENDING
        default.feature_processor.feature_processor_type: dataframe
-        default.prepare_data.data_filepath:
-          s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
+        default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
        default.prepare_data.input_dataclient_type: aws-s3
        default.prepare_data.output_dataclient_type: local
        default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -49,20 +29,20 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 45056059
+      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      size: 33881619
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
    deps:
    - path: 2_build_model.py
      hash: md5
-      md5: 7231450b78920b0c5e7c6bada496b24a
-      size: 4820
+      md5: 84699d208874c52accaff61c6af9bb0a
+      size: 5359
    - path: data/prepared_data
      hash: md5
-      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 45056059
+      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      size: 33881619
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -71,7 +51,6 @@ stages:
            model_type: AutogluonAutoML
            model_save_filepath: ./data/model/optimised/
            fit_metrics_filepath: ./metrics/fit_metrics.json
-            fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
            SKLearnLinearRegression:
            SKLearnSVMRegression:
              kernel: linear
@ -79,49 +58,37 @@ stages:
              output_filepath: ./data/model/allmodels/
              problem_type: regression
              eval_metric: mean_squared_error
-              time_limit: 1800
+              time_limit: 4000
              presets: medium_quality
              excluded_model_types:
-              - RF
-              - CAT
-              - NN_TORCH
              - KNN
-              - XT
-              infer_limit: 0.05
-              infer_limit_batch_size: 10000
-              ag_args_ensemble:
-                num_folds_parallel: 2
+              - RF
    outs:
-    - path: data/fit_predictions/
-      hash: md5
-      md5: d9c9afc05e8780db47c0548b19bf7d19.dir
-      size: 3349989
-      nfiles: 1
    - path: data/model/
      hash: md5
-      md5: 13c3100e1486c27a83a8a47491077842.dir
-      size: 773523079
-      nfiles: 36
+      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      size: 473398662
+      nfiles: 27
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
-      size: 224
+      md5: 2bb16ac67de8778fbc08171d562b34d5
+      size: 184
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
    - path: 3_generate_predictions.py
      hash: md5
-      md5: 0a70ad4dfe99414a75d1261c75a177b9
-      size: 2464
+      md5: 5ef2856a5a977304f1ec01f9b4205262
+      size: 3028
    - path: data/model
      hash: md5
-      md5: 13c3100e1486c27a83a8a47491077842.dir
-      size: 773523079
-      nfiles: 36
+      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      size: 473398662
+      nfiles: 27
    - path: data/prepared_data
      hash: md5
-      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 45056059
+      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      size: 33881619
      nfiles: 2
    params:
      configs/settings.yaml:
@ -133,25 +100,25 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
-      size: 463197
+      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      size: 374532
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
    deps:
    - path: 4_generate_metrics.py
      hash: md5
-      md5: 4fedb86d89d528f0a6597934ba3890a0
-      size: 3484
+      md5: 2c9fb78955a8c19cff0a098976f81d1b
+      size: 4487
    - path: data/predictions
      hash: md5
-      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
-      size: 463197
+      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      size: 374532
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 45056059
+      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      size: 33881619
      nfiles: 2
    params:
      configs/settings.yaml:
@ -161,30 +128,16 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 3e08df02fd5c5d094bcf936e1338d596
-      size: 223
-  generate_scenerio_metrics:
-    cmd: python 5_generate_scenarios.py
+      md5: 2e13ae67759a64261d03224f1c0d4bf4
+      size: 185
+  startup_cleanup:
+    cmd: python 0_startup_cleanup.py
    deps:
-    - path: 5_generate_scenarios.py
+    - path: 0_startup_cleanup.py
      hash: md5
-      md5: 40506749fefd926d47c60ff5b16db307
-      size: 5337
+      md5: fbb7e3b1b98b517c870f3e1df3e7f695
+      size: 1676
    params:
-      configs/scenarios.yaml:
-        default.scenarios:
-          input_dataclient_type: aws-s3
-          output_dataclient_type: local
-          scenario_data_filepaths:
-          - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
-          comparison_output_filepath: ./metrics/scenario_table.md
-          metrics_output_filepath: ./metrics/scenario_metrics.md
-    outs:
-    - path: metrics/scenario_metrics.md
-      hash: md5
-      md5: fa4d6d7bbd7818613800da5f8f37ea96
-      size: 363
-    - path: metrics/scenario_table.md
-      hash: md5
-      md5: d6baf100a1623cc2467c2f8221d314c9
-      size: 2133
+      configs/settings.yaml:
+        default.startup_cleanup.artefacts: ./data
+        default.startup_cleanup.metrics: ./metrics
--- a/modules/ml-pipeline/src/pipeline/dvc.yaml
+++ b/modules/ml-pipeline/src/pipeline/dvc.yaml
@ -38,7 +38,6 @@ stages:
    - configs/build_model.yaml:
    outs:
    - data/model/
-    - data/fit_predictions/
    - metrics/fit_metrics.json
    always_changed: true
  generate_predictions:
@ -71,17 +70,6 @@ stages:
    outs:
    - metrics/metrics.json
    always_changed: true
-  generate_scenerio_metrics:
-    cmd: python 5_generate_scenarios.py
-    deps:
-    - 5_generate_scenarios.py
-    params:
-    - configs/scenarios.yaml:
-      - default.scenarios
-    outs:
-    - metrics/scenario_table.md
-    - metrics/scenario_metrics.md
-    always_changed: true
 metrics:
  - metrics/metrics.json
  - metrics/fit_metrics.json
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -190,35 +190,28 @@ prediction_analysis_params = settings.prediction_analysis
 model = model_factory(build_model_params["model_type"])
 model.load_model(build_model_params["model_save_filepath"])
 dataclient_type = prediction_analysis_params["dataclient_type"]
-# dataclient_type = 'aws-s3'
-# dataclient = dataclient_factory(
-#     dataclient_type=dataclient_type,
-#     dataclient_config=client_params[dataclient_type],
-# )
-# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet")
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)

 target = feature_process_params["feature_processor_config"]["target"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 output_test_filepath = prepare_data_params["output_test_filepath"]
 predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]

-# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet")
-
-
-local_dataclient = dataclient_factory(
-    dataclient_type="local",
-    dataclient_config=client_params["local"],
-)
-test_df = local_dataclient.load_data(output_test_filepath)
-predictions = local_dataclient.load_data(predictions_output_filepath)
+test_df = dataclient.load_data(output_test_filepath)
+predictions = dataclient.load_data(predictions_output_filepath)
 mix_df = pd.concat([test_df.copy(), predictions], axis=1)
 mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)

-cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
+cosine_similarity_df = mix_df[
+    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
+]
 from sklearn.metrics.pairwise import cosine_similarity

-row_index = 0
+row_index = 58199

 from sklearn.preprocessing import LabelEncoder

@ -232,17 +225,7 @@ feature_vector = cosine_similarity_df.loc[[row_index]]

 cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
 similar_index = (
-    cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index
+    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
 )

 check_df = mix_df.loc[similar_index]
-
-columns_to_check = [
-    "LOW_ENERGY_LIGHTING_ENDING",
-    "walls_thermal_transmittance_ENDING",
-    "floor_thermal_transmittance_ENDING",
-    "roof_thermal_transmittance_ENDING",
-    "roof_insulation_thickness_ENDING",
-]
-
-cosine_similarity_df = mix_df[columns_to_check]
--- a/modules/ml-pipeline/src/pipeline/generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py
@ -14,23 +14,28 @@ def generate_predictions(
    test_data_filepath: str,
    predictions_output_filepath: str,
    predictions_column_name: str,
-    identifier_column: str = "id",
 ):
    """
    For a given model, we generate prediction and evaluate this against the true target
    """

+    logger.info("-------------------------")
    logger.info("--- Loading test data ---")
+    logger.info("-------------------------")

    test_data = input_dataclient.load_data(
        location=test_data_filepath, load_config=None
    )

+    logger.info("---------------------")
    logger.info("--- Loading model ---")
+    logger.info("---------------------")

    model.load_model(model_filepath)

+    logger.info("------------------------------")
    logger.info("--- Generating predictions ---")
+    logger.info("------------------------------")

    prediction_data = (
        test_data.drop(columns=target) if target in test_data.columns else test_data
@ -40,17 +45,13 @@ def generate_predictions(
        data=prediction_data, post_prediction_logic=post_prediction_logic
    )

+    logger.info("--------------------------")
    logger.info("--- Saving predictions ---")
+    logger.info("--------------------------")

    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = [predictions_column_name]

-    output_df = (
-        pd.concat([test_data[identifier_column], predictions_df], axis=1)
-        if identifier_column in test_data.columns
-        else predictions_df
-    )
-
    output_dataclient.save_data(
-        obj=output_df, location=predictions_output_filepath, save_config=None
+        obj=predictions_df, location=predictions_output_filepath, save_config=None
    )
--- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore
+++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore
@ -1,4 +1,2 @@
 /fit_metrics.json
 /metrics.json
-/scenario_table.md
-/scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
@ -1,7 +1,7 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
+pandas==1.5.3
+autogluon==0.8.2
+dynaconf==3.2.0
 pyarrow==13.0.0
 pre-commit==3.3.3
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
@ -1,7 +1,7 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
+pandas==1.5.3
+autogluon==0.8.2
+dynaconf==3.2.0
 pyarrow==13.0.0
 PyYAML==6.0.1
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
@ -1,10 +1,9 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-ray==2.6.3
-dynaconf==3.2.1
-alibi==0.9.5
+pandas==1.5.3
+autogluon==0.8.2
+dynaconf==3.2.0
+alibi==0.9.4
 shap==0.42.1
 pyarrow==13.0.0
 pre-commit==3.3.3
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
@ -1,4 +1,4 @@
 boto3==1.28.41
-pandas==2.1.4
-autogluon.tabular[all]==1.0.0
-dynaconf==3.2.1
+pandas==1.5.3
+autogluon==0.8.2
+dynaconf==3.2.0
--- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
@ -1,4 +1,4 @@
-dvc==3.51.0
-dvc-s3==3.2.0
-gto==1.7.1
-pyOpenSSL==23.3.0
+dvc==3.18.0
+dvc-s3==2.23.0
+gto==1.0.4
+pyOpenSSL==23.2.0