Merge pull request #117 from Hestia-Homes/sap-dev

Sap dev
Update Registry
2026-06-08 11:17:25 +00:00 · 2024-05-30 20:18:25 +01:00 · 2024-05-30 11:47:46 +00:00 · 2024-05-30 11:47:04 +00:00 · 2024-05-30 12:46:28 +01:00 · 2024-05-28 19:59:07 +01:00
45 changed files with 580 additions and 243 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,9 @@
 modules/ml-pipeline/src/pipeline/data/predictions
 modules/ml-pipeline/src/pipeline/data/fit_predictions
 modules/ml-pipeline/src/pipeline/data/prepared_data
 modules/ml-pipeline/src/pipeline/data/model/allmodels
 modules/ml-pipeline/src/pipeline/metrics
 modules/ml-pipeline/src/pipeline/__pycache__
 modules/ml-pipeline/src/pipeline/.dvc
 modules/ml-pipeline/src/pipeline/analysis
 modules/ml-pipeline/src/pipeline/metrics
--- a/.github/workflows/Deploy.yml
+++ b/.github/workflows/Deploy.yml
@ -2,7 +2,7 @@ name: Sap Change Model Deploy
 on:
  push:
-    branches: [ sap-dev, sap-prod ]
+    branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod]
 jobs:
  deploy:
@ -19,8 +19,8 @@ jobs:
      - name: Install Serverless and plugins
        run: |
-          npm install -g serverless
+          npm install -g serverless@^3.38.0
-          npm install -g serverless-domain-manager
+          npm install -g serverless-domain-manager@^7.3.8
      - name: Install DVC
        run: |
@ -32,8 +32,7 @@ jobs:
        id: secret_prefix
        run: |
            # Convert branch name to uppercase and replace hyphens with underscores
-            # TODO: Remove the CHANGE line by changing the secrets name
+            echo "::set-output name=secret_prefix::$(echo "${{ github.ref_name }}" | tr 'a-z-' 'A-Z_')"
            echo "::set-output name=secret_prefix::$(echo "${{ github.ref_name }}" | sed  's/-/_change-/g' | tr 'a-z-' 'A-Z_')"
      - name: Set domain name
        id: set_domain
@ -58,9 +57,9 @@ jobs:
          # Take branch prefix and add "model" for stack name
          stack_name=$( echo ${{ github.ref_name }} | awk -F"-" '{print $1}' | sed 's/$/model/g')
          if [ -z "${stack_name}" ]; then
            echo "::set-output name=stack_name::${stack_name}"
          else
            echo "::set-output name=stack_name::"
          else
            echo "::set-output name=stack_name::${stack_name}"
          fi
      - name: Set runtime_environment
--- a/.github/workflows/MLPipelinePostMerge.yml
+++ b/.github/workflows/MLPipelinePostMerge.yml
@ -42,7 +42,14 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="1.0.0"
        else
-          increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN {
              FS="\\."   # Set the field separator to a period
              OFS="."    # Set the output field separator to a period
          }
          {
              major = $1 + 1   # Increment the major version
              print major, "0", "0"   # Print the new version
          }')
        fi
        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -80,7 +87,14 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="0.1.0"
        else
-          increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN {
              FS="\\."   # Set the field separator to a period
              OFS="."    # Set the output field separator to a period
          }
          {
              minor = $2 + 1   # Increment the minor version
              print $1, minor, "0"   # Print the new version
          }')
        fi
        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -118,7 +132,14 @@ jobs:
        if [ -z "${latest_version}" ]; then
          increment_version="0.0.1"
        else
-          increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}')
+          increment_version=$(echo ${latest_version} | awk 'BEGIN {
              FS="\\."   # Set the field separator to a period
              OFS="."    # Set the output field separator to a period
          }
          {
              patch = $3 + 1   # Increment the patch version
              print $1, $2, patch   # Print the new version
          }')
        fi
        new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -188,7 +209,7 @@ jobs:
        git config user.name "Github-Bot"
        git config user.email "Github-Bot@no-reply.com"
-        latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/')
+        latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}')
        if [ -z "${latest_dev_version}" ]; then
          increment_version="1"
        else
@ -196,7 +217,7 @@ jobs:
        fi
        new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version}
-        latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}')
+        latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}')
        echo ${new_tag}
--- a/.github/workflows/MLPipelinePullRequest.yml
+++ b/.github/workflows/MLPipelinePullRequest.yml
@ -98,6 +98,16 @@ jobs:
        git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
        dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
        echo "## Scenario comparison" >> report.md
        cat metrics/scenario_table.md >> report.md
        echo "" >> report.md
        echo "## Scenario metrics" >> report.md
        cat metrics/scenario_metrics.md >> report.md
        cml comment create report.md
        # echo "## Residuals plot from model" >> report.md
--- a/MODEL_REGISTRY.md
+++ b/MODEL_REGISTRY.md
@ -8,9 +8,25 @@
        "active": true
    },
    "sap": {
-        "version": "v0.0.1",
+        "version": "v0.14.0",
        "stage": {
-            "dev": "v0.0.1"
+            "dev": "v0.14.0"
        },
        "registered": true,
        "active": true
    },
    "heat": {
        "version": "v0.5.0",
        "stage": {
            "dev": "v0.5.0"
        },
        "registered": true,
        "active": true
    },
    "carbon": {
        "version": "v0.5.0",
        "stage": {
            "dev": "v0.5.0"
        },
        "registered": true,
        "active": true
--- a/README.md
+++ b/README.md
@ -10,9 +10,9 @@ tracking and a model registry
 	- A bolt-on service that can implement model monitoring
 There are multiple protected branches which adapt the generic pipeline to produce different models:
- sap_change-**
+- sap-{dev/staging/prod}-**
- heat_change-**
+- heat-{dev/staging/prod}-**
- carbon_change-**
+- carbon-{dev/staging/prod}-**
 These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline
 - There can be different additional logic for each branch but the pipeline will be the same.
@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser
 secrets. Each different model and protected branch has its own set of secrets which allows for flexibility
 between different pipelines.
-For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are:
+For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are:
 - {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the
   sap change model this is the lambda-sap-prediction-dev repository.
@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe
 2. command to build the Docker image:
 ```bash
-docker build -t sap_change -f deployment/Dockerfile.prediction.lambda .
+docker build -t sap -f deployment/Dockerfile.prediction.lambda .
 ```
 This will build a Docker image tagged as sap using the Dockerfile.prediction.lambda located
@ -68,7 +68,7 @@ in the deployment directory.
 Once the image is built, you can run it using the following command:
 ```bash
-docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change
+docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap
 ```
 This command does the following:
@ -79,6 +79,7 @@ Sets the RUNTIME_ENVIRONMENT variable to dev.
 To test the Lambda function, use the following curl command:
 ```bash
-curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"'
+curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}'
 ```
 This will send a POST request to the running Lambda function and pass in the required data as JSON.
--- a/deployment/.dockerignore
+++ b/deployment/.dockerignore
@ -0,0 +1,9 @@
 modules/ml-pipeline/src/pipeline/data/predictions
 modules/ml-pipeline/src/pipeline/data/fit_predictions
 modules/ml-pipeline/src/pipeline/data/prepared_data
 modules/ml-pipeline/src/pipeline/data/model/allmodels
 modules/ml-pipeline/src/pipeline/metrics
 modules/ml-pipeline/src/__pycache__
 modules/ml-pipeline/src/.dvc
 modules/ml-pipeline/src/analysis
 modules/ml-pipeline/src/metrics
--- a/deployment/Dockerfile.prediction.lambda
+++ b/deployment/Dockerfile.prediction.lambda
@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
 ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
 # Install necessary build tools - required to test locally
-RUN yum install -y gcc python3-devel
+RUN yum install -y gcc python3-devel gcc-c++
 # Install python packages
 COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
--- a/deployment/handlers/prediction_app.py
+++ b/deployment/handlers/prediction_app.py
@ -69,9 +69,7 @@ def handler(event, context):
        storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet"
        logger.info("-------------------------")
        logger.info(f"--- Initiate MLModel ---")
        logger.info("-------------------------")
        build_model_params = settings.build_model
        client_params = settings.client
@ -80,17 +78,13 @@ def handler(event, context):
        model = model_factory(build_model_params["model_type"])
        logger.info("----------------------------")
        logger.info(f"--- Initiate Input DataClient ---")
        logger.info("----------------------------")
        input_dataclient = dataclient_factory(
            dataclient_type="aws-s3",
            dataclient_config=client_params["aws-s3"],
        )
        logger.info("----------------------------")
        logger.info(f"--- Initiate Output DataClient ---")
        logger.info("----------------------------")
        output_dataclient = dataclient_factory(
            dataclient_type="aws-s3",
            dataclient_config=client_params["aws-s3"],
@ -107,6 +101,7 @@ def handler(event, context):
            predictions_column_name=generate_predictions_params[
                "predictions_column_name"
            ],
            identifier_column=generate_predictions_params["identifier_column"],
        )
        return {
--- a/modules/ml-pipeline/.dvc/.gitignore
+++ b/modules/ml-pipeline/.dvc/.gitignore
@ -1,3 +0,0 @@
 /config.local
 /tmp
 /cache
--- a/modules/ml-pipeline/.dvc/config
+++ b/modules/ml-pipeline/.dvc/config
@ -1,2 +0,0 @@
 ['remote "myremote"']
    url = /tmp/dvcstore
--- a/modules/ml-pipeline/.dvcignore
+++ b/modules/ml-pipeline/.dvcignore
@ -1,3 +0,0 @@
 # Add patterns of files dvc should ignore, which could improve
 # the performance. Learn more at
 # https://dvc.org/doc/user-guide/dvcignore
--- a/modules/ml-pipeline/.gto
+++ b/modules/ml-pipeline/.gto
@ -1,2 +0,0 @@
 # .gto config file
 stages: [dev, stage, prod] # list of allowed Stages
--- a/modules/ml-pipeline/Makefile
+++ b/modules/ml-pipeline/Makefile
@ -9,16 +9,16 @@ init: dev-conda
 .PHONY: dev-conda
 dev-conda:
 	# conda deactivate || echo "Not in conda environment"
-	# conda remove --name $CONDA_ENV --all -y || echo "No environment created previously"
+	# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
-	conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y
+	conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
 	conda init bash
-	conda run -vvvv -n $CONDA_ENV pip install --upgrade pip
+	conda run -v -n ${CONDA_ENV} pip install --upgrade pip
-	conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt
+	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
-	conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt
+	conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
-	conda run -vvvv -n $CONDA_ENV pre-commit install
+	conda run -v -n ${CONDA_ENV} pre-commit install
-	conda run -vvvv -n $CONDA_ENV pip install ipykernel
+	conda run -v -n ${CONDA_ENV} pip install ipykernel
 	echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
-	echo "conda activate $CONDA_ENV"
+	echo "conda activate ${CONDA_ENV}"
 .PHONY: dev-pyenv
--- a/modules/ml-pipeline/src/.dockerignore
+++ b/modules/ml-pipeline/src/.dockerignore
@ -0,0 +1,8 @@
 pipeline/data/predictions
 pipeline/data/fit_predictions
 pipeline/data/prepared_data/train.parquet
 pipeline/data/fit_predictions
 pipeline/data/model/allmodels
 pipeline/metrics
 pipeline/.dvc
 pipeline/analysis
--- a/modules/ml-pipeline/src/Prediction.Dockerfile
+++ b/modules/ml-pipeline/src/Prediction.Dockerfile
@ -1,7 +1,7 @@
 # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
 FROM python:3.10.12-slim
-RUN apt-get update && apt-get install -y libgomp1
+RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
 COPY pipeline/requirements/predictions/requirements.txt requirements.txt
--- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
+++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
    Remove the directory where artefacts are stored
    """
    logger.info("---------------------")
    logger.info(f"--- Run Clean up ---")
    logger.info("---------------------")
    logger.info("-------------------------")
    logger.info(f"--- Delete artefacts ---")
    logger.info("-------------------------")
    artefact_directory_path = Path(artefacts_directory)
@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
        logger.info(f"Removing the directory: {artefacts_directory}")
        shutil.rmtree(artefact_directory_path)
    logger.info("-----------------------")
    logger.info(f"--- Delete metrics ---")
    logger.info("-----------------------")
    metrics_directory_path = Path(metrics_directory)
@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
 if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")
    run_cleanup(
        artefacts_directory=startup_cleanup_params["artefacts"],
        metrics_directory=startup_cleanup_params["metrics"],
    )
    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py
@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory
 from core.FeatureProcessor import feature_processor_factory
 from config import settings
 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
 logger.info("----------------------------")
 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"]
 output_test_filepath = prepare_data_params["output_test_filepath"]
 feature_processor_config = feature_process_params["feature_processor_config"]
 logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
 logger.info("----------------------------")
 input_dataclient_type = prepare_data_params["input_dataclient_type"]
 output_dataclient_type = prepare_data_params["output_dataclient_type"]
@ -49,9 +45,7 @@ output_dataclient = dataclient_factory(
    dataclient_config=client_params[output_dataclient_type],
 )
 logger.info("----------------------------------")
 logger.info(f"--- Initiate FeatureProcessor ---")
 logger.info("----------------------------------")
 feature_processor = feature_processor_factory(
    feature_process_params["feature_processor_type"]
@ -76,15 +70,11 @@ def prepare_data(
    :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
    """
    logger.info("--------------------")
    logger.info("--- Loading data ---")
    logger.info("--------------------")
    data = input_dataclient.load_data(location=data_filepath, load_config={})
    logger.info("--------------------------")
    logger.info("--- Feature Processing ---")
    logger.info("--------------------------")
    data = feature_processor.feature_process(
        data,
@ -93,13 +83,12 @@ def prepare_data(
        new_feature_funcs=new_feature_funcs,
    )
    logger.info("----------------------")
    logger.info("--- Splitting data ---")
    logger.info("----------------------")
    if train_proportion == 1:
        train = data
-        test = None
+        # Sample 10% of the data for testing
        test = data.sample(round(len(data) * 0.1))
    else:
        train, test = train_test_split(
            data, train_size=train_proportion, test_size=(1 - train_proportion)
@ -108,9 +97,7 @@ def prepare_data(
    train = train.reset_index(drop=True)
    logger.info("-----------------------")
    logger.info("--- Outputting data ---")
    logger.info("-----------------------")
    output_dataclient.save_data(
        obj=train, location=output_train_filepath, save_config=None
@ -126,13 +113,9 @@ def prepare_data(
 if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")
    logger.info("---------------------------")
    logger.info(f"--- Prepare Data Stage ---")
    logger.info("---------------------------")
    prepare_data(
        input_dataclient=input_dataclient,
@ -147,6 +130,4 @@ if __name__ == "__main__":
        new_feature_funcs=new_feature_funcs,
    )
    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/2_build_model.py
+++ b/modules/ml-pipeline/src/pipeline/2_build_model.py
@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory
 from configs.post_prediction_logic import post_prediction_logic
 from config import settings
 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
 logger.info("----------------------------")
 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -28,9 +26,12 @@ prepare_data_params = settings.prepare_data
 build_model_params = settings.build_model
 feature_process_params = settings.feature_processor
 generate_metrics_params = settings.generate_metrics
 generate_predictions_params = settings.generate_predictions
 model_type = build_model_params["model_type"]
 target = feature_process_params["feature_processor_config"]["target"]
 fit_predictions_filepath = build_model_params["fit_predictions_filepath"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 identifier_columns = feature_process_params["feature_processor_config"][
    "identifier_columns"
 ]
@ -40,22 +41,16 @@ train_filepath = prepare_data_params["output_train_filepath"]
 test_filepath = prepare_data_params["output_test_filepath"]
 fit_metrics_filepath = build_model_params["fit_metrics_filepath"]
 logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
 logger.info("----------------------------")
 # Output of previous prepare data step, will be where the data is
 dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])
 logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
 logger.info("-------------------------")
 model = model_factory(model_type)
 logger.info("-------------------------")
 logger.info(f"--- Initiate Metrics ---")
 logger.info("-------------------------")
 metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -68,6 +63,8 @@ def build_model(
    identifier_columns: List[str],
    model_save_location: str,
    model_hyperparameters: dict,
    fit_predictions_filepath: str,
    predictions_column_name: str,
    fit_metrics_filepath: str,
    train_filepath: Union[str, None] = None,
    test_filepath: Union[str, None] = None,
@ -75,9 +72,7 @@ def build_model(
    test_data: Union[pd.DataFrame, None] = None,
    pipeline_mode: bool = False,
 ):
    logger.info("--------------------------------------")
    logger.info("--- Loading Data for build process ---")
    logger.info("--------------------------------------")
    if train_data is None:
        if train_filepath is None:
@ -89,9 +84,7 @@ def build_model(
            raise ValueError(f"Need {test_filepath} if no data supplied")
        test_data = dataclient.load_data(location=test_filepath, load_config=None)
    logger.info("----------------------")
    logger.info("--- Training model ---")
    logger.info("----------------------")
    model.train_model(
        data=train_data.drop(columns=identifier_columns),
@ -99,32 +92,33 @@ def build_model(
        model_hyperparameters=model_hyperparameters,
    )
    logger.info("----------------------------------")
    logger.info("--- Generating fit predictions ---")
    logger.info("----------------------------------")
    fit_predictions = model.predict(
        data=train_data, post_prediction_logic=post_prediction_logic
    )
-    logger.info("------------------------------")
+    logger.info("--- Saving fit predictions ---")
    predictions_df = pd.DataFrame(fit_predictions)
    predictions_df.columns = [predictions_column_name]
    dataclient.save_data(
        obj=predictions_df, location=fit_predictions_filepath, save_config=None
    )
    logger.info("--- Generating fit metrics ---")
    logger.info("------------------------------")
    metrics_output = metrics.generate_metrics(
        target=train_data[target],
        predictions=pd.Series(fit_predictions),
    )
    logger.info("--------------------")
    logger.info("--- Saving model ---")
    logger.info("--------------------")
    model.save_model(path=Path(model_save_location))
    logger.info("--------------------------")
    logger.info("--- Saving fit metrics ---")
    logger.info("--------------------------")
    dataclient.save_data(
        obj=metrics_output, location=fit_metrics_filepath, save_config=None
@ -133,13 +127,9 @@ def build_model(
 if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")
    logger.info("--------------------------")
    logger.info(f"--- Build Model Stage ---")
    logger.info("--------------------------")
    build_model(
        dataclient=dataclient,
@ -152,8 +142,8 @@ if __name__ == "__main__":
        train_filepath=train_filepath,
        test_filepath=test_filepath,
        fit_metrics_filepath=fit_metrics_filepath,
        fit_predictions_filepath=fit_predictions_filepath,
        predictions_column_name=predictions_column_name,
    )
    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
@ -10,9 +10,7 @@ from core.Logger import logger
 from config import settings
 from generate_predictions import generate_predictions
 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
 logger.info("----------------------------")
 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"]
 predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
 logger.info("-------------------------")
 model = model_factory(build_model_params["model_type"])
 logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
 logger.info("----------------------------")
 # We may have different locations of loading hence why we use one specified in generate_predictions.yaml
 # I.e. for metric runs, this will be a local data client
@ -59,13 +53,9 @@ output_dataclient = dataclient_factory(
 if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")
    logger.info("----------------------------------")
    logger.info(f"--- Generate Predictions Stage---")
    logger.info("----------------------------------")
    generate_predictions(
        input_dataclient=input_dataclient,
@ -78,6 +68,4 @@ if __name__ == "__main__":
        predictions_column_name=predictions_column_name,
    )
    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory
 from core.Logger import logger
 from config import settings
 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
 logger.info("----------------------------")
 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -35,16 +33,11 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]
 logger.info("-------------------------")
 logger.info(f"--- Initiate MLModel ---")
 logger.info("-------------------------")
 model = model_factory(build_model_params["model_type"])
 logger.info("----------------------------")
 logger.info(f"--- Initiate DataClient ---")
 logger.info("----------------------------")
 # Use data client for input and output, as we use dvc to cache later to the cloud
 dataclient_type = generate_metrics_params["dataclient_type"]
@ -53,9 +46,7 @@ dataclient = dataclient_factory(
    dataclient_config=client_params[dataclient_type],
 )
 logger.info("---------------------------")
 logger.info(f"--- Initiate MLMetrics ---")
 logger.info("---------------------------")
 metrics = metrics_factory(generate_metrics_params["metrics_type"])
@ -75,34 +66,26 @@ def generate_metrics(
    For a given model, we generate prediction and evaluate this against the true target
    """
    logger.info("-------------------------")
    logger.info("--- Loading test data ---")
    logger.info("-------------------------")
    test_data = input_dataclient.load_data(
        location=test_data_filepath, load_config=None
    )
    logger.info("---------------------------")
    logger.info("--- Loading predictions ---")
    logger.info("---------------------------")
    predictions = input_dataclient.load_data(
        location=predictions_output_filepath, load_config=None
    )
    logger.info("--------------------------")
    logger.info("--- Generating metrics ---")
    logger.info("--------------------------")
    metrics_output = metrics.generate_metrics(
        target=test_data[target],
        predictions=pd.Series(predictions[predictions_column_name]),
    )
    logger.info("----------------------")
    logger.info("--- Saving metrics ---")
    logger.info("----------------------")
    output_dataclient.save_data(
        obj=metrics_output, location=metrics_output_filepath, save_config=None
@ -111,13 +94,9 @@ def generate_metrics(
 if __name__ == "__main__":
    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")
    logger.info("------------------------------")
    logger.info(f"--- Generate Metrics Stage---")
    logger.info("------------------------------")
    generate_metrics(
        input_dataclient=dataclient,
@ -131,6 +110,4 @@ if __name__ == "__main__":
        metrics_output_filepath=metrics_output_filepath,
    )
    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
+++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py
@ -0,0 +1,162 @@
 """
 Fifth part of the pipeline:
 After the model is built and metrics are generated,
 we want to test this model against known scenarios
 """
 import os
 import pandas as pd
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceDataClient import DataClient
 from core.interface.InterfaceMetrics import MLMetrics
 from configs.post_prediction_logic import post_prediction_logic
 from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.MLMetrics import metrics_factory
 from core.Logger import logger
 from config import settings
 # ---------------------------------------------------------------------------
 # Module-level wiring: read the settings sections this stage needs, then build
 # the model wrapper, the data clients and the metrics object used by __main__.
 # ---------------------------------------------------------------------------
 logger.info("--- Initiate Parameters ---")
 RUNTIME_ENVIRONMENT = os.getenv("RUNTIME_ENVIRONMENT", "local")

 # Settings sections (loaded by the project's ``config.settings`` object)
 client_params = settings.client
 prepare_data_params = settings.prepare_data
 build_model_params = settings.build_model
 generate_predictions_params = settings.generate_predictions
 generate_metrics_params = settings.generate_metrics
 feature_process_params = settings.feature_processor
 scenarios_params = settings.scenarios

 # Individual values pulled out of those sections
 model_filepath = build_model_params["model_save_filepath"]
 target = feature_process_params["feature_processor_config"]["target"]
 scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 comparison_output_filepath = scenarios_params["comparison_output_filepath"]
 metrics_output_filepath = scenarios_params["metrics_output_filepath"]

 logger.info("--- Initiate MLModel ---")
 model = model_factory(build_model_params["model_type"])

 logger.info("--- Initiate DataClient ---")


 def _client_for(kind):
    """Build the data client registered under ``kind`` with its own client config."""
    return dataclient_factory(
        dataclient_type=kind,
        dataclient_config=client_params[kind],
    )


 # Separate clients for reading and writing, as we use dvc to cache later to the cloud
 input_dataclient_type = scenarios_params["input_dataclient_type"]
 input_dataclient = _client_for(input_dataclient_type)
 output_dataclient_type = scenarios_params["output_dataclient_type"]
 output_dataclient = _client_for(output_dataclient_type)

 logger.info("--- Initiate MLMetrics ---")
 metrics = metrics_factory(generate_metrics_params["metrics_type"])
 def generate_scenario_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    metrics: MLMetrics,
    model_filepath: str,
    scenario_data_filepaths: list,
    predictions_column_name: str,
    comparison_output_filepath: str,
    metrics_output_filepath: str,
 ):
    """
    Given the new model, generate predictions for the expected scenarios and
    compare the predicted impact against the known impact.

    Parameters
    ----------
    input_dataclient : client used to load every scenario data file
    output_dataclient : client used to save the comparison table and the metrics
    model : model wrapper; ``load_model`` is called with ``model_filepath``
    metrics : metrics object; ``generate_metrics(target, predictions)`` is called
    model_filepath : location of the saved model
    scenario_data_filepaths : locations of the scenario data files. ``None`` or
        an empty list means "no scenarios": empty outputs are saved and the
        function returns early.
    predictions_column_name : name given to the predictions column
    comparison_output_filepath : where the per-row comparison table is saved
    metrics_output_filepath : where the metric/value table is saved

    The scenario data must contain the columns ``sap_starting``, ``impact``,
    ``uprn`` and ``id``, which are read below.
    """
    logger.info("--- Loading Scenario Data ---")

    # If we have no scenario data (null or an empty list), save empty dataframes
    if not scenario_data_filepaths:
        logger.info("No scenario data filepaths provided")
        empty_output = pd.DataFrame()
        for location in (comparison_output_filepath, metrics_output_filepath):
            output_dataclient.save_data(
                obj=empty_output, location=location, save_config=None
            )
        return

    # Can have multiple scenario data files: load them all, then stack once.
    # ignore_index=True gives the stacked rows unique labels; without it every
    # file contributes its own 0..n labels and the column-wise concat with the
    # predictions further down can no longer align rows one-to-one.
    scenario_frames = [
        input_dataclient.load_data(scenario_data_filepath, load_config=None)
        for scenario_data_filepath in scenario_data_filepaths
    ]
    scenario_data = pd.concat(scenario_frames, ignore_index=True)

    logger.info("--- Loading Model ---")
    model.load_model(model_filepath)

    logger.info("--- Generating Predictions ---")
    predictions = model.predict(
        data=scenario_data, post_prediction_logic=post_prediction_logic
    )

    logger.info("--- Generate Scenario Predicted Impact ---")
    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = [predictions_column_name]
    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
    # Predicted impact is the absolute change from the starting SAP value
    scenario_data["predicted_impact"] = (
        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
    ).abs()

    logger.info("--- Generate Metrics ---")
    metrics_dict = metrics.generate_metrics(
        scenario_data["impact"], scenario_data["predicted_impact"]
    )
    # One row per metric: transpose the single-row frame into (metric, value)
    metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
    metrics_df.columns = ["metric", "value"]

    logger.info("--- Save prediction into metrics ---")
    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
    output_dataclient.save_data(
        obj=output_df, location=comparison_output_filepath, save_config=None
    )
    output_dataclient.save_data(
        obj=metrics_df, location=metrics_output_filepath, save_config=None
    )
 if __name__ == "__main__":
    # Script entry point: run the scenario stage with the module-level wiring.
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("--- Generate Scenario Predictions ---")

    stage_arguments = {
        "input_dataclient": input_dataclient,
        "output_dataclient": output_dataclient,
        "model": model,
        "metrics": metrics,
        "model_filepath": model_filepath,
        "scenario_data_filepaths": scenario_data_filepaths,
        "predictions_column_name": predictions_column_name,
        "comparison_output_filepath": comparison_output_filepath,
        "metrics_output_filepath": metrics_output_filepath,
    }
    generate_scenario_predictions(**stage_arguments)

    logger.info(f"--- {__file__} - Complete! ---")
--- a/modules/ml-pipeline/src/pipeline/README.md
+++ b/modules/ml-pipeline/src/pipeline/README.md
@ -37,3 +37,4 @@ Workflow:
    - This experiment will have the corresponding .dvc files for the hashed model and data
 - Use version control as normal
    - git add, git commit etc
 - To revert a change, run `git checkout {COMMIT_HASH}` followed by `git switch -c {NEW_BRANCH_NAME}`
--- a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
+++ b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
--- a/modules/ml-pipeline/src/pipeline/config.py
+++ b/modules/ml-pipeline/src/pipeline/config.py
@ -7,6 +7,7 @@ settings = Dynaconf(
        "./configs/settings.yaml",
        "./configs/build_model.yaml",
        "./configs/analysis.yaml",
        "./configs/scenarios.yaml",
    ],
 )
--- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
@ -13,4 +13,4 @@ default:
    dataclient_type: local
    nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
    n_val: 30  # how many datapoints from validation data should we interpret predictions for, larger values will be slower
-    row_index: [0, 10, 20] # index of an example datapoint
+    row_index: [20695, 50243, 7653] # index of an example datapoint
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -3,6 +3,7 @@ default:
    model_type: AutogluonAutoML
    model_save_filepath: ./data/model/optimised/
    fit_metrics_filepath: ./metrics/fit_metrics.json
    fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
    SKLearnLinearRegression: null
@ -13,6 +14,9 @@ default:
      output_filepath: ./data/model/allmodels/
      problem_type: regression
      eval_metric: mean_squared_error #mean_absolute_error
-      time_limit: 4000
+      time_limit: 1800
      presets: medium_quality
-      excluded_model_types: ['KNN', 'RF']
+      excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
      infer_limit: 0.05
      infer_limit_batch_size: 10000
      ag_args_ensemble: {'num_folds_parallel': 2}
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -9,15 +9,42 @@ Business Logic dict + functions
 def remove_starting_columns(df):
    keep_column_index = [
-        False if col_name.endswith("_STARTING") else True
+        False if col_name.endswith("_starting") else True
        for col_name in list(df.columns)
    ]
    keep_columns = df.columns[keep_column_index].to_list()
-    keep_columns.append("SAP_STARTING")
+    keep_columns.append("sap_starting")
    df = df[keep_columns]
    return df
 def remove_floor_height_ending(df, minimum_height=1.665):
    """
    Keep only rows whose ``floor_height_ending`` is strictly above ``minimum_height``.

    The default of 1.665 comes from
    ``df.describe(percentiles=[0.005, 0.99])['floor_height_ending']``, which
    showed the bottom 0.5 percentile to be 1.665, so anything above it is kept.

    Parameters
    ----------
    df : DataFrame containing a ``floor_height_ending`` column
    minimum_height : exclusive lower bound for ``floor_height_ending``

    Returns
    -------
    The filtered DataFrame with a fresh 0..n-1 index.
    """
    is_tall_enough = df["floor_height_ending"] > minimum_height
    return df[is_tall_enough].reset_index(drop=True)
 def remove_minimum_habitable_room_size(df):
    """
    Keep rows where ``total_floor_area_ending`` divided by
    ``number_habitable_rooms`` is strictly greater than 6.5, then reset the index.

    6.5 is the minimum area required per habitable room
    (presumably square metres - confirm against the data dictionary).
    """
    area_per_room = df["total_floor_area_ending"].div(df["number_habitable_rooms"])
    has_enough_space = area_per_room.gt(6.5)
    return df[has_enough_space].reset_index(drop=True)
 def keep_flats(df):
    """
    Keep only rows whose ``property_type`` equals "Flat".

    The original row labels are preserved (the index is not reset).
    """
    is_flat = df["property_type"].eq("Flat")
    return df[is_flat]
 def keep_non_zero_rdsap(df):
    """
    Drop rows whose ``rdsap_change`` is exactly zero.

    The original row labels are preserved (the index is not reset).
    """
    has_changed = df["rdsap_change"].ne(0)
    return df[has_changed]
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +54,10 @@ def remove_starting_columns(df):
 #     return df
 # Registry of optional business-logic steps: step name -> function taking a
 # DataFrame and returning the filtered DataFrame. Every entry is currently
 # commented out, so the dict is empty; uncomment a line to enable that step.
 # NOTE(review): presumably consumed by the feature processor - confirm the caller.
 # "keep_ENDING_COLUMNS" refers to keep_ending_columns, which is itself
 # commented out above and must be restored before that entry is enabled.
 business_logic = {
    # "keep_non_zero_rdsap": keep_non_zero_rdsap,
    # "keep_flats": keep_flats,
    # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
    # "remove_floor_height_ending": remove_floor_height_ending,
    # "remove_starting_columns": remove_starting_columns,
    # "keep_ENDING_COLUMNS": keep_ending_columns,
 }
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@ -5,16 +5,18 @@ import pandas as pd
 def clip_predictions_to_minimum_value(
-    data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
+    data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
 ) -> pd.Series:
    series_name = predictions.name
    predictions.name = "predictions"
    predictions_df = pd.concat([data, predictions], axis=1)
    # We expect all prediction to be atleast one point improvement
-    replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
+    replace_index = (
        predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
    )
    predictions_df.loc[replace_index, "predictions"] = (
-        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
+        predictions_df.loc[replace_index, "sap_starting"] + minimum_value
    )
    predictions_new = predictions_df["predictions"]
--- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
@ -0,0 +1,13 @@
 # Settings for the scenario stage (5_generate_scenarios.py reads them through
 # settings.scenarios).
 default:
  scenarios:
    # Names of the data clients used to load the scenario data and to save the
    # outputs; each name is also the key of that client's config under `client`.
    input_dataclient_type: aws-s3
    output_dataclient_type: local
    # Scenario data files to score. All listed files are loaded and stacked.
    # Older snapshots are kept commented out for reference. If every entry is
    # commented out the value is null, in which case the stage saves empty
    # outputs and exits.
    scenario_data_filepaths:
      # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
      - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
    # Per-row comparison table (uprn, id, impact, predicted_impact), saved as markdown
    comparison_output_filepath: ./metrics/scenario_table.md
    # Metric/value table for the scenario predictions, saved as markdown
    metrics_output_filepath: ./metrics/scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -18,10 +18,10 @@ default:
  prepare_data:
    input_dataclient_type: aws-s3
    output_dataclient_type: local
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
+    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
-    data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
+    data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
    train_proportion: 0.9
    output_train_filepath: ./data/prepared_data/train.parquet
    output_test_filepath: ./data/prepared_data/test.parquet
@ -31,11 +31,37 @@ default:
    feature_processor_config:
      subsample_amount: null
      subsample_seed: 0
-      target: SAP_ENDING
+      target: sap_ending
-      identifier_columns: ["UPRN"]
+      identifier_columns: ["uprn"]
-      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
+      # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
-      # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
+      drop_columns: [
        "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
        'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
        'number_habitable_rooms', 'number_heated_rooms']
      retain_features: null
      # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
      #  'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
      #  'walls_energy_eff_ending', 'secondheat_description_ending',
      #  'property_type', 'mainheatc_energy_eff_ending', 'built_form',
      #  'walls_insulation_thickness_ending', 'potential_energy_efficiency',
      #  'transaction_type_ending',
      #  'floor_thermal_transmittance_ending',
      #  'low_energy_lighting_ending', 'heat_demand_starting',
      #  'photo_supply_ending', 'carbon_starting',
      #  'walls_thermal_transmittance_ending',
      #  'roof_insulation_thickness_ending',
      #  'total_floor_area_ending', 'number_open_fireplaces_ending',
      #  'windows_energy_eff_ending',
      #  'floor_height_ending',
      #  'extension_count_ending',
      #  'has_air_source_heat_pump_ending',
      #  'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
      #  'roof_thermal_transmittance_ending',
      #  'floor_insulation_thickness_ending', 'has_mains_gas_ending',
      #  'estimated_perimeter_starting', 'energy_consumption_potential',
      #  'environment_impact_potential', 'heater_type_ending',
      #  'multi_glaze_proportion_ending',
      #  'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']
  generate_predictions:
    input_dataclient_type: local
@ -43,6 +69,7 @@ default:
    test_data_filepath: ./data/prepared_data/test.parquet
    predictions_output_filepath: ./data/predictions/predictions.parquet
    predictions_column_name: predictions
    identifier_column: id
  generate_metrics:
    dataclient_type: local
--- a/modules/ml-pipeline/src/pipeline/core/DataClient.py
+++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py
@ -142,9 +142,15 @@ class AWSS3Client:
        buffer = BytesIO()
        obj.to_parquet(buffer, index=False)
        # Reset the buffer position to the beginning
        buffer.seek(0)
        bucket, key = location.strip("s3://").split("/", 1)
        self.client.upload_fileobj(buffer, bucket, key)
        # Close the buffer
        buffer.close()
    def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame:
        """
        Load a parquet file
@ -239,7 +245,8 @@ class LocalClient:
        save_methods = {
            ".parquet": self._save_parquet,
-            ".json": self._save_json
+            ".json": self._save_json,
            ".md": self._save_md,
            # "": _save_directory(**save_config),
            # ADD MORE save_methods HERE
        }
@ -288,3 +295,10 @@ class LocalClient:
        # Write the contents of the buffer to the local file
        with open(location, "wb") as f:
            f.write(buffer.getvalue())
    def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
        """
        Save object as markdown
        """
        obj.to_markdown(location, **save_config)
--- a/modules/ml-pipeline/src/pipeline/core/Logger.py
+++ b/modules/ml-pipeline/src/pipeline/core/Logger.py
@ -21,6 +21,7 @@ def setup_logger():
    # Add the stream handler to the logger
    logger.addHandler(stream_handler)
    logger.propagate = False
    return logger
--- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods:
 - Generate Plot Suite
 """
 import numpy as np
 import pandas as pd
 from typing import Union
 from sklearn.metrics import (
@ -14,6 +15,18 @@ from sklearn.metrics import (
 )
 from core.interface.InterfaceMetrics import MLMetrics
 # Define the function to return the SMAPE value
 def symmetric_mape(actual, predicted) -> float:
    """
    Symmetric mean absolute percentage error (SMAPE), returned as a fraction.

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``.
    A term where both values are zero is a perfect prediction and counts
    as 0; computed naively it would be 0/0 = NaN and turn the whole mean
    into NaN.

    Parameters
    ----------
    actual : array-like of true values
    predicted : array-like of predicted values, same length as ``actual``

    Returns
    -------
    float : mean of the per-element terms (NaN for empty input)
    """
    # Convert to float arrays so lists, Series and arrays are handled alike
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    absolute_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2

    # Divide only where the scale is non-zero; the remaining terms stay at 0
    terms = np.zeros_like(absolute_error)
    np.divide(absolute_error, scale, out=terms, where=scale != 0)
    return float(np.mean(terms))
 def metrics_factory(metrics_type: str) -> MLMetrics:
    metrics = {
@ -34,7 +47,7 @@ class RegressionMetrics:
        median_absolute_error,
        mean_squared_error,
        mean_absolute_percentage_error,
-        # max_error
+        symmetric_mape,
    ]
    def generate_metrics(
--- a/modules/ml-pipeline/src/pipeline/core/MLModels.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py
@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
    models = {
        "SKLearnLinearRegression": SKLearnLinearRegression(),
        "SKLearnSVMRegression": SKLearnSVMRegression(),
-        "AutogluonAutoML": AutogluonAutoML()
+        "AutogluonAutoML": AutogluonAutoML(),
        # ADD OTHER MODELS HERE
    }
@ -149,6 +149,9 @@ class AutogluonAutoML:
        "time_limit",
        "presets",
        "excluded_model_types",
        "infer_limit",
        "infer_limit_batch_size",
        "ag_args_ensemble",
    ]
    def load_model(self, path: Union[Path, str]) -> None:
@ -203,6 +206,9 @@ class AutogluonAutoML:
            time_limit=model_hyperparameters["time_limit"],
            presets=model_hyperparameters["presets"],
            excluded_model_types=model_hyperparameters["excluded_model_types"],
            infer_limit=model_hyperparameters["infer_limit"],
            infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
            ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
        )
    def predict(
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -1,26 +1,46 @@
 schema: '2.0'
 stages:
  startup_cleanup:
    cmd: python 0_startup_cleanup.py
    deps:
    - path: 0_startup_cleanup.py
      hash: md5
      md5: b1b12f6b6393fbf8b83d23684df0a3d4
      size: 1220
    params:
      configs/settings.yaml:
        default.startup_cleanup.artefacts: ./data
        default.startup_cleanup.metrics: ./metrics
  prepare_data:
    cmd: python 1_prepare_data.py
    deps:
    - path: 1_prepare_data.py
      hash: md5
-      md5: c9f030df733e318b80d1fa91b7732f79
+      md5: 11a3b8bfdfe199ab7ecc39ccc5652649
-      size: 5132
+      size: 4298
    params:
      configs/settings.yaml:
        default.feature_processor.feature_processor_config.drop_columns:
-        - HEAT_DEMAND_CHANGE
+        - heat_demand_change
-        - CARBON_CHANGE
+        - carbon_change
-        - RDSAP_CHANGE
+        - rdsap_change
-        - HEAT_DEMAND_ENDING
+        - heat_demand_ending
-        - CARBON_ENDING
+        - carbon_ending
        - days_to_starting
        - days_to_ending
        - number_habitable_rooms_starting
        - number_habitable_rooms_ending
        - number_heated_rooms_starting
        - number_heated_rooms_ending
        - number_habitable_rooms
        - number_heated_rooms
        default.feature_processor.feature_processor_config.retain_features:
        default.feature_processor.feature_processor_config.subsample_amount:
        default.feature_processor.feature_processor_config.subsample_seed: 0
-        default.feature_processor.feature_processor_config.target: SAP_ENDING
+        default.feature_processor.feature_processor_config.target: sap_ending
        default.feature_processor.feature_processor_type: dataframe
-        default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
+        default.prepare_data.data_filepath:
          s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
        default.prepare_data.input_dataclient_type: aws-s3
        default.prepare_data.output_dataclient_type: local
        default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -29,20 +49,20 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 33881619
+      size: 45056059
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
    deps:
    - path: 2_build_model.py
      hash: md5
-      md5: 84699d208874c52accaff61c6af9bb0a
+      md5: 7231450b78920b0c5e7c6bada496b24a
-      size: 5359
+      size: 4820
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 33881619
+      size: 45056059
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -51,6 +71,7 @@ stages:
            model_type: AutogluonAutoML
            model_save_filepath: ./data/model/optimised/
            fit_metrics_filepath: ./metrics/fit_metrics.json
            fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
            SKLearnLinearRegression:
            SKLearnSVMRegression:
              kernel: linear
@ -58,37 +79,49 @@ stages:
              output_filepath: ./data/model/allmodels/
              problem_type: regression
              eval_metric: mean_squared_error
-              time_limit: 4000
+              time_limit: 1800
              presets: medium_quality
              excluded_model_types:
              - KNN
              - RF
              - CAT
              - NN_TORCH
              - KNN
              - XT
              infer_limit: 0.05
              infer_limit_batch_size: 10000
              ag_args_ensemble:
                num_folds_parallel: 2
    outs:
    - path: data/fit_predictions/
      hash: md5
      md5: d9c9afc05e8780db47c0548b19bf7d19.dir
      size: 3349989
      nfiles: 1
    - path: data/model/
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      md5: 13c3100e1486c27a83a8a47491077842.dir
-      size: 473398662
+      size: 773523079
-      nfiles: 27
+      nfiles: 36
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 2bb16ac67de8778fbc08171d562b34d5
+      md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
-      size: 184
+      size: 224
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
    - path: 3_generate_predictions.py
      hash: md5
-      md5: 5ef2856a5a977304f1ec01f9b4205262
+      md5: 0a70ad4dfe99414a75d1261c75a177b9
-      size: 3028
+      size: 2464
    - path: data/model
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      md5: 13c3100e1486c27a83a8a47491077842.dir
-      size: 473398662
+      size: 773523079
-      nfiles: 27
+      nfiles: 36
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 33881619
+      size: 45056059
      nfiles: 2
    params:
      configs/settings.yaml:
@ -100,25 +133,25 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
-      size: 374532
+      size: 463197
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
    deps:
    - path: 4_generate_metrics.py
      hash: md5
-      md5: 2c9fb78955a8c19cff0a098976f81d1b
+      md5: 4fedb86d89d528f0a6597934ba3890a0
-      size: 4487
+      size: 3484
    - path: data/predictions
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
-      size: 374532
+      size: 463197
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
-      size: 33881619
+      size: 45056059
      nfiles: 2
    params:
      configs/settings.yaml:
@ -128,16 +161,30 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 2e13ae67759a64261d03224f1c0d4bf4
+      md5: 3e08df02fd5c5d094bcf936e1338d596
-      size: 185
+      size: 223
-  startup_cleanup:
+  generate_scenerio_metrics:
-    cmd: python 0_startup_cleanup.py
+    cmd: python 5_generate_scenarios.py
    deps:
-    - path: 0_startup_cleanup.py
+    - path: 5_generate_scenarios.py
      hash: md5
-      md5: fbb7e3b1b98b517c870f3e1df3e7f695
+      md5: 40506749fefd926d47c60ff5b16db307
-      size: 1676
+      size: 5337
    params:
-      configs/settings.yaml:
+      configs/scenarios.yaml:
-        default.startup_cleanup.artefacts: ./data
+        default.scenarios:
-        default.startup_cleanup.metrics: ./metrics
+          input_dataclient_type: aws-s3
          output_dataclient_type: local
          scenario_data_filepaths:
          - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
          comparison_output_filepath: ./metrics/scenario_table.md
          metrics_output_filepath: ./metrics/scenario_metrics.md
    outs:
    - path: metrics/scenario_metrics.md
      hash: md5
      md5: fa4d6d7bbd7818613800da5f8f37ea96
      size: 363
    - path: metrics/scenario_table.md
      hash: md5
      md5: d6baf100a1623cc2467c2f8221d314c9
      size: 2133
--- a/modules/ml-pipeline/src/pipeline/dvc.yaml
+++ b/modules/ml-pipeline/src/pipeline/dvc.yaml
@ -38,6 +38,7 @@ stages:
    - configs/build_model.yaml:
    outs:
    - data/model/
    - data/fit_predictions/
    - metrics/fit_metrics.json
    always_changed: true
  generate_predictions:
@ -70,6 +71,17 @@ stages:
    outs:
    - metrics/metrics.json
    always_changed: true
  # Scores the trained model against known scenario data and writes the
  # comparison table and scenario metrics as markdown.
  generate_scenerio_metrics:
    cmd: python 5_generate_scenarios.py
    deps:
    - 5_generate_scenarios.py
    # The script loads the trained model from ./data/model/optimised/, so this
    # stage must be ordered after build_model and re-run when the model changes.
    - data/model
    params:
    - configs/scenarios.yaml:
      - default.scenarios
    outs:
    - metrics/scenario_table.md
    - metrics/scenario_metrics.md
    always_changed: true
 metrics:
  - metrics/metrics.json
  - metrics/fit_metrics.json
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -190,28 +190,35 @@ prediction_analysis_params = settings.prediction_analysis
 model = model_factory(build_model_params["model_type"])
 model.load_model(build_model_params["model_save_filepath"])
 dataclient_type = prediction_analysis_params["dataclient_type"]
-dataclient = dataclient_factory(
+# dataclient_type = 'aws-s3'
-    dataclient_type=dataclient_type,
+# dataclient = dataclient_factory(
-    dataclient_config=client_params[dataclient_type],
+#     dataclient_type=dataclient_type,
-)
+#     dataclient_config=client_params[dataclient_type],
 # )
 # data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet")
 target = feature_process_params["feature_processor_config"]["target"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]
 output_test_filepath = prepare_data_params["output_test_filepath"]
 predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
-test_df = dataclient.load_data(output_test_filepath)
+# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet")
-predictions = dataclient.load_data(predictions_output_filepath)
+
 local_dataclient = dataclient_factory(
    dataclient_type="local",
    dataclient_config=client_params["local"],
 )
 test_df = local_dataclient.load_data(output_test_filepath)
 predictions = local_dataclient.load_data(predictions_output_filepath)
 mix_df = pd.concat([test_df.copy(), predictions], axis=1)
 mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)
-cosine_similarity_df = mix_df[
+cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
 ]
 from sklearn.metrics.pairwise import cosine_similarity
-row_index = 58199
+row_index = 0
 from sklearn.preprocessing import LabelEncoder
@ -225,7 +232,17 @@ feature_vector = cosine_similarity_df.loc[[row_index]]
 cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
 similar_index = (
-    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
+    cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index
 )
 check_df = mix_df.loc[similar_index]
 columns_to_check = [
    "LOW_ENERGY_LIGHTING_ENDING",
    "walls_thermal_transmittance_ENDING",
    "floor_thermal_transmittance_ENDING",
    "roof_thermal_transmittance_ENDING",
    "roof_insulation_thickness_ENDING",
 ]
 cosine_similarity_df = mix_df[columns_to_check]
--- a/modules/ml-pipeline/src/pipeline/generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py
@ -14,28 +14,23 @@ def generate_predictions(
    test_data_filepath: str,
    predictions_output_filepath: str,
    predictions_column_name: str,
    identifier_column: str = "id",
 ):
    """
    For a given model, we generate prediction and evaluate this against the true target
    """
    logger.info("-------------------------")
    logger.info("--- Loading test data ---")
    logger.info("-------------------------")
    test_data = input_dataclient.load_data(
        location=test_data_filepath, load_config=None
    )
    logger.info("---------------------")
    logger.info("--- Loading model ---")
    logger.info("---------------------")
    model.load_model(model_filepath)
    logger.info("------------------------------")
    logger.info("--- Generating predictions ---")
    logger.info("------------------------------")
    prediction_data = (
        test_data.drop(columns=target) if target in test_data.columns else test_data
@ -45,13 +40,17 @@ def generate_predictions(
        data=prediction_data, post_prediction_logic=post_prediction_logic
    )
    logger.info("--------------------------")
    logger.info("--- Saving predictions ---")
    logger.info("--------------------------")
    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = [predictions_column_name]
-    output_dataclient.save_data(
+    output_df = (
-        obj=predictions_df, location=predictions_output_filepath, save_config=None
+        pd.concat([test_data[identifier_column], predictions_df], axis=1)
        if identifier_column in test_data.columns
        else predictions_df
    )
    output_dataclient.save_data(
        obj=output_df, location=predictions_output_filepath, save_config=None
    )
--- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore
+++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore
@ -1,2 +1,4 @@
 /fit_metrics.json
 /metrics.json
+/scenario_table.md
+/scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt
@ -1,7 +1,7 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==1.5.3
+pandas==2.1.4
-autogluon==0.8.2
+autogluon.tabular[all]==1.0.0
-dynaconf==3.2.0
+dynaconf==3.2.1
 pyarrow==13.0.0
 pre-commit==3.3.3
--- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt
@ -1,7 +1,7 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==1.5.3
+pandas==2.1.4
-autogluon==0.8.2
+autogluon.tabular[all]==1.0.0
-dynaconf==3.2.0
+dynaconf==3.2.1
 pyarrow==13.0.0
 PyYAML==6.0.1
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
@ -1,9 +1,10 @@
 joblib==1.3.2
 boto3==1.28.17
-pandas==1.5.3
+pandas==2.1.4
-autogluon==0.8.2
+autogluon.tabular[all]==1.0.0
-dynaconf==3.2.0
+ray==2.6.3
-alibi==0.9.4
+dynaconf==3.2.1
+alibi==0.9.5
 shap==0.42.1
 pyarrow==13.0.0
 pre-commit==3.3.3
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt
@ -1,4 +1,4 @@
 boto3==1.28.41
-pandas==1.5.3
+pandas==2.1.4
-autogluon==0.8.2
+autogluon.tabular[all]==1.0.0
-dynaconf==3.2.0
+dynaconf==3.2.1
--- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
@ -1,4 +1,4 @@
-dvc==3.18.0
+dvc==3.51.0
-dvc-s3==2.23.0
+dvc-s3==3.2.0
-gto==1.0.4
+gto==1.7.1
-pyOpenSSL==23.2.0
+pyOpenSSL==23.3.0
Author	SHA1	Message	Date
quandanrepo	b8dcf626b2	Merge pull request #117 from Hestia-Homes/sap-dev Sap dev	2024-05-30 20:18:25 +01:00
Github-Bot	d09c534e0d	Update Registry	2024-05-30 11:47:46 +00:00
Github-Bot	9925b54af2	Update Registry	2024-05-30 11:47:04 +00:00
KhalimCK	d307d9e093	Merge pull request #116 from Hestia-Homes/sap-dev-assumed Sap dev assumed	2024-05-30 12:46:28 +01:00
Michael Duong	1944ea1cf1	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-assumed	2024-05-28 19:59:07 +01:00
Michael Duong	8399092e20	formatting	2024-05-28 19:58:46 +01:00
Github-Bot	dc260fddd0	Update Registry	2024-05-28 15:58:31 +00:00
Github-Bot	6f00d6afb8	Update Registry	2024-05-28 15:57:55 +00:00
quandanrepo	1f0414a905	Merge pull request #115 from Hestia-Homes/sap-dev-assumed Sap dev assumed	2024-05-28 16:57:22 +01:00
Michael Duong	5e0118ca0b	change deployment - pineed serverless pajkage	2024-05-28 16:55:47 +01:00
Michael Duong	7e3a6f7700	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-assumed	2024-05-26 10:46:38 +01:00
Github-Bot	396a5ffb08	Update Registry	2024-05-26 09:08:23 +00:00
Github-Bot	a78c5a50b0	Update Registry	2024-05-26 09:07:46 +00:00
quandanrepo	dc70b84626	Merge pull request #113 from Hestia-Homes/sap-dev-gto Sap dev gto	2024-05-26 10:07:07 +01:00
Michael Duong	e0954b52bc	Upgrade dvc packages to fix pygit2 error	2024-05-26 09:56:05 +01:00
Michael Duong	9e23eae27a	add testing script	2024-05-26 09:54:22 +01:00
Michael Duong	0768ace947	add testing script	2024-05-26 09:50:39 +01:00
Michael Duong	4ff7cfb271	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-gto	2024-05-26 09:47:23 +01:00
Michael Duong	a4dffe527a	add testing script	2024-05-26 09:47:08 +01:00
quandanrepo	8adfa72036	Merge pull request #111 from Hestia-Homes/sap-dev-package Sap dev package	2024-05-26 09:31:46 +01:00
Michael Duong	29b350e33b	Merge branch 'master' of github.com:Hestia-Homes/ML into sap-dev-assumed	2024-05-26 09:28:16 +01:00
Michael Duong	b985bbf753	new model with is_as_built_ending and is assumed columns	2024-05-26 09:28:00 +01:00
Michael Duong	f43d077479	use previous model with new downstream processes	2024-04-22 19:10:40 +01:00
Michael Duong	52f33239f4	Merge branch 'sap-dev-package' of github.com:Hestia-Homes/ML into sap-dev-package	2024-04-22 19:02:13 +01:00
Michael Duong	874b1db5f3	add ignored file to dockerignore	2024-04-22 19:01:56 +01:00
Michael Duong	7a3477c0e1	change to all packages	2024-04-22 13:30:58 +01:00
Michael Duong	87e3cc391e	push files to s3	2024-04-19 17:48:15 +01:00
Michael Duong	380bd6b595	correct the dockerignore files and test model with just tabular	2024-04-19 17:34:10 +01:00
Michael Duong	50a3e2d5be	correct the dockerignore files and test model with just tabular	2024-04-19 16:25:26 +01:00
Michael Duong	620c1d10a1	correct the dockerignore files and test model with just tabular	2024-04-19 16:22:06 +01:00
Michael Duong	179c334b6e	add switch to turn off scenario data (for carbon and heat for now)	2024-04-19 14:38:57 +01:00
quandanrepo	502621e434	Merge pull request #110 from Hestia-Homes/sap-dev Sap dev	2024-04-19 14:36:45 +01:00
Github-Bot	e97c01c366	Update Registry	2024-03-28 15:23:18 +00:00
Github-Bot	94a6aaa38f	Update Registry	2024-03-28 15:22:33 +00:00
quandanrepo	98254555a1	Merge pull request #108 from Hestia-Homes/sap-dev-model add c++ to docker, fixed dynaconf	2024-03-28 15:21:31 +00:00
Michael Duong	7aeaa9a5f6	add c++ to docker, fixed dynaconf	2024-03-28 15:13:20 +00:00
Github-Bot	a7bb61433a	Update Registry	2024-03-28 09:31:07 +00:00
Github-Bot	64a5c93833	Update Registry	2024-03-28 09:30:30 +00:00
KhalimCK	e746352977	Merge pull request #104 from Hestia-Homes/sap-dev-model Sap dev model	2024-03-28 09:29:53 +00:00
Michael Duong	1bb1f8d61f	add metrics for scenarios	2024-03-27 12:30:31 +00:00
Michael Duong	c3985e2104	add metrics for scenarios	2024-03-27 12:22:58 +00:00
Michael Duong	9b6aeae0da	medium model with scenario and upgraded autogluon	2024-03-26 22:32:44 +00:00
Michael Duong	96f5b37001	medium model with scenario and upgraded autogluon	2024-03-26 22:32:14 +00:00
Michael Duong	8a9b5877b5	medium model with scenario and upgraded autogluon	2024-03-26 22:30:50 +00:00
Michael Duong	ad2c4d6019	upgrade autogluon	2024-03-21 14:41:58 +00:00
Michael Duong	d5f40a8eb2	only ending	2024-02-17 21:17:34 +00:00
Michael Duong	cec3cc60e7	test less features	2024-02-17 16:26:49 +00:00
Michael Duong	81e7c2a4bd	test this version	2024-02-16 16:57:37 +00:00
Michael Duong	fe430c4326	test this version	2024-02-16 16:54:18 +00:00
Michael Duong	49e66411ce	test this version	2024-02-16 16:51:43 +00:00
Michael Duong	fdbf339d63	try the scenario cml	2024-02-16 16:44:43 +00:00
Michael Duong	2221283de4	try the scenario cml	2024-02-16 16:43:23 +00:00
Github-Bot	7f2f80af22	Update Registry	2024-02-16 16:36:38 +00:00
Github-Bot	99e883584b	Update Registry	2024-02-16 16:35:54 +00:00
KhalimCK	3ee352b719	Merge pull request #103 from Hestia-Homes/sap-dev-revert revert change on sap-dev-model	2024-02-16 16:35:18 +00:00
Michael Duong	0e2bff9d64	revert changes	2024-02-16 16:30:13 +00:00
Michael Duong	e060aeb4c0	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-revert	2024-02-16 16:25:57 +00:00
Michael Duong	a9b50c8a2d	revert change on sap-dev-model	2024-02-16 16:23:37 +00:00
Github-Bot	6e76716fbc	Update Registry	2024-02-16 14:52:15 +00:00
Github-Bot	86352ce0ce	Update Registry	2024-02-16 14:51:31 +00:00
KhalimCK	33d0f6b323	Merge pull request #102 from Hestia-Homes/sap-dev-model Sap dev model	2024-02-16 14:50:51 +00:00
Michael Duong	8363d5f0de	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2024-02-15 15:11:08 +00:00
Michael Duong	603dfe2eab	new model with starting and ending rooms	2024-02-15 15:10:49 +00:00
Github-Bot	babbc155e9	Update Registry	2024-02-12 18:34:09 +00:00
Github-Bot	d21fd1c4e8	Update Registry	2024-02-12 18:33:28 +00:00
KhalimCK	6815cfcc66	Merge pull request #101 from Hestia-Homes/sap-dev-model Sap dev model	2024-02-12 18:32:38 +00:00
Michael Duong	fedcd1ed92	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2024-02-10 12:30:52 +00:00
Michael Duong	eeb653c041	new model	2024-02-10 11:03:38 +00:00
Github-Bot	8a1e2958b4	Update Registry	2024-02-09 18:54:16 +00:00
Github-Bot	bc44376e07	Update Registry	2024-02-09 18:53:22 +00:00
KhalimCK	89604645d5	Merge pull request #99 from Hestia-Homes/sap-dev-model Sap dev model	2024-02-09 18:52:45 +00:00
Michael Duong	1e36d6e4f6	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2024-02-09 18:46:33 +00:00
Michael Duong	778bff37fb	4000 model	2024-02-09 18:46:19 +00:00
Github-Bot	f17119382b	Update Registry	2024-02-09 16:27:45 +00:00
Github-Bot	a98fc9d93a	Update Registry	2024-02-09 16:27:01 +00:00
KhalimCK	051921ff3f	Merge pull request #97 from Hestia-Homes/sap-dev-model Sap dev model	2024-02-09 16:26:24 +00:00
Michael Duong	18ea4a2177	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2024-02-09 16:20:02 +00:00
Michael Duong	f92c97f6cf	drop days_starting and days_ending	2024-02-09 16:19:47 +00:00
Github-Bot	96eb3904e2	Update Registry	2024-01-29 12:38:33 +00:00
Github-Bot	7f59305e20	Update Registry	2024-01-29 12:37:45 +00:00
KhalimCK	23dbfcc467	Merge pull request #94 from Hestia-Homes/sap-dev-model test model with 1 percent o change records	2024-01-29 12:37:02 +00:00
Michael Duong	353b62bc77	test model with all data, using interal cross validation, all dataset with permuation and 0, test data is just a random 10 percent sample of the training data	2024-01-29 09:03:36 +00:00
Michael Duong	d356fbfed0	test model with all permutation and zero records	2024-01-24 10:29:56 +00:00
Michael Duong	ca2a3d3623	longer run model	2024-01-23 21:46:24 +00:00
Michael Duong	efb84723bb	test model with 1 percent o change records	2024-01-23 19:27:53 +00:00
Github-Bot	6d6b824006	Update Registry	2024-01-18 10:37:52 +00:00
Github-Bot	47f8447223	Update Registry	2024-01-18 10:36:52 +00:00
KhalimCK	d9cbc1e190	Merge pull request #91 from Hestia-Homes/sap-dev-model run sap model with new data	2024-01-18 10:36:00 +00:00
Michael Duong	0e31d67970	run sap model with new data	2024-01-17 23:07:22 +00:00
Github-Bot	77888bb839	Update Registry	2024-01-16 17:38:50 +00:00
Github-Bot	f472d3c5fa	Update Registry	2024-01-16 17:38:07 +00:00
KhalimCK	03364036db	Merge pull request #90 from Hestia-Homes/sap-dev-model Sap dev model	2024-01-16 17:37:12 +00:00
Michael Duong	50c369720e	corrected model	2023-12-22 11:16:45 +00:00
Michael Duong	717a1a64fe	update version control packages	2023-12-22 10:47:35 +00:00
Michael Duong	daa4c28be6	remove unneeded dvc gto files	2023-12-22 10:44:23 +00:00
Michael Duong	c576657805	comment out old dataset	2023-12-22 10:35:17 +00:00
Michael Duong	acdac3d8dc	test new data	2023-12-22 10:28:56 +00:00
Michael Duong	598c1118f3	fix merge conflict	2023-12-22 09:54:35 +00:00
Michael Duong	639ba9dd11	add infernce limit	2023-11-27 21:50:08 +00:00
KhalimCK	ed4c1aebf6	Merge pull request #84 from Hestia-Homes/new-model-workflows Added additional workflows for new models	2023-11-27 19:12:09 +00:00
Khalim Conn-Kowlessar	e9417ca73d	Added additional workflows for new models	2023-11-27 15:17:01 +00:00
Github-Bot	7d26ec4219	Update Registry	2023-10-22 21:07:17 +00:00
Github-Bot	6d3407ba0e	Update Registry	2023-10-22 21:06:37 +00:00
quandanrepo	91741c527b	Merge pull request #83 from Hestia-Homes/sap-dev-model Sap dev model	2023-10-22 17:05:52 -04:00
Michael Duong	0f96bc55f1	add time to inference to model	2023-10-22 21:05:07 +00:00
Michael Duong	499458b699	add time to inference to model	2023-10-22 21:02:32 +00:00
Michael Duong	8689d4391e	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2023-10-22 03:25:23 +00:00
Michael Duong	cbd46489fe	Remove propgate	2023-10-22 03:25:07 +00:00
Github-Bot	a15bdd5ee0	Update Registry	2023-10-21 03:03:21 +00:00
Github-Bot	72cf709601	Update Registry	2023-10-21 03:02:38 +00:00
quandanrepo	6e35e8cdfe	Merge pull request #82 from Hestia-Homes/sap-dev-dockerignore final removal of dash from handler	2023-10-20 23:01:59 -04:00
Michael Duong	ca37e4ee18	final removal of dash from handler	2023-10-21 04:00:13 +01:00
Github-Bot	3145b5d331	Update Registry	2023-10-20 22:41:26 +00:00
Github-Bot	960425e709	Update Registry	2023-10-20 22:40:39 +00:00
quandanrepo	46bb25012a	Merge pull request #81 from Hestia-Homes/sap-dev-dockerignore Sap dev dockerignore	2023-10-20 18:39:53 -04:00
Michael Duong	811d47b78a	remove more lines	2023-10-20 23:30:31 +01:00
Michael Duong	59113859b1	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-dockerignore	2023-10-20 15:45:25 +01:00
Michael Duong	867f4e0bf0	change logging style	2023-10-20 15:45:04 +01:00
Github-Bot	c605d6b549	Update Registry	2023-10-20 02:16:05 +00:00
Github-Bot	72d4dbae3f	Update Registry	2023-10-20 02:15:23 +00:00
quandanrepo	7a2347a937	Merge pull request #80 from Hestia-Homes/sap-dev-dockerignore Sap dev dockerignore	2023-10-19 22:14:40 -04:00
Michael Duong	56b7139b41	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-dockerignore	2023-10-20 03:13:36 +01:00
Michael Duong	dadcbbab3a	revert back for now	2023-10-20 03:13:24 +01:00
Github-Bot	c5a9b548ab	Update Registry	2023-10-20 02:12:04 +00:00
Github-Bot	652bdd3467	Update Registry	2023-10-20 02:11:10 +00:00
quandanrepo	ca4edb5068	Merge pull request #79 from Hestia-Homes/sap-dev-dockerignore Sap dev dockerignore	2023-10-19 22:10:10 -04:00
Michael Duong	b9ea396f86	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-dockerignore	2023-10-20 03:08:26 +01:00
Michael Duong	0c87f21673	test just a single dependency	2023-10-20 03:08:13 +01:00
Github-Bot	b50e0ef1ba	Update Registry	2023-10-20 01:59:48 +00:00
Github-Bot	ad98ec4f1a	Update Registry	2023-10-20 01:58:57 +00:00
quandanrepo	9de74ce453	Merge pull request #78 from Hestia-Homes/sap-dev-dockerignore add dockerignore file for prediction lamda	2023-10-19 21:58:10 -04:00
Michael Duong	ddf3ad3b40	add dependency for workflow files	2023-10-20 02:56:58 +01:00
Michael Duong	a44fe33998	add the test data back to get it to run	2023-10-20 02:48:17 +01:00
Michael Duong	fbd235addf	add dockerignore for verify step	2023-10-20 02:39:09 +01:00
Michael Duong	e1cf3a48a9	add dockerignore file for prediction lamda	2023-10-20 02:27:26 +01:00
Github-Bot	7efb910103	Update Registry	2023-10-19 01:20:19 +00:00
Github-Bot	b2e5fd9419	Update Registry	2023-10-19 01:19:29 +00:00
quandanrepo	e921d0f90b	Merge pull request #73 from Hestia-Homes/sap-dev-model Sap dev model	2023-10-18 21:18:42 -04:00
Michael Duong	790c3a9456	use test dataset	2023-10-18 13:27:25 +00:00
Michael Duong	a60a3bd285	Merge branch 'master' of github.com:Hestia-Homes/ML into sap-dev-model	2023-10-17 23:54:06 +00:00
Michael Duong	17fad3cf0a	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model	2023-10-17 23:53:43 +00:00
quandanrepo	96153f8248	Update Makefile	2023-10-17 03:08:01 +01:00
quandanrepo	7589977cda	Update Makefile	2023-10-12 10:19:22 +01:00
quandanrepo	b570829b5a	Merge pull request #70 from Hestia-Homes/sap-dev Sap dev	2023-10-11 09:36:44 +01:00
Github-Bot	4597c12795	Update Registry	2023-10-10 23:00:04 +00:00
Github-Bot	c668e4227c	Update Registry	2023-10-10 22:59:21 +00:00
quandanrepo	b04a0a4a90	Merge pull request #69 from Hestia-Homes/sap-dev-fix Sap dev fix	2023-10-10 23:58:39 +01:00
Michael Duong	bd80c3d69d	final fix for workflow on post merge	2023-10-10 23:58:07 +01:00
Michael Duong	da8cf5c1c4	Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-fix	2023-10-10 23:56:46 +01:00
Michael Duong	8bdedf25a2	final fix for workflow on post merge	2023-10-10 23:56:35 +01:00
Github-Bot	7a113f790e	Update Registry	2023-10-10 22:43:36 +00:00
Github-Bot	755d00e0e4	Update Registry	2023-10-10 22:42:45 +00:00
quandanrepo	6e71a59cc5	Merge pull request #68 from Hestia-Homes/sap-dev-fix add smape	2023-10-10 23:41:56 +01:00
Michael Duong	fe34356822	Merge branch 'master' of github.com:Hestia-Homes/ML into sap-dev-fix	2023-10-10 23:41:17 +01:00
Michael Duong	6552e97555	fix the register increments	2023-10-10 23:41:06 +01:00
Michael Duong	8dd784255a	add smape	2023-10-10 23:28:30 +01:00
quandanrepo	051f07df77	Update README.md	2023-10-10 14:02:54 +01:00
Github-Bot	7a1b9aed5f	Update Registry	2023-10-10 11:49:02 +00:00
Github-Bot	69c5c77a88	Update Registry	2023-10-10 11:48:13 +00:00
quandanrepo	ae474fedb4	Merge pull request #66 from Hestia-Homes/sap-dev-fix Sap dev fix	2023-10-10 12:47:29 +01:00
Michael Duong	57934d0ae3	fixed buffer bug and add id	2023-10-10 12:35:34 +01:00
quandanrepo	70b3008dc5	Update README.md	2023-10-10 11:56:56 +01:00
quandanrepo	391cc66435	Update README.md	2023-10-10 11:53:52 +01:00
quandanrepo	d3b1bb4bb9	Update README.md	2023-10-10 11:49:37 +01:00
quandanrepo	dda9065a88	Update README.md	2023-10-10 11:45:50 +01:00
Michael Duong	f9b0b6112c	add some processing ocde	2023-10-09 15:44:37 +00:00
quandanrepo	ba4d1bcc8b	Merge pull request #65 from Hestia-Homes/sap-dev Sap dev	2023-10-07 09:56:42 +01:00
Github-Bot	8105706ea7	Update Registry	2023-10-04 17:01:20 +00:00
Github-Bot	4d909c3996	Update Registry	2023-10-04 17:00:31 +00:00
quandanrepo	88aa4048bb	Merge pull request #64 from Hestia-Homes/sap-dev-model change sapmodel stack anme to be more general - remove change_ sed co…	2023-10-04 17:59:46 +01:00
Michael Duong	a15befe381	change sapmodel stack anme to be more general - remove change_ sed command	2023-10-04 16:58:18 +00:00
Github-Bot	50f72f91e3	Update Registry	2023-10-04 16:41:12 +00:00
Github-Bot	325153a725	Update Registry	2023-10-04 16:40:28 +00:00
		`@ -1,2 +0,0 @@`
			`['remote "myremote"']`
			`url = /tmp/dvcstore`
		`@ -1,2 +0,0 @@`
			`# .gto config file`
			`stages: [dev, stage, prod] # list of allowed Stages`