add pipeline structure

2026-06-08 11:17:25 +00:00 · 2023-09-09 10:07:30 +00:00 · 2023-09-09 10:07:30 +00:00 · 0d18b440c1
commit 0d18b440c1
parent d907c64ee6
25 changed files with 171 additions and 120 deletions
--- a/modules/ml-pipeline/.pre-commit-config.yaml
+++ b/modules/ml-pipeline/.pre-commit-config.yaml
@ -0,0 +1,12 @@
+# Pre-commit hooks
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+-   repo: https://github.com/psf/black
+    rev: 22.10.0
+    hooks:
+    -   id: black
--- a/modules/ml-pipeline/Makefile
+++ b/modules/ml-pipeline/Makefile
@ -11,7 +11,7 @@ dev-pyenv:
 	pyenv install ${PYTHON_VERSION} || echo "Python version already installed"
 	pyenv global ${PYTHON_VERSION}
 	python3 -m venv .dev_env
-	. .dev_env/bin/activate && pip install --upgrade pip && pip install -r src/training/requirements/requirements-dev.txt && pre-commit install
+	. .dev_env/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/training/requirements/requirements-dev.txt && pre-commit install
 	echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
 	echo "source .dev_env/bin/activate"

--- a/modules/ml-pipeline/src/pipeline/Dockerfile
+++ b/modules/ml-pipeline/src/pipeline/Dockerfile
@ -2,4 +2,3 @@
 FROM python:3.9-slim

 RUN pip install -r experimentation/requirements/training.txt
-
--- a/modules/ml-pipeline/src/pipeline/inference/README.MD
+++ b/modules/ml-pipeline/src/pipeline/inference/README.MD
@ -2,4 +2,4 @@

 This folder contains the inference codebase to:
 - Load a model
- Generate a prediction
+- Generate a prediction
--- a/modules/ml-pipeline/src/pipeline/main_training.py
+++ b/modules/ml-pipeline/src/pipeline/main_training.py
@ -1,3 +1,3 @@
 """
 Pipeline that stitches all steps together
-"""
+"""
--- a/modules/ml-pipeline/src/pipeline/training/README.md
+++ b/modules/ml-pipeline/src/pipeline/training/README.md
@ -25,13 +25,11 @@ Workflow:
 - Use `dvc metrics show` to view current metrics score
 - Adjust parameters/ codebase
 - When happy with changes, use `dvc exp run` to trigger an experiment
- Due to cache, only need stages are re-run 
+- Due to cache, only needed stages are re-run
 - Use `dvc metrics diff` to check the change in metrics
- Use `dvc exp show` to view all experiments 
-    - NOTE: the last experiment will always be applied to the workspace! 
+- Use `dvc exp show` to view all experiments
+    - NOTE: the last experiment will always be applied to the workspace!
 - After running experiments, you can apply the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]`
    - This experiment will have the corresponding .dvc files for the hashed model and data
 - Use version control as normal
    - git add, git commit etc
-
-    
--- a/modules/ml-pipeline/src/pipeline/training/init.py
+++ b/modules/ml-pipeline/src/pipeline/training/init.py
--- a/modules/ml-pipeline/src/pipeline/training/build_model.py
+++ b/modules/ml-pipeline/src/pipeline/training/build_model.py
@ -3,7 +3,7 @@ Second Pipeline step:
 Once we have the features, we build a model
 """

-import os 
+import os
 import yaml
 import pandas as pd
 from typing import Union
@ -22,39 +22,42 @@ prepare_data_params = yaml.safe_load(open(prepare_data_path))
 build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
 build_model_params = yaml.safe_load(open(build_model_path))

+
 def build_model(
-        dataclient: DataClient, 
-        model: MLModel,
-        target: str,
-        model_save_location: str,
-        model_hyperparameters: dict,
-        train_location: Union[str, None] = None,
-        test_location: Union[str, None] = None,
-        train_data: Union[pd.DataFrame, None] = None,
-        test_data: Union[pd.DataFrame, None] = None,
-        pipeline_mode: bool = False
-):   
+    dataclient: DataClient,
+    model: MLModel,
+    target: str,
+    model_save_location: str,
+    model_hyperparameters: dict,
+    train_location: Union[str, None] = None,
+    test_location: Union[str, None] = None,
+    train_data: Union[pd.DataFrame, None] = None,
+    test_data: Union[pd.DataFrame, None] = None,
+    pipeline_mode: bool = False,
+):
    logger.info("--------------------------------------")
    logger.info("--- Loading Data for build process ---")
    logger.info("--------------------------------------")

    if train_data is None:
-        # TODO: replace this with the data client to load 
+        # TODO: replace this with the data client to load
        if train_location is None:
-            raise ValueError(f"Need {train_location}") 
+            raise ValueError(f"Need {train_location}")
        train_data = pd.read_parquet(train_location)

    if test_data is None:
-        # TODO: replace this with the data client to load 
+        # TODO: replace this with the data client to load
        if test_location is None:
-            raise ValueError(f"Need {test_location}") 
+            raise ValueError(f"Need {test_location}")
        test_data = pd.read_parquet(test_location)

    logger.info("----------------------")
    logger.info("--- Training model ---")
    logger.info("----------------------")

-    model.train_model(data=train_data, target=target, model_hyperparameters=model_hyperparameters)
+    model.train_model(
+        data=train_data, target=target, model_hyperparameters=model_hyperparameters
+    )

    logger.info("--------------------")
    logger.info("--- Saving model ---")
@ -62,7 +65,7 @@ def build_model(

    model.save_model(path=Path(model_save_location))

-    # TODO: replace this with the data client to load 
+    # TODO: replace this with the data client to load
    # TODO: can fine tune model here if need with the test data


@ -76,13 +79,13 @@ if __name__ == "__main__":
    logger.info(f"--- Initiate DataClient ---")
    logger.info("----------------------------")

-    dataclient = dataclient_factory(prepare_data_params['client_type'])
+    dataclient = dataclient_factory(prepare_data_params["client_type"])

    logger.info("-------------------------")
    logger.info(f"--- Initiate MLModel ---")
    logger.info("-------------------------")

-    model_type = build_model_params['model_type']
+    model_type = build_model_params["model_type"]
    model = model_factory(model_type)

    logger.info("--------------------------")
@ -92,13 +95,13 @@ if __name__ == "__main__":
    build_model(
        dataclient=dataclient,
        model=model,
-        target=build_model_params['target'],
-        model_save_location=build_model_params['model_save_location'],
+        target=build_model_params["target"],
+        model_save_location=build_model_params["model_save_location"],
        model_hyperparameters=build_model_params[model_type],
-        train_location=prepare_data_params['output_train_filename'],
-        test_location=prepare_data_params['output_test_filename']
-        )
+        train_location=prepare_data_params["output_train_filename"],
+        test_location=prepare_data_params["output_test_filename"],
+    )

    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
-    logger.info("-------------------------===---")
+    logger.info("-------------------------===---")
--- a/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml
@ -5,4 +5,4 @@ test_location: ./data/prepared_data/test.parquet
 model_save_location: ./data/model/model.joblib

 SKLearnSVMRegression:
-  kernel: "linear"
+  kernel: "linear"
--- a/modules/ml-pipeline/src/pipeline/training/configs/configs.py
+++ b/modules/ml-pipeline/src/pipeline/training/configs/configs.py
@ -1,3 +1,3 @@
 """
 Stitch all YAML configuration files together, overriding some settings (such as bucket location) based on environment variables
-"""
+"""
--- a/modules/ml-pipeline/src/pipeline/training/configs/generate_metrics.yaml
+++ b/modules/ml-pipeline/src/pipeline/training/configs/generate_metrics.yaml
@ -1,4 +1,4 @@
 metrics_type: Regression
 test_data_location: ./data/prepared_data/
 predictions_output_location: ./data/predictions/predictions.csv
-metrics_output_location: ./metrics/metrics.json
+metrics_output_location: ./metrics/metrics.json
--- a/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml
+++ b/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml
@ -5,4 +5,4 @@ output_location: ./data/prepared_data/
 output_train_filename: train.parquet
 output_test_filename: test.parquet

-cache_o
+# cache_o
--- a/modules/ml-pipeline/src/pipeline/training/core/DataClient.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/DataClient.py
@ -6,6 +6,7 @@ import pandas as pd
 from typing import List
 from core.interface.InterfaceDataClient import DataClient

+
 def dataclient_factory(dataclient_type: str) -> DataClient:
    """
    Determine which dataclient to use
@ -17,7 +18,7 @@ def dataclient_factory(dataclient_type: str) -> DataClient:

    if dataclient_type not in dataclients:
        raise ValueError("Dataclient type specified is not in factory")
-    
+
    return dataclients[dataclient_type]


@ -25,12 +26,17 @@ def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str):
    if not set(keys_1).issubset(keys_2):
        raise ValueError(f"Incorrect {config_type} keys specified")

+
 class MinioClient:
    """
    Using the Minio s3 client, to do local testing
    """

-    ACCEPTED_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key", "endpoint_url"]
+    ACCEPTED_CONFIG_KEYS = [
+        "aws_access_key_id",
+        "aws_secret_access_key",
+        "endpoint_url",
+    ]
    ACCEPTED_LOAD_CONFIG_KEYS = []
    ACCEPTED_SAVE_CONFIG_KEYS = []

@ -38,10 +44,14 @@ class MinioClient:
        """
        Load all configuration into the instance (self.config)
        """
-        validate_dict_keys(keys_1=list(config.keys()), keys_2=self.ACCEPTED_CONFIG_KEYS, config_type="config")
-        
+        validate_dict_keys(
+            keys_1=list(config.keys()),
+            keys_2=self.ACCEPTED_CONFIG_KEYS,
+            config_type="config",
+        )
+
        self.config = config
-    
+
    def establish_client(self) -> None:
        """
        With the given configurations, create the connection to the client (self.client)
@ -53,14 +63,20 @@ class MinioClient:
        """
        When the client is established, we can load data
        """
-        validate_dict_keys(keys_1=list(load_config.keys()), keys_2=self.ACCEPTED_LOAD_CONFIG_KEYS, config_type="load_config")
-        
+        validate_dict_keys(
+            keys_1=list(load_config.keys()),
+            keys_2=self.ACCEPTED_LOAD_CONFIG_KEYS,
+            config_type="load_config",
+        )
+
        return pd.DataFrame()

    def save_data(self, obj: object, save_config: dict) -> None:
        """
        When the client is established, we can save out objects
        """
-        validate_dict_keys(keys_1=list(save_config.keys()), keys_2=self.ACCEPTED_SAVE_CONFIG_KEYS, config_type="save_config")
-
-
+        validate_dict_keys(
+            keys_1=list(save_config.keys()),
+            keys_2=self.ACCEPTED_SAVE_CONFIG_KEYS,
+            config_type="save_config",
+        )
--- a/modules/ml-pipeline/src/pipeline/training/core/Logger.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/Logger.py
@ -2,25 +2,27 @@
 Logger that will be used throughout the application
 """

-import logging 
+import logging
+

 def setup_logger():
    # Create a logger
    logger = logging.getLogger()
-    
+
    # Set the log level
    logger.setLevel(logging.INFO)
-    
+
    # Create a formatter
-    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-    
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
    # Create a stream handler to direct logs to stdout
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
-    
+
    # Add the stream handler to the logger
    logger.addHandler(stream_handler)
-    
+
    return logger

-logger = setup_logger()
+
+logger = setup_logger()
--- a/modules/ml-pipeline/src/pipeline/training/core/MLMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/MLMetrics.py
@ -10,10 +10,11 @@ from sklearn.metrics import (
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error,
-    mean_absolute_percentage_error
+    mean_absolute_percentage_error,
 )
 from core.interface.InterfaceMetrics import MLMetrics

+
 def metrics_factory(metrics_type: str) -> MLMetrics:
    metrics = {
        "Regression": RegressionMetrics(),
@ -22,9 +23,10 @@ def metrics_factory(metrics_type: str) -> MLMetrics:

    if metrics_type not in metrics:
        raise ValueError("Metrics type specified is not in factory")
-    
+
    return metrics[metrics_type]

+
 class RegressionMetrics:

    METRIC_TO_APPLY = [
@ -36,7 +38,7 @@ class RegressionMetrics:
    ]

    def generate_metrics(
-            self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series
+        self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series
    ) -> dict:
        """
        Method to generate metrics
@ -44,8 +46,6 @@ class RegressionMetrics:

        metric_dict = {}
        for metric_function in self.METRIC_TO_APPLY:
-            metric_dict[metric_function.__name__] = metric_function(
-                target, predictions
-            )
+            metric_dict[metric_function.__name__] = metric_function(target, predictions)

-        return metric_dict
+        return metric_dict
--- a/modules/ml-pipeline/src/pipeline/training/core/MLModels.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/MLModels.py
@ -7,7 +7,7 @@ Implementations of MLModels, all of which will have four methods to:
 """

 import os
-import joblib 
+import joblib
 import pandas as pd
 from pathlib import Path
 from typing import Union, List
@ -15,6 +15,7 @@ from sklearn import linear_model
 from sklearn.svm import SVR
 from core.interface.InterfaceModels import MLModel

+
 def model_factory(model_type: str) -> MLModel:
    """
    Determine which model to use from the model type
@ -27,7 +28,7 @@ def model_factory(model_type: str) -> MLModel:

    if model_type not in models:
        raise ValueError("Model type specified is not in factory")
-    
+
    return models[model_type]


@ -37,7 +38,6 @@ def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str):


 class SKLearnLinearRegression:
-
    def load_model(self, path: Union[Path, str]) -> None:
        """
        Method to load a model
@ -51,7 +51,7 @@ class SKLearnLinearRegression:
        """
        if self.model is None:
            raise KeyError("No model trained/ loaded - unable to save")
-        
+
        if not path.parent.exists():
            os.mkdir(path.parent)

@ -60,7 +60,9 @@ class SKLearnLinearRegression:

        return string_path

-    def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None:
+    def train_model(
+        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
+    ) -> None:
        """
        Method to train a model
        """
@ -70,10 +72,9 @@ class SKLearnLinearRegression:
        y_train = data[target]
        self.model.fit(x_train, y_train)

-
    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
-        Method to predict 
+        Method to predict
        """
        self.predictions = pd.Series(self.model.predict(data))
        return self.predictions
@ -82,21 +83,21 @@ class SKLearnLinearRegression:
 class SKLearnSVMRegression:

    MODEL_HYPERPARAMETERS = ["kernel"]
-    
+
    def load_model(self, path: Union[Path, str]) -> None:
        """
        Method to load a model
        """
        string_path = str(path)
        self.model = joblib.load(string_path)
-        
+
    def save_model(self, path: Path) -> str:
        """
        Method to save a model
        """
        if self.model is None:
            raise KeyError("No model trained/ loaded - unable to save")
-        
+
        if not path.parent.exists():
            os.mkdir(path.parent)

@ -105,23 +106,28 @@ class SKLearnSVMRegression:

        return string_path

-    def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None:
+    def train_model(
+        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
+    ) -> None:
        """
        Method to train a model
        """

-        validate_dict_keys(list(model_hyperparameters.keys()), self.MODEL_HYPERPARAMETERS, config_type="Train_model_config")
-        
-        self.model = SVR(kernel=model_hyperparameters['kernel'])
+        validate_dict_keys(
+            list(model_hyperparameters.keys()),
+            self.MODEL_HYPERPARAMETERS,
+            config_type="Train_model_config",
+        )
+
+        self.model = SVR(kernel=model_hyperparameters["kernel"])

        x_train = data.iloc[:, data.columns != target]
        y_train = data[target]
        self.model.fit(x_train, y_train)

-
    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
-        Method to predict 
+        Method to predict
        """
        self.predictions = pd.Series(self.model.predict(data))
-        return self.predictions
+        return self.predictions
--- a/modules/ml-pipeline/src/pipeline/training/core/init.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/init.py
--- a/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceDataClient.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceDataClient.py
@ -5,6 +5,7 @@ Interface for all DataClient i.e. s3, database, local etc
 import pandas as pd
 from typing import Protocol

+
 class DataClient(Protocol):
    """
    Declare the methods required for a DataClient
@ -15,7 +16,7 @@ class DataClient(Protocol):
        Load all configuration into the instance (self.config)
        """
        ...
-    
+
    def establish_client(self) -> None:
        """
        With the given configurations, create the connection to the client (self.client)
--- a/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceMetrics.py
@ -5,15 +5,16 @@ Define the interface for creating metrics
 import pandas as pd
 from typing import Protocol, Union

+
 class MLMetrics(Protocol):
    """
    All metrics will need to have the following interface to interact with the ML Pipeline
    """

    def generate_metrics(
-            self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series
+        self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series
    ) -> dict:
        """
        Method to generate metrics
        """
-        ...
+        ...
--- a/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceModels.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/interface/InterfaceModels.py
@ -2,10 +2,11 @@
 Define the protocol for models in this pipeline
 """

-import pandas as pd 
+import pandas as pd
 from pathlib import Path
 from typing import Protocol, Union

+
 class MLModel(Protocol):
    """
    All models will need to have the following interface to interact with the ML pipeline
@ -16,14 +17,16 @@ class MLModel(Protocol):
        Method to load a model
        """
        ...
-    
+
    def save_model(self, path: Path) -> str:
        """
        Method to save a model
        """
        ...

-    def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None:
+    def train_model(
+        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
+    ) -> None:
        """
        Method to train a model
        """
@ -31,6 +34,6 @@ class MLModel(Protocol):

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
-        Method to predict 
+        Method to predict
        """
-        ...
+        ...
--- a/modules/ml-pipeline/src/pipeline/training/core/interface/init.py
+++ b/modules/ml-pipeline/src/pipeline/training/core/interface/init.py
--- a/modules/ml-pipeline/src/pipeline/training/generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/training/generate_metrics.py
@ -10,7 +10,7 @@ import pandas as pd
 from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceMetrics import MLMetrics
-from core.interface.InterfaceDataClient import DataClient 
+from core.interface.InterfaceDataClient import DataClient
 from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.MLMetrics import metrics_factory
@ -37,7 +37,7 @@ def generate_metrics(
    model_location: str,
    test_data_location: str,
    predictions_output_location: str,
-    metrics_output_location: str
+    metrics_output_location: str,
 ):
    """
    For a given model, we generate prediction and evaluate this against the true target
@ -61,7 +61,9 @@ def generate_metrics(
    logger.info("------------------------------")

    # Clean test data for now
-    prediction_data = test_data.drop(columns=target) if target in test_data.columns else test_data
+    prediction_data = (
+        test_data.drop(columns=target) if target in test_data.columns else test_data
+    )

    predictions = model.predict(data=prediction_data)

@ -73,7 +75,7 @@ def generate_metrics(

    if not Path(predictions_output_location).parent.exists():
        os.mkdir(Path(predictions_output_location).parent)
-    
+
    predictions.to_json(predictions_output_location)

    logger.info("--------------------------")
@ -92,27 +94,30 @@ def generate_metrics(

    if not Path(metrics_output_location).parent.exists():
        os.mkdir(Path(metrics_output_location).parent)
-    
+
    with open(metrics_output_location, "w") as f:
        json.dump(metrics_output, f)

+
 if __name__ == "__main__":

    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")

-    model = model_factory(build_model_params['model_type'])
-    dataclient = dataclient_factory(prepare_data_params['dataclient_type'])
-    metrics = metrics_factory(generate_metrics_params['metrics_type'])
+    model = model_factory(build_model_params["model_type"])
+    dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
+    metrics = metrics_factory(generate_metrics_params["metrics_type"])

    generate_metrics(
-        dataclient=dataclient, 
-        model=model, 
-        metrics=metrics, 
-        target=build_model_params["target"], 
-        model_location=build_model_params["model_save_location"], 
-        test_data_location=generate_metrics_params["test_data_location"], 
-        predictions_output_location=generate_metrics_params["predictions_output_location"], 
-        metrics_output_location=generate_metrics_params["metrics_output_location"]
-        )
+        dataclient=dataclient,
+        model=model,
+        metrics=metrics,
+        target=build_model_params["target"],
+        model_location=build_model_params["model_save_location"],
+        test_data_location=generate_metrics_params["test_data_location"],
+        predictions_output_location=generate_metrics_params[
+            "predictions_output_location"
+        ],
+        metrics_output_location=generate_metrics_params["metrics_output_location"],
+    )
--- a/modules/ml-pipeline/src/pipeline/training/prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/training/prepare_data.py
@ -23,16 +23,22 @@ params = yaml.safe_load(open(params_path))
 def use_dummy_data() -> pd.DataFrame:
    diabetes_data = load_diabetes()

-    x_data = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names']) # type: ignore
-    y_data = pd.DataFrame(diabetes_data['target'], columns=['target']) # type: ignore
+    x_data = pd.DataFrame(diabetes_data["data"], columns=diabetes_data["feature_names"])  # type: ignore
+    y_data = pd.DataFrame(diabetes_data["target"], columns=["target"])  # type: ignore

    data = pd.concat([x_data, y_data], axis=1)
    return data


-def prepare_data(dataclient: DataClient, train_proportion: float, output_location: str, output_train_filename: str = "train.parquet", output_test_filename: str = "test.parquet") -> Tuple[pd.DataFrame, pd.DataFrame]:
+def prepare_data(
+    dataclient: DataClient,
+    train_proportion: float,
+    output_location: str,
+    output_train_filename: str = "train.parquet",
+    output_test_filename: str = "test.parquet",
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
-    Given a client and location, load data into the pipeline 
+    Given a client and location, load data into the pipeline
    :param dataclient: DataClient, Determines how to get data from the given provider (cloud or local)
    :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode
    """
@ -49,15 +55,13 @@ def prepare_data(dataclient: DataClient, train_proportion: float, output_locatio
    logger.info("----------------------")

    train, test = train_test_split(
-        data, train_size=train_proportion, test_size=(1-train_proportion)
+        data, train_size=train_proportion, test_size=(1 - train_proportion)
    )

    logger.info("--------------------------")
    logger.info("--- Feature Processing ---")
    logger.info("--------------------------")

-
-
    logger.info("-----------------------")
    logger.info("--- Outputting data ---")
    logger.info("-----------------------")
@ -69,13 +73,14 @@ def prepare_data(dataclient: DataClient, train_proportion: float, output_locatio

    logger.info("--- Outputting train and test data ---")
    train.to_csv(output_path / output_train_filename, index=False)
-    test.to_csv(output_path/ output_test_filename, index=False)
+    test.to_csv(output_path / output_test_filename, index=False)

    # client.save_data(obj=train)
    # client.save_data(obj=test)

    return train, test

+
 if __name__ == "__main__":

    logger.info("----------------------------")
@ -86,16 +91,16 @@ if __name__ == "__main__":
    logger.info(f"--- Initiate DataClient ---")
    logger.info("----------------------------")

-    dataclient = dataclient_factory(params['dataclient_type'])
+    dataclient = dataclient_factory(params["dataclient_type"])

    logger.info("---------------------------")
    logger.info(f"--- Prepare Data Stage ---")
    logger.info("---------------------------")

    prepare_data(
-        dataclient=dataclient, 
-        train_proportion=params['train_proportion'],
-        output_location=params['output_location']
+        dataclient=dataclient,
+        train_proportion=params["train_proportion"],
+        output_location=params["output_location"],
    )

    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/training/requirements/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/training/requirements/requirements-dev.txt
@ -6,4 +6,4 @@ gto==1.0.4
 scikit-learn==1.3.0
 pre-commit==3.3.3
 sphinx==7.2.5
-sphinx_rtd_theme==1.3.0
+sphinx_rtd_theme==1.3.0
--- a/modules/ml-pipeline/src/pipeline/training/requirements/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/training/requirements/requirements.txt
@ -2,4 +2,4 @@ boto3==1.28.41
 pandas==1.5.3
 dvc==3.18.0
 gto==1.0.4
-scikit-learn==1.3.0
+scikit-learn==1.3.0