From b3c9bc8fd7be8dc8cb0d7e3edc592e9f2fd9d888 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ste@gmail.com>
Date: Tue, 12 Sep 2023 12:51:50 +0100
Subject: [PATCH] using dataclient everywhere

---
 .../ml-pipeline/src/pipeline/src/.DS_Store    | Bin 0 -> 6148 bytes
 .../src/pipeline/src/build_model.py           |  26 +++++---
 .../src/pipeline/src/configs/build_model.yaml |   1 -
 .../src/configs/feature_processor.yaml        |   2 +-
 .../src/configs/generate_metrics.yaml         |   3 +
 .../src/configs/generate_predictions.yaml     |   2 +
 .../src/pipeline/src/core/DataClient.py       |  11 +++-
 .../src/pipeline/src/core/DataHandler.py      |  32 ++++++++++
 .../src/pipeline/src/core/FeatureProcessor.py |  16 +++--
 .../core/interface/InterfaceDataHandler.py    |   2 +-
 .../src/pipeline/src/data/.gitignore          |   1 -
 modules/ml-pipeline/src/pipeline/src/dvc.lock |  58 ++++++++++--------
 .../src/pipeline/src/generate_metrics.py      |  39 ++++++++----
 .../src/pipeline/src/generate_predictions.py  |  45 +++++++++-----
 14 files changed, 164 insertions(+), 74 deletions(-)
 create mode 100644 modules/ml-pipeline/src/pipeline/src/.DS_Store

diff --git a/modules/ml-pipeline/src/pipeline/src/.DS_Store b/modules/ml-pipeline/src/pipeline/src/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..4dff51ac089c8d5519e55cb38af42fcbc1594bf7
GIT binary patch
literal 6148
zcmeHK%}T>S5Z-O0O({YS3Oz1(Em*Bu#Y>3w1&ruHr6#0kFlI}WnnNk%tS{t~_&m<+
zZp31}ir5+0{pNQ!`$6`HF~+@Vw9A;o7_*@va#WTGx|fD(CK-|A7-5l(!$gK)znR!y
z2mE%6MJ!-B3%-7TI8Nd$?{?pLt!}NYH|&PpvTywdS@?O7Pctu=T%&a%WfE3;5MIWk
zV(4t1$}|t+bTm-~aWsOIo2xjDWZ}ta8fB{1*8#g}H-}DpHaqI?cEmxiKktay@m{wh
z_Iro(dDGt7-Z?oRJ|<78e9?q*;9JRt!3thM`CQMlKS>jr-h;2oukr|q0b+m{AO=>K
z0dpqUt<|M~R!$5M13xf;`-6an=o&0Fs;vV$ygp;xLPP-_-x7$zplh(y2oVsjO96E$
zH%|<%%fT;9o@=nwsLL5wGs8G$=IZgn)$HIGDxGmxBlW}pF|f!$U7HS`|7Y;aEPdoJ
zmXJjZ5Ci{=0d5Wafd`8+XY04+;aMx7-9baayb=`<&=)QNVBkK|RzV#Xs6(D>u+)g7
TpkI{((nUZKLLD*i3k-Y!k-thg

literal 0
HcmV?d00001

diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/src/build_model.py
index 71a24ae..dde3035 100644
--- a/modules/ml-pipeline/src/pipeline/src/build_model.py
+++ b/modules/ml-pipeline/src/pipeline/src/build_model.py
@@ -12,6 +12,7 @@ from core.Logger import logger
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceDataClient import DataClient
 from core.DataClient import dataclient_factory
+from core.DataHandler import datahandler_factory
 from core.MLModels import model_factory
 
 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@@ -22,6 +23,9 @@ prepare_data_params = yaml.safe_load(open(prepare_data_path))
 build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
 build_model_params = yaml.safe_load(open(build_model_path))
 
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(open(feature_process_path))
+
 
 def build_model(
     dataclient: DataClient,
@@ -40,16 +44,16 @@ def build_model(
     logger.info("--------------------------------------")
 
     if train_data is None:
-        # TODO: replace this with the data client to load
         if train_filepath is None:
-            raise ValueError(f"Need {train_filepath}")
-        train_data = pd.read_parquet(train_filepath)
+            raise ValueError(f"Need {train_filepath} if no data supplied")
+        train_data = datahandler.load_data(
+            dataclient=dataclient, location=train_filepath
+        )
 
     if test_data is None:
-        # TODO: replace this with the data client to load
         if test_filepath is None:
-            raise ValueError(f"Need {test_filepath}")
-        test_data = pd.read_parquet(test_filepath)
+            raise ValueError(f"Need {test_filepath} if no data supplied")
+        test_data = datahandler.load_data(dataclient=dataclient, location=test_filepath)
 
     logger.info("----------------------")
     logger.info("--- Training model ---")
@@ -76,7 +80,13 @@ if __name__ == "__main__":
     logger.info(f"--- Initiate DataClient ---")
     logger.info("----------------------------")
 
-    dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
+    dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"])
+
+    logger.info("-----------------------------")
+    logger.info(f"--- Initiate DataHandler ---")
+    logger.info("-----------------------------")
+
+    datahandler = datahandler_factory(prepare_data_params["datahandler_type"])
 
     logger.info("-------------------------")
     logger.info(f"--- Initiate MLModel ---")
@@ -92,7 +102,7 @@ if __name__ == "__main__":
     build_model(
         dataclient=dataclient,
         model=model,
-        target=build_model_params["target"],
+        target=feature_process_params["feature_processor_config"]["target"],
         model_save_location=build_model_params["model_save_filepath"],
         model_hyperparameters=build_model_params[model_type],
         train_filepath=prepare_data_params["output_train_filepath"],
diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml
index 8a16027..0a059d6 100644
--- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml
@@ -1,5 +1,4 @@
 model_type: SKLearnLinearRegression
-target: target
 model_save_filepath: ./data/model/model.joblib
 
 SKLearnLinearRegression: null
diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
index d84de4d..30dacbe 100644
--- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
@@ -4,4 +4,4 @@ feature_processor_config:
   subsample_seed: 0
   target: RDSAP_CHANGE
   drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
-  retain_features: null
+  retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml
index a370f9f..84f5897 100644
--- a/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml
@@ -1,2 +1,5 @@
+dataclient_type: local
+input_datahandler_type: parquet
+output_datahandler_type: json
 metrics_type: Regression
 metrics_output_filepath: ./metrics/metrics.json
diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml b/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml
index c7f1b2d..404c33f 100644
--- a/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml
@@ -1,3 +1,5 @@
+input_dataclient_type: local
+output_dataclient_type: local
 test_data_filepath: ./data/prepared_data/test.parquet
 predictions_output_filepath: ./data/predictions/predictions.parquet
 predictions_column_name: predictions
diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py
index 955800f..cb5b8d7 100644
--- a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py
+++ b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py
@@ -2,7 +2,9 @@
 Implementations of the DataClient Protocol
 """
 
+import os
 import boto3
+from pathlib import Path
 from io import BytesIO
 from typing import List
 from core.interface.InterfaceDataClient import DataClient
@@ -197,7 +199,12 @@ class LocalClient:
         """
         When the client is established, we can load data from a buffer
         """
-        ...
+        with open(location, "rb") as file:
+            # Read the entire file into a BytesIO object
+            buffer = BytesIO(file.read())
+        buffer.seek(0)
+
+        return buffer
 
     def load_database(self, database_location: dict) -> None:
         """
@@ -215,6 +222,8 @@ class LocalClient:
         """
         When the client is established, we can save out objects from a buffer
         """
+        if not Path(location).parent.exists():
+            os.makedirs(Path(location).parent)
 
         # Write the contents of the buffer to the local file
         with open(location, "wb") as f:
diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py b/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py
index dba35f9..f5c07c1 100644
--- a/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py
+++ b/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py
@@ -2,6 +2,7 @@
 Implementations of the datahandler Protocol
 """
 
+import json
 import pandas as pd
 from io import BytesIO
 from typing import List
@@ -15,6 +16,7 @@ def datahandler_factory(datahandler_type: str) -> DataHandler:
     """
     datahandler = {
         "parquet": ParquetHandler(),
+        "json": JSONHandler()
         # ADD MORE DATACLIENTS HERE
     }
 
@@ -52,3 +54,33 @@ class ParquetHandler:
         obj.to_parquet(parquet_buffer, index=False)
 
         dataclient.upload_data_from_buffer(buffer=parquet_buffer, location=location)
+
+
+class JSONHandler:
+    """
+    Load and save Parquet datasets
+    """
+
+    def load_data(self, dataclient: DataClient, location: str) -> pd.DataFrame:
+        """
+        When the client is established, we can load data
+        """
+        ...
+
+    def save_data(self, dataclient: DataClient, obj: dict, location: str) -> None:
+        """
+        When the client is established, we can save out objects
+        """
+        # Serialize the dictionary to a JSON-formatted string
+        json_string = json.dumps(obj)  # compact output; no indent is applied
+
+        # Convert the JSON string to bytes (UTF-8 encoding)
+        json_bytes = json_string.encode("utf-8")
+
+        # Create a BytesIO object and write the JSON bytes to it
+        buffer = BytesIO()
+        buffer.write(json_bytes)
+
+        buffer.seek(0)
+
+        dataclient.upload_data_from_buffer(buffer=buffer, location=location)
diff --git a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py
index 0b8568f..7f14e03 100644
--- a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py
+++ b/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py
@@ -54,7 +54,9 @@ class DataFrameFeatureProcessor:
         return df
 
     @staticmethod
-    def retain_features(df: pd.DataFrame, retain_features: List[str] | None = None):
+    def retain_features(
+        df: pd.DataFrame, target: str, retain_features: List[str] | None = None
+    ) -> pd.DataFrame:
         """
         Determine which columns to keep for modelling
         """
@@ -62,8 +64,8 @@ class DataFrameFeatureProcessor:
             retain_features = df.columns.to_list()
         else:
             if not set(retain_features).issubset(df.columns):
-                logger.error("Features defined is not contained in data")
-                exit(1)
+                raise ValueError("Features defined is not contained in data")
+            retain_features = [target] + retain_features
 
         df = df[retain_features]
 
@@ -83,7 +85,7 @@ class DataFrameFeatureProcessor:
     @staticmethod
     def apply_business_logic(
         df: pd.DataFrame, business_logic: Union[dict[str, Callable], None]
-    ):
+    ) -> pd.DataFrame:
         """
         If we need any additional business logic to be applied, post data cleaning
         """
@@ -99,7 +101,7 @@ class DataFrameFeatureProcessor:
     @staticmethod
     def generate_new_features(
         df: pd.DataFrame, new_feature_funcs: Union[dict[str, Callable], None]
-    ):
+    ) -> pd.DataFrame:
         """
         We can iterative over all keys (new feature column names), and apply their Calleabl function
         """
@@ -137,7 +139,9 @@ class DataFrameFeatureProcessor:
             df, drop_columns=feature_processor_config["drop_columns"]
         )
         df = self.retain_features(
-            df, retain_features=feature_processor_config["retain_features"]
+            df,
+            retain_features=feature_processor_config["retain_features"],
+            target=feature_processor_config["target"],
         )
         df = self.apply_business_logic(df, business_logic=business_logic)
         df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py
index 783bac9..1c21144 100644
--- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py
+++ b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py
@@ -19,7 +19,7 @@ class DataHandler(Protocol):
         ...
 
     def save_data(
-        self, dataclient: DataClient, obj: Union[pd.DataFrame, Any], location: str
+        self, dataclient: DataClient, obj: Union[pd.DataFrame, dict, Any], location: str
     ) -> None:
         """
         When the client is established, we can save out objects
diff --git a/modules/ml-pipeline/src/pipeline/src/data/.gitignore b/modules/ml-pipeline/src/pipeline/src/data/.gitignore
index b771993..7c8e294 100644
--- a/modules/ml-pipeline/src/pipeline/src/data/.gitignore
+++ b/modules/ml-pipeline/src/pipeline/src/data/.gitignore
@@ -1,4 +1,3 @@
 /prepared_data
 /model
 /predictions
-.DS_Store
diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock
index ed7c57c..7e8cd26 100644
--- a/modules/ml-pipeline/src/pipeline/src/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock
@@ -5,8 +5,8 @@ stages:
     deps:
     - path: prepare_data.py
       hash: md5
-      md5: 38b0836237bfa25ea0d71ca259610f4d
-      size: 3623
+      md5: 87a83e62512bff93c89f3e93c1ed248d
+      size: 5593
     params:
       configs/prepare_data.yaml:
         output_test_filepath: ./data/prepared_data/test.parquet
@@ -15,20 +15,20 @@ stages:
     outs:
     - path: data/prepared_data/
       hash: md5
-      md5: f0d462fe6b1a856a827409a745539285.dir
-      size: 36169
+      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
+      size: 4428909
       nfiles: 2
   build_model:
     cmd: python build_model.py
     deps:
     - path: build_model.py
       hash: md5
-      md5: 152d52b7754b4c6f96f3481dc26562fc
-      size: 3576
+      md5: 58315ea127dcc127e2c22ab1205fddb2
+      size: 3925
     - path: data/prepared_data
       hash: md5
-      md5: f0d462fe6b1a856a827409a745539285.dir
-      size: 36169
+      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
+      size: 4428909
       nfiles: 2
     params:
       configs/build_model.yaml:
@@ -37,64 +37,68 @@ stages:
           kernel: linear
         model_save_filepath: ./data/model/model.joblib
         model_type: SKLearnLinearRegression
-        target: target
     outs:
     - path: data/model/
       hash: md5
-      md5: fb7ae4137b445dc91e840b794d72e940.dir
-      size: 1096
+      md5: 40fa511f4f401f9d2c7da814afe198ef.dir
+      size: 920
       nfiles: 1
   generate_predictions:
     cmd: python generate_predictions.py
     deps:
     - path: data/model
       hash: md5
-      md5: fb7ae4137b445dc91e840b794d72e940.dir
-      size: 1096
+      md5: 40fa511f4f401f9d2c7da814afe198ef.dir
+      size: 920
       nfiles: 1
     - path: data/prepared_data
       hash: md5
-      md5: f0d462fe6b1a856a827409a745539285.dir
-      size: 36169
+      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
+      size: 4428909
       nfiles: 2
     - path: generate_predictions.py
       hash: md5
-      md5: 424b9d89045eaf8a5a167ab2e0e363ae
-      size: 3400
+      md5: 13e920c0bae8ac51dd907631578f7045
+      size: 4126
     params:
       configs/generate_predictions.yaml:
+        input_dataclient_type: local
+        output_dataclient_type: local
         predictions_column_name: predictions
         predictions_output_filepath: ./data/predictions/predictions.parquet
         test_data_filepath: ./data/prepared_data/test.parquet
     outs:
     - path: data/predictions/
       hash: md5
-      md5: 4d5854903b25bdae15d99c934ebcfb99.dir
-      size: 2531
+      md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
+      size: 945933
       nfiles: 1
   generate_metrics:
     cmd: python generate_metrics.py
     deps:
     - path: data/predictions
       hash: md5
-      md5: 4d5854903b25bdae15d99c934ebcfb99.dir
-      size: 2531
+      md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
+      size: 945933
       nfiles: 1
     - path: data/prepared_data
       hash: md5
-      md5: f0d462fe6b1a856a827409a745539285.dir
-      size: 36169
+      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
+      size: 4428909
       nfiles: 2
     - path: generate_metrics.py
       hash: md5
-      md5: b456e207b152298428ba79c083d1b6ff
-      size: 3728
+      md5: 6276995b5e860d0f0bb4545aa5f5d347
+      size: 4259
     params:
       configs/generate_metrics.yaml:
+        dataclient_type: local
+        input_datahandler_type: parquet
         metrics_output_filepath: ./metrics/metrics.json
         metrics_type: Regression
+        output_datahandler_type: json
     outs:
     - path: metrics/metrics.json
       hash: md5
-      md5: 3c9306e992b07491ff7e642949d6bc47
-      size: 182
+      md5: 995ccf3c6c3f6a975d22aa9bc9f4964e
+      size: 181
diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py
index 11b214c..a7def45 100644
--- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py
@@ -11,9 +11,11 @@ from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceMetrics import MLMetrics
 from core.interface.InterfaceDataClient import DataClient
+from core.interface.InterfaceDataHandler import DataHandler
 from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.MLMetrics import metrics_factory
+from core.DataHandler import datahandler_factory
 from core.Logger import logger
 
 
@@ -33,9 +35,14 @@ generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
 generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
 generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
 
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(open(feature_process_path))
+
 
 def generate_metrics(
     dataclient: DataClient,
+    input_datahandler: DataHandler,
+    output_datahandler: DataHandler,
     model: MLModel,
     metrics: MLMetrics,
     target: str,
@@ -52,15 +59,17 @@ def generate_metrics(
     logger.info("--- Loading test data ---")
     logger.info("-------------------------")
 
-    # TODO: replace with client loader here
-    test_data = pd.read_parquet(test_data_filepath)
+    test_data = input_datahandler.load_data(
+        dataclient=dataclient, location=test_data_filepath
+    )
 
     logger.info("---------------------------")
     logger.info("--- Loading predictions ---")
     logger.info("---------------------------")
 
-    # TODO: replace with client loader here
-    predictions = pd.read_parquet(predictions_output_filepath)
+    predictions = input_datahandler.load_data(
+        dataclient=dataclient, location=predictions_output_filepath
+    )
 
     logger.info("--------------------------")
     logger.info("--- Generating metrics ---")
@@ -75,13 +84,9 @@ def generate_metrics(
     logger.info("--- Saving metrics ---")
     logger.info("----------------------")
 
-    # TODO: replace with client
-
-    if not Path(metrics_output_filepath).parent.exists():
-        os.mkdir(Path(metrics_output_filepath).parent)
-
-    with open(metrics_output_filepath, "w") as f:
-        json.dump(metrics_output, f)
+    output_datahandler.save_data(
+        dataclient=dataclient, obj=metrics_output, location=metrics_output_filepath
+    )
 
 
 if __name__ == "__main__":
@@ -91,14 +96,22 @@ if __name__ == "__main__":
     logger.info("----------------------------")
 
     model = model_factory(build_model_params["model_type"])
-    dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
+    dataclient = dataclient_factory(generate_metrics_params["dataclient_type"])
+    input_datahandler = datahandler_factory(
+        generate_metrics_params["input_datahandler_type"]
+    )
+    output_datahandler = datahandler_factory(
+        generate_metrics_params["output_datahandler_type"]
+    )
     metrics = metrics_factory(generate_metrics_params["metrics_type"])
 
     generate_metrics(
         dataclient=dataclient,
+        input_datahandler=input_datahandler,
+        output_datahandler=output_datahandler,
         model=model,
         metrics=metrics,
-        target=build_model_params["target"],
+        target=feature_process_params["feature_processor_config"]["target"],
         test_data_filepath=generate_predictions_params["test_data_filepath"],
         predictions_output_filepath=generate_predictions_params[
             "predictions_output_filepath"
diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py
index 00d6fce..4ab1503 100644
--- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py
@@ -10,9 +10,10 @@ import pandas as pd
 from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceDataClient import DataClient
+from core.interface.InterfaceDataHandler import DataHandler
 from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
-from core.MLMetrics import metrics_factory
+from core.DataHandler import datahandler_factory
 from core.Logger import logger
 
 
@@ -29,9 +30,14 @@ generate_predictions_path = (
 )
 generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
 
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(open(feature_process_path))
+
 
 def generate_predictions(
-    dataclient: DataClient,
+    input_dataclient: DataClient,
+    output_dataclient: DataClient,
+    datahandler: DataHandler,
     model: MLModel,
     target: str,
     model_filepath: str,
@@ -47,8 +53,9 @@ def generate_predictions(
     logger.info("--- Loading test data ---")
     logger.info("-------------------------")
 
-    # TODO: replace with client loader here
-    test_data = pd.read_parquet(test_data_filepath)
+    test_data = datahandler.load_data(
+        dataclient=input_dataclient, location=test_data_filepath
+    )
 
     logger.info("---------------------")
     logger.info("--- Loading model ---")
@@ -60,7 +67,6 @@ def generate_predictions(
     logger.info("--- Generating predictions ---")
     logger.info("------------------------------")
 
-    # Clean test data for now
     prediction_data = (
         test_data.drop(columns=target) if target in test_data.columns else test_data
     )
@@ -71,13 +77,11 @@ def generate_predictions(
     logger.info("--- Saving predictions ---")
     logger.info("--------------------------")
 
-    # TODO: replace with client
-
-    if not Path(predictions_output_filepath).parent.exists():
-        os.mkdir(Path(predictions_output_filepath).parent)
-
-    pd.DataFrame(predictions, columns=[predictions_column_name]).to_parquet(
-        predictions_output_filepath
+    predictions_df = pd.DataFrame(predictions, columns=[predictions_column_name])
+    datahandler.save_data(
+        dataclient=output_dataclient,
+        obj=predictions_df,
+        location=predictions_output_filepath,
     )
 
 
@@ -88,12 +92,23 @@ if __name__ == "__main__":
     logger.info("----------------------------")
 
     model = model_factory(build_model_params["model_type"])
-    dataclient = dataclient_factory(prepare_data_params["dataclient_type"])
+    # Data may be loaded from and saved to different locations, so the client types are read from generate_predictions.yaml
+    # E.g. for metric runs, this will be a local data client
+    # For predictions, we will want a cloud data client
+    input_dataclient = dataclient_factory(
+        generate_predictions_params["input_dataclient_type"]
+    )
+    output_dataclient = dataclient_factory(
+        generate_predictions_params["output_dataclient_type"]
+    )
+    datahandler = datahandler_factory(prepare_data_params["datahandler_type"])
 
     generate_predictions(
-        dataclient=dataclient,
+        input_dataclient=input_dataclient,
+        output_dataclient=output_dataclient,
+        datahandler=datahandler,
         model=model,
-        target=build_model_params["target"],
+        target=feature_process_params["feature_processor_config"]["target"],
         model_filepath=build_model_params["model_save_filepath"],
         test_data_filepath=generate_predictions_params["test_data_filepath"],
         predictions_output_filepath=generate_predictions_params[