commit fb81191ec8496f93907b07a7c3dab73bfd3354a0 Author: Michael Duong Date: Fri Sep 8 18:57:06 2023 +0100 initial commit diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..b63e824 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1 @@ +# Pre commit hooks \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2bf78c8 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# ML Toolkit + +Creating a ML-toolkit that can be reused: + +- ML pipeline: + - A dummy pipeline that has data version control, experiment +tracking and a model registry + +- ML monitoring: + - A bolt-on service that can implement model monitoring + diff --git a/modules/ml-monitoring/README.MD b/modules/ml-monitoring/README.MD new file mode 100644 index 0000000..7053ca0 --- /dev/null +++ b/modules/ml-monitoring/README.MD @@ -0,0 +1,4 @@ +# ML-Monitoring + +Concept: +- To have a plug in a play model to do model monitoring \ No newline at end of file diff --git a/modules/ml-pipeline/src/Dockerfile b/modules/ml-pipeline/src/Dockerfile new file mode 100644 index 0000000..eaee453 --- /dev/null +++ b/modules/ml-pipeline/src/Dockerfile @@ -0,0 +1,5 @@ +# Dockerfile that grabs the current dvc hashed model +FROM python:3.9-slim + +RUN pip install -r experimentation/requirements/training.txt + diff --git a/modules/ml-pipeline/src/inference/README.MD b/modules/ml-pipeline/src/inference/README.MD new file mode 100644 index 0000000..48067dc --- /dev/null +++ b/modules/ml-pipeline/src/inference/README.MD @@ -0,0 +1,5 @@ +# Inference + +This folder contains the inference codebase to: +- Load a model +- Generate a prediction \ No newline at end of file diff --git a/modules/ml-pipeline/src/main_training.py b/modules/ml-pipeline/src/main_training.py new file mode 100644 index 0000000..d940f1a --- /dev/null +++ b/modules/ml-pipeline/src/main_training.py @@ -0,0 +1,3 @@ +""" +Pipeline that stitches all steps together +""" \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/README.md b/modules/ml-pipeline/src/training/README.md new file mode 100644 index 0000000..4f54fcf --- /dev/null +++ b/modules/ml-pipeline/src/training/README.md @@ -0,0 +1,37 @@ +# Training + +This folder contains the code base for training experimentation. + +To understand the pipeline, run `dvc dag` + +There are 3 main steps: +- Preparing data + - This is loading data (locally or from s3) + - Splitting the data into train and validation + - Creating additional features (if needed) + - **Data is cached** + - This will be down to the dvc remote location +- Build model + - For the prepared data, we build a model using our configurations + - Model is saved (locally or s3) + - **Model is cached** + - This will be down to the dvc remote location +- Generate Metrics + - For the given model, we generate metrics on validation data/test data + - **Metrics are cached** + - This will be down to the dvc remote location + +Workflow: +- Use `dvc metrics show` to view current metrics score +- Adjust parameters/ codebase +- When happy with changes, use `dvc exp run` to trigger an experiment +- Due to cache, only need stages are re-run +- Use `dvc metrics diff` to check the change in metrics +- Use `dvc exp show` to view all experiments + - NOTE: the last experiment will always be applied to the workspace! +- After running experiments, you can apply the the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` + - This experiment will have the corresponding .dvc files for the hashed model and data +- Use version control as normal + - git add, git commit etc + + \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/__init__.py b/modules/ml-pipeline/src/training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/ml-pipeline/src/training/build_model.py b/modules/ml-pipeline/src/training/build_model.py new file mode 100644 index 0000000..f2c5961 --- /dev/null +++ b/modules/ml-pipeline/src/training/build_model.py @@ -0,0 +1,104 @@ +""" +Second Pipieline step: +Once we have the features, we build a model +""" + +import os +import yaml +import pandas as pd +from typing import Union +from pathlib import Path +from core.Logger import logger +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.DataClient import dataclient_factory +from core.MLModels import model_factory + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +def build_model( + dataclient: DataClient, + model: MLModel, + target: str, + model_save_location: str, + model_hyperparameters: dict, + train_location: Union[str, None] = None, + test_location: Union[str, None] = None, + train_data: Union[pd.DataFrame, None] = None, + test_data: Union[pd.DataFrame, None] = None, + pipeline_mode: bool = False +): + logger.info("--------------------------------------") + logger.info("--- Loading Data for build process ---") + logger.info("--------------------------------------") + + if train_data is None: + # TODO: replace this with the data client to load + if train_location is None: + raise ValueError(f"Need {train_location}") + train_data = pd.read_parquet(train_location) + + if test_data is None: + # TODO: replace this with the data client to load + if test_location is None: + raise ValueError(f"Need {test_location}") + test_data = pd.read_parquet(test_location) + + logger.info("----------------------") + logger.info("--- Training model ---") + logger.info("----------------------") + + model.train_model(data=train_data, target=target, model_hyperparameters=model_hyperparameters) + + logger.info("--------------------") + logger.info("--- Saving model ---") + logger.info("--------------------") + + model.save_model(path=Path(model_save_location)) + + # TODO: replace this with the data client to load + # TODO: can fine tune model here if need with the test data + + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + logger.info("----------------------------") + logger.info(f"--- Initiate DataClient ---") + logger.info("----------------------------") + + dataclient = dataclient_factory(prepare_data_params['client_type']) + + logger.info("-------------------------") + logger.info(f"--- Initiate MLModel ---") + logger.info("-------------------------") + + model_type = build_model_params['model_type'] + model = model_factory(model_type) + + logger.info("--------------------------") + logger.info(f"--- Build Model Stage ---") + logger.info("--------------------------") + + build_model( + dataclient=dataclient, + model=model, + target=build_model_params['target'], + model_save_location=build_model_params['model_save_location'], + model_hyperparameters=build_model_params[model_type], + train_location=prepare_data_params['output_train_filename'], + test_location=prepare_data_params['output_test_filename'] + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------===---") \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/configs/build_model.yaml b/modules/ml-pipeline/src/training/configs/build_model.yaml new file mode 100644 index 0000000..5f99a7f --- /dev/null +++ b/modules/ml-pipeline/src/training/configs/build_model.yaml @@ -0,0 +1,8 @@ +model_type: SKLearnLinearRegression +train_location: ./data/prepared_data/train.parquet +target: target +test_location: ./data/prepared_data/test.parquet +model_save_location: ./data/model/model.joblib + +SKLearnSVMRegression: + kernel: "linear" \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/configs/configs.py b/modules/ml-pipeline/src/training/configs/configs.py new file mode 100644 index 0000000..19d6f9d --- /dev/null +++ b/modules/ml-pipeline/src/training/configs/configs.py @@ -0,0 +1,3 @@ +""" +Stitch all yaml configuration files together, override some settings (such as bucket location) based off environment variables +""" \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/configs/generate_metrics.yaml b/modules/ml-pipeline/src/training/configs/generate_metrics.yaml new file mode 100644 index 0000000..3f695a8 --- /dev/null +++ b/modules/ml-pipeline/src/training/configs/generate_metrics.yaml @@ -0,0 +1,4 @@ +metrics_type: Regression +test_data_location: ./data/prepared_data/ +predictions_output_location: ./data/predictions/predictions.csv +metrics_output_location: ./metrics/metrics.json \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/configs/prepare_data.yaml b/modules/ml-pipeline/src/training/configs/prepare_data.yaml new file mode 100644 index 0000000..aecd789 --- /dev/null +++ b/modules/ml-pipeline/src/training/configs/prepare_data.yaml @@ -0,0 +1,8 @@ +dataclient: minio +data_location: s3://dev_bucket +train_proportion: 0.8 +output_location: ./data/prepared_data/ +output_train_filename: train.parquet +output_test_filename: test.parquet + +cache_o diff --git a/modules/ml-pipeline/src/training/core/DataClient.py b/modules/ml-pipeline/src/training/core/DataClient.py new file mode 100644 index 0000000..a46ff5f --- /dev/null +++ b/modules/ml-pipeline/src/training/core/DataClient.py @@ -0,0 +1,66 @@ +""" +Implementations of the DataClient Protocol +""" + +import pandas as pd +from typing import List +from core.interface.InterfaceDataClient import DataClient + +def dataclient_factory(dataclient_type: str) -> DataClient: + """ + Determine which dataclient to use + """ + dataclients = { + "minio": MinioClient(), + # ADD MORE DATACLIENTS HERE + } + + if dataclient_type not in dataclients: + raise ValueError("Dataclient type specified is not in factory") + + return dataclients[dataclient_type] + + +def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str): + if not set(keys_1).issubset(keys_2): + raise ValueError(f"Incorrect {config_type} keys specified") + +class MinioClient: + """ + Using the Minio s3 client, to do local testing + """ + + ACCEPTED_CONFIG_KEYS = ["aws_access_key_id", "aws_secret_access_key", "endpoint_url"] + ACCEPTED_LOAD_CONFIG_KEYS = [] + ACCEPTED_SAVE_CONFIG_KEYS = [] + + def ingest_configurations(self, config: dict) -> None: + """ + Load all configuration into the instance (self.config) + """ + validate_dict_keys(keys_1=list(config.keys()), keys_2=self.ACCEPTED_CONFIG_KEYS, config_type="config") + + self.config = config + + def establish_client(self) -> None: + """ + With the given configurations, create the connection to the client (self.client) + """ + + ... + + def load_data(self, load_config: dict) -> pd.DataFrame: + """ + When the client is established, we can load data + """ + validate_dict_keys(keys_1=list(load_config.keys()), keys_2=self.ACCEPTED_LOAD_CONFIG_KEYS, config_type="load_config") + + return pd.DataFrame() + + def save_data(self, obj: object, save_config: dict) -> None: + """ + When the client is established, we can save out objects + """ + validate_dict_keys(keys_1=list(save_config.keys()), keys_2=self.ACCEPTED_SAVE_CONFIG_KEYS, config_type="save_config") + + diff --git a/modules/ml-pipeline/src/training/core/Logger.py b/modules/ml-pipeline/src/training/core/Logger.py new file mode 100644 index 0000000..8603fff --- /dev/null +++ b/modules/ml-pipeline/src/training/core/Logger.py @@ -0,0 +1,26 @@ +""" +Logger that will be used throughout the application +""" + +import logging + +def setup_logger(): + # Create a logger + logger = logging.getLogger() + + # Set the log level + logger.setLevel(logging.INFO) + + # Create a formatter + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + + # Create a stream handler to direct logs to stdout + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + + # Add the stream handler to the logger + logger.addHandler(stream_handler) + + return logger + +logger = setup_logger() \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/core/MLMetrics.py b/modules/ml-pipeline/src/training/core/MLMetrics.py new file mode 100644 index 0000000..d656884 --- /dev/null +++ b/modules/ml-pipeline/src/training/core/MLMetrics.py @@ -0,0 +1,51 @@ +""" +Implementation of MLMetrics, all of which will have two methods: +- Generate Metrics Suite +- Generate Plot Suite +""" + +import pandas as pd +from typing import Union +from sklearn.metrics import ( + mean_absolute_error, + median_absolute_error, + mean_squared_error, + mean_absolute_percentage_error +) +from core.interface.InterfaceMetrics import MLMetrics + +def metrics_factory(metrics_type: str) -> MLMetrics: + metrics = { + "Regression": RegressionMetrics(), + # ADD OTHER METRIC CLASSES HERE + } + + if metrics_type not in metrics: + raise ValueError("Metrics type specified is not in factory") + + return metrics[metrics_type] + +class RegressionMetrics: + + METRIC_TO_APPLY = [ + mean_absolute_error, + median_absolute_error, + mean_squared_error, + mean_absolute_percentage_error, + # max_error + ] + + def generate_metrics( + self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series + ) -> dict: + """ + Method to generate metrics + """ + + metric_dict = {} + for metric_function in self.METRIC_TO_APPLY: + metric_dict[metric_function.__name__] = metric_function( + target, predictions + ) + + return metric_dict \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/core/MLModels.py b/modules/ml-pipeline/src/training/core/MLModels.py new file mode 100644 index 0000000..8b32957 --- /dev/null +++ b/modules/ml-pipeline/src/training/core/MLModels.py @@ -0,0 +1,127 @@ +"""" +Implementations of MLModels, all of which will have four methods to: +- Load model +- Save Model +- Train Model +- Geenrate predictions +""" + +import os +import joblib +import pandas as pd +from pathlib import Path +from typing import Union, List +from sklearn import linear_model +from sklearn.svm import SVR +from core.interface.InterfaceModels import MLModel + +def model_factory(model_type: str) -> MLModel: + """ + Determine which model to use from the model type + """ + models = { + "SKLearnLinearRegression": SKLearnLinearRegression(), + "SKLearnSVMRegression": SKLearnSVMRegression(), + # ADD OTHER MODELS HERE + } + + if model_type not in models: + raise ValueError("Model type specified is not in factory") + + return models[model_type] + + +def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str): + if not set(keys_1).issubset(keys_2): + raise ValueError(f"Incorrect {config_type} keys specified") + + +class SKLearnLinearRegression: + + def load_model(self, path: Union[Path, str]) -> None: + """ + Method to load a model + """ + string_path = str(path) + self.model = joblib.load(string_path) + + def save_model(self, path: Path) -> str: + """ + Method to save a model + """ + if self.model is None: + raise KeyError("No model trained/ loaded - unable to save") + + if not path.parent.exists(): + os.mkdir(path.parent) + + string_path = str(path) + joblib.dump(self.model, string_path) + + return string_path + + def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None: + """ + Method to train a model + """ + self.model = linear_model.LinearRegression() + + x_train = data.iloc[:, data.columns != target] + y_train = data[target] + self.model.fit(x_train, y_train) + + + def predict(self, data: pd.DataFrame) -> pd.Series: + """ + Method to predict + """ + self.predictions = pd.Series(self.model.predict(data)) + return self.predictions + + +class SKLearnSVMRegression: + + MODEL_HYPERPARAMETERS = ["kernel"] + + def load_model(self, path: Union[Path, str]) -> None: + """ + Method to load a model + """ + string_path = str(path) + self.model = joblib.load(string_path) + + def save_model(self, path: Path) -> str: + """ + Method to save a model + """ + if self.model is None: + raise KeyError("No model trained/ loaded - unable to save") + + if not path.parent.exists(): + os.mkdir(path.parent) + + string_path = str(path) + joblib.dump(self.model, string_path) + + return string_path + + def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None: + """ + Method to train a model + """ + + validate_dict_keys(list(model_hyperparameters.keys()), self.MODEL_HYPERPARAMETERS, config_type="Train_model_config") + + self.model = SVR(kernel=model_hyperparameters['kernel']) + + x_train = data.iloc[:, data.columns != target] + y_train = data[target] + self.model.fit(x_train, y_train) + + + def predict(self, data: pd.DataFrame) -> pd.Series: + """ + Method to predict + """ + self.predictions = pd.Series(self.model.predict(data)) + return self.predictions \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/core/__init__.py b/modules/ml-pipeline/src/training/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/ml-pipeline/src/training/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/training/core/interface/InterfaceDataClient.py new file mode 100644 index 0000000..1a0c9e1 --- /dev/null +++ b/modules/ml-pipeline/src/training/core/interface/InterfaceDataClient.py @@ -0,0 +1,34 @@ +""" +Interface for all DataClient i.e. s3, database, local etc +""" + +import pandas as pd +from typing import Protocol + +class DataClient(Protocol): + """ + Declare the methods required for a DataClient + """ + + def ingest_configurations(self, config: dict) -> None: + """ + Load all configuration into the instance (self.config) + """ + ... + + def establish_client(self) -> None: + """ + With the given configurations, create the connection to the client (self.client) + """ + ... + + def load_data(self, load_config: dict) -> pd.DataFrame: + """ + When the client is established, we can load data + """ + ... + + def save_data(self, obj: object, save_config: dict) -> None: + """ + When the client is established, we can save out objects + """ diff --git a/modules/ml-pipeline/src/training/core/interface/InterfaceMetrics.py b/modules/ml-pipeline/src/training/core/interface/InterfaceMetrics.py new file mode 100644 index 0000000..f7bff1e --- /dev/null +++ b/modules/ml-pipeline/src/training/core/interface/InterfaceMetrics.py @@ -0,0 +1,19 @@ +""" +Define the interface for creating metrics +""" + +import pandas as pd +from typing import Protocol, Union + +class MLMetrics(Protocol): + """ + All metrics will need to have the following interface to interact with the ML Pipeline + """ + + def generate_metrics( + self, target: Union[pd.DataFrame, pd.Series], predictions: pd.Series + ) -> dict: + """ + Method to generate metrics + """ + ... \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/core/interface/InterfaceModels.py b/modules/ml-pipeline/src/training/core/interface/InterfaceModels.py new file mode 100644 index 0000000..6daee05 --- /dev/null +++ b/modules/ml-pipeline/src/training/core/interface/InterfaceModels.py @@ -0,0 +1,36 @@ +""" +Define the protocol for models in this pipeline +""" + +import pandas as pd +from pathlib import Path +from typing import Protocol, Union + +class MLModel(Protocol): + """ + All models will need to have the following interface to interact with the ML pipeline + """ + + def load_model(self, path: Union[Path, str]) -> None: + """ + Method to load a model + """ + ... + + def save_model(self, path: Path) -> str: + """ + Method to save a model + """ + ... + + def train_model(self, data: pd.DataFrame, target: str, model_hyperparameters: dict) -> None: + """ + Method to train a model + """ + ... + + def predict(self, data: pd.DataFrame) -> pd.Series: + """ + Method to predict + """ + ... \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/core/interface/__init__.py b/modules/ml-pipeline/src/training/core/interface/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/ml-pipeline/src/training/generate_metrics.py b/modules/ml-pipeline/src/training/generate_metrics.py new file mode 100644 index 0000000..3642009 --- /dev/null +++ b/modules/ml-pipeline/src/training/generate_metrics.py @@ -0,0 +1,118 @@ +""" +Third part of the pipeline: +After the model is built, we can evaluate its performance +""" + +import os +import yaml +import json +import pandas as pd +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceMetrics import MLMetrics +from core.interface.InterfaceDataClient import DataClient +from core.DataClient import dataclient_factory +from core.MLModels import model_factory +from core.MLMetrics import metrics_factory +from core.Logger import logger + + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml" +generate_metrics_params = yaml.safe_load(open(generate_metrics_path)) + + +def generate_metrics( + dataclient: DataClient, + model: MLModel, + metrics: MLMetrics, + target: str, + model_location: str, + test_data_location: str, + predictions_output_location: str, + metrics_output_location: str +): + """ + For a given model, we generate prediction and evaluate this against the true target + """ + + logger.info("-------------------------") + logger.info("--- Loading test data ---") + logger.info("-------------------------") + + # TODO: replace with client loader here + test_data = pd.read_parquet(test_data_location) + + logger.info("---------------------") + logger.info("--- Loading model ---") + logger.info("---------------------") + + model.load_model(model_location) + + logger.info("------------------------------") + logger.info("--- Generating predictions ---") + logger.info("------------------------------") + + # Clean test data for now + prediction_data = test_data.drop(columns=target) if target in test_data.columns else test_data + + predictions = model.predict(data=prediction_data) + + logger.info("--------------------------") + logger.info("--- Saving predictions ---") + logger.info("--------------------------") + + # TODO: replace with client + + if not Path(predictions_output_location).parent.exists(): + os.mkdir(Path(predictions_output_location).parent) + + predictions.to_json(predictions_output_location) + + logger.info("--------------------------") + logger.info("--- Generating metrics ---") + logger.info("--------------------------") + + metrics_output = metrics.generate_metrics( + target=test_data[target], predictions=predictions + ) + + logger.info("----------------------") + logger.info("--- Saving metrics ---") + logger.info("----------------------") + + # TODO: replace with client + + if not Path(metrics_output_location).parent.exists(): + os.mkdir(Path(metrics_output_location).parent) + + with open(metrics_output_location, "w") as f: + json.dump(metrics_output, f) + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + model = model_factory(build_model_params['model_type']) + dataclient = dataclient_factory(prepare_data_params['dataclient_type']) + metrics = metrics_factory(generate_metrics_params['metrics_type']) + + generate_metrics( + dataclient=dataclient, + model=model, + metrics=metrics, + target=build_model_params["target"], + model_location=build_model_params["model_save_location"], + test_data_location=generate_metrics_params["test_data_location"], + predictions_output_location=generate_metrics_params["predictions_output_location"], + metrics_output_location=generate_metrics_params["metrics_output_location"] + ) \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/prepare_data.py b/modules/ml-pipeline/src/training/prepare_data.py new file mode 100644 index 0000000..c8b6524 --- /dev/null +++ b/modules/ml-pipeline/src/training/prepare_data.py @@ -0,0 +1,103 @@ +""" +First part of the pipeline: +Loading data from a client +""" + +import os +import yaml +import pandas as pd +from typing import Optional, Tuple, Union +from pathlib import Path +from sklearn.datasets import load_diabetes +from sklearn.model_selection import train_test_split +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.DataClient import dataclient_factory + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +params_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +params = yaml.safe_load(open(params_path)) + + +def use_dummy_data() -> pd.DataFrame: + diabetes_data = load_diabetes() + + x_data = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names']) # type: ignore + y_data = pd.DataFrame(diabetes_data['target'], columns=['target']) # type: ignore + + data = pd.concat([x_data, y_data], axis=1) + return data + + +def prepare_data(dataclient: DataClient, train_proportion: float, output_location: str, output_train_filename: str = "train.parquet", output_test_filename: str = "test.parquet") -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Given a client and location, load data into the pipeline + :param dataclient: DataClient, Determines how to get data from the given provider (cloud or local) + :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode + """ + + logger.info("--------------------") + logger.info("--- Loading data ---") + logger.info("--------------------") + + # TODO: REPLACE THIS WIL CLIENT AND LOAD DATA + data = use_dummy_data() + + logger.info("----------------------") + logger.info("--- Splitting data ---") + logger.info("----------------------") + + train, test = train_test_split( + data, train_size=train_proportion, test_size=(1-train_proportion) + ) + + logger.info("--------------------------") + logger.info("--- Feature Processing ---") + logger.info("--------------------------") + + + + logger.info("-----------------------") + logger.info("--- Outputting data ---") + logger.info("-----------------------") + + # TODO: REPLACE WITH CLIENT + output_path = Path(output_location) + if not output_path.exists(): + os.mkdir(output_path) + + logger.info("--- Outputting train and test data ---") + train.to_csv(output_path / output_train_filename, index=False) + test.to_csv(output_path/ output_test_filename, index=False) + + # client.save_data(obj=train) + # client.save_data(obj=test) + + return train, test + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + logger.info("----------------------------") + logger.info(f"--- Initiate DataClient ---") + logger.info("----------------------------") + + dataclient = dataclient_factory(params['dataclient_type']) + + logger.info("---------------------------") + logger.info(f"--- Prepare Data Stage ---") + logger.info("---------------------------") + + prepare_data( + dataclient=dataclient, + train_proportion=params['train_proportion'], + output_location=params['output_location'] + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------===---") diff --git a/modules/ml-pipeline/src/training/requirements/requirements-dev.txt b/modules/ml-pipeline/src/training/requirements/requirements-dev.txt new file mode 100644 index 0000000..87831fb --- /dev/null +++ b/modules/ml-pipeline/src/training/requirements/requirements-dev.txt @@ -0,0 +1,9 @@ +joblib==1.3.2 +boto3==1.28.41 +pandas==1.5.3 +dvc==3.18.0 +gto==1.0.4 +scikit-learn==1.3.0 +pre-commit==3.3.3 +sphinx==7.2.5 +sphinx_rtd_theme==1.3.0 \ No newline at end of file diff --git a/modules/ml-pipeline/src/training/requirements/requirements.txt b/modules/ml-pipeline/src/training/requirements/requirements.txt new file mode 100644 index 0000000..8775629 --- /dev/null +++ b/modules/ml-pipeline/src/training/requirements/requirements.txt @@ -0,0 +1,5 @@ +boto3==1.28.41 +pandas==1.5.3 +dvc==3.18.0 +gto==1.0.4 +scikit-learn==1.3.0 \ No newline at end of file