diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index e4d8729..2a3b661 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -1 +1,3 @@ .dev_env/ +data/ +__pycache__/ diff --git a/modules/ml-pipeline/README.MD b/modules/ml-pipeline/README.MD index 080838e..cf9316a 100644 --- a/modules/ml-pipeline/README.MD +++ b/modules/ml-pipeline/README.MD @@ -11,3 +11,11 @@ Within `src` folder, the structure is as follows: - i.e. for a product, we might require multuple pipelines do deliver a result - i.e. multiple models - these models can be all tracked within the same gto model registry + +To enable the virtual environment created in vscode: +- Open settings +- Search 'env' +- Under the extensions tab, there will be **Venv path** +- Copy the path of the '.dev_env' folder into there. +- When you select a kernel, click through create environment and refresh +- The virtual environment should be there diff --git a/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml index 1fd65b3..17b36ce 100644 --- a/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml @@ -1,4 +1,4 @@ -dataclient: minio +dataclient_type: minio data_location: s3://dev_bucket train_proportion: 0.8 output_location: ./data/prepared_data/ diff --git a/modules/ml-pipeline/src/pipeline/training/prepare_data.py b/modules/ml-pipeline/src/pipeline/training/prepare_data.py index 08e84d6..d25efd9 100644 --- a/modules/ml-pipeline/src/pipeline/training/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/training/prepare_data.py @@ -69,7 +69,7 @@ def prepare_data( # TODO: REPLACE WITH CLIENT output_path = Path(output_location) if not output_path.exists(): - os.mkdir(output_path) + os.makedirs(output_path) logger.info("--- Outputting train and test data ---") train.to_csv(output_path / 
output_train_filename, index=False)