diff --git a/modules/ml-pipeline/src/pipeline/training/build_model.py b/modules/ml-pipeline/src/pipeline/training/build_model.py index 77dd25e..8d8bcca 100644 --- a/modules/ml-pipeline/src/pipeline/training/build_model.py +++ b/modules/ml-pipeline/src/pipeline/training/build_model.py @@ -96,7 +96,7 @@ if __name__ == "__main__": dataclient=dataclient, model=model, target=build_model_params["target"], - model_save_location=build_model_params["model_save_location"], + model_save_location=build_model_params["model_save_filepath"], model_hyperparameters=build_model_params[model_type], train_filepath=prepare_data_params["output_train_filepath"], test_filepath=prepare_data_params["output_test_filepath"], diff --git a/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml index 940f5ce..8a16027 100644 --- a/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/training/configs/build_model.yaml @@ -1,9 +1,6 @@ model_type: SKLearnLinearRegression -train_location: ./data/prepared_data/train.parquet target: target -test_location: ./data/prepared_data/test.parquet -model_save_location: ./data/model/model.joblib - +model_save_filepath: ./data/model/model.joblib SKLearnLinearRegression: null diff --git a/modules/ml-pipeline/src/pipeline/training/data/.gitignore b/modules/ml-pipeline/src/pipeline/training/data/.gitignore new file mode 100644 index 0000000..7c8e294 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/training/data/.gitignore @@ -0,0 +1,3 @@ +/prepared_data +/model +/predictions diff --git a/modules/ml-pipeline/src/pipeline/training/dvc.lock b/modules/ml-pipeline/src/pipeline/training/dvc.lock new file mode 100644 index 0000000..ebafd09 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/training/dvc.lock @@ -0,0 +1,99 @@ +schema: '2.0' +stages: + prepare_data: + cmd: python prepare_data.py + deps: + - path: prepare_data.py + hash: md5 + md5: 113f292aa8fa1ecec56b21cfc7f657a9 + size: 3623 + params: + configs/prepare_data.yaml: + output_test_filepath: ./data/prepared_data/test.parquet + output_train_filepath: ./data/prepared_data/train.parquet + train_proportion: 0.8 + outs: + - path: data/prepared_data/ + hash: md5 + md5: 8268b5117320d2589594a0eda859c5e5.dir + size: 36337 + nfiles: 2 + build_model: + cmd: python build_model.py + deps: + - path: build_model.py + hash: md5 + md5: 43ff6a4781efacff4234fe261022a5dd + size: 3576 + - path: data/prepared_data + hash: md5 + md5: 8268b5117320d2589594a0eda859c5e5.dir + size: 36337 + nfiles: 2 + params: + configs/build_model.yaml: + SKLearnLinearRegression: + SKLearnSVMRegression: + kernel: linear + model_save_filepath: ./data/model/model.joblib + model_type: SKLearnLinearRegression + target: target + outs: + - path: data/model/ + hash: md5 + md5: 85ed2d0d4f179e038b8ffd296b86f630.dir + size: 1096 + nfiles: 1 + generate_predictions: + cmd: python generate_predictions.py + deps: + - path: data/model + hash: md5 + md5: 85ed2d0d4f179e038b8ffd296b86f630.dir + size: 1096 + nfiles: 1 + - path: data/prepared_data + hash: md5 + md5: 8268b5117320d2589594a0eda859c5e5.dir + size: 36337 + nfiles: 2 + - path: generate_predictions.py + hash: md5 + md5: 209fe6efbebfd3d7aa1a1bb27885d3c1 + size: 3114 + params: + configs/generate_predictions.yaml: + predictions_output_filepath: ./data/predictions/predictions.parquet + test_data_filepath: ./data/prepared_data/test.parquet + outs: + - path: data/predictions/ + hash: md5 + md5: 7bb333329935cc66390475a3ad6deaf9.dir + size: 2531 + nfiles: 1 + generate_metrics: + cmd: python generate_metrics.py + deps: + - path: data/predictions + hash: md5 + md5: 7bb333329935cc66390475a3ad6deaf9.dir + size: 2531 + nfiles: 1 + - path: data/prepared_data + hash: md5 + md5: 8268b5117320d2589594a0eda859c5e5.dir + size: 36337 + nfiles: 2 + - path: generate_metrics.py + hash: md5 + md5: 4c1379bf37f5e5ad5843eb5b5a22ebc5 + size: 3407 + params: + configs/generate_metrics.yaml: + metrics_output_filepath: ./metrics/metrics.json + metrics_type: Regression + outs: + - path: metrics/metrics.json + hash: md5 + md5: bf7ed6a9b378b42fb3d7b6d16c76655f + size: 183 diff --git a/modules/ml-pipeline/src/pipeline/training/dvc.yaml b/modules/ml-pipeline/src/pipeline/training/dvc.yaml index e69de29..b3a374b 100644 --- a/modules/ml-pipeline/src/pipeline/training/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/training/dvc.yaml @@ -0,0 +1,41 @@ +stages: + prepare_data: + cmd: python prepare_data.py + deps: + - prepare_data.py + params: + - configs/prepare_data.yaml: + - output_test_filepath + - output_train_filepath + - train_proportion + outs: + - data/prepared_data/ + build_model: + cmd: python build_model.py + deps: + - build_model.py + - data/prepared_data + params: + - configs/build_model.yaml: + outs: + - data/model/ + generate_predictions: + cmd: python generate_predictions.py + deps: + - generate_predictions.py + - data/prepared_data + - data/model + params: + - configs/generate_predictions.yaml: + outs: + - data/predictions/ + generate_metrics: + cmd: python generate_metrics.py + deps: + - generate_metrics.py + - data/prepared_data + - data/predictions + params: + - configs/generate_metrics.yaml: + outs: + - metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/training/generate_predictions.py b/modules/ml-pipeline/src/pipeline/training/generate_predictions.py index ff67344..321f04e 100644 --- a/modules/ml-pipeline/src/pipeline/training/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/training/generate_predictions.py @@ -34,7 +34,7 @@ def generate_predictions( dataclient: DataClient, model: MLModel, target: str, - model_location: str, + model_filepath: str, test_data_filepath: str, predictions_output_filepath: str, ): @@ -53,7 +53,7 @@ def generate_predictions( logger.info("--- Loading model ---") logger.info("---------------------") - model.load_model(model_location) + model.load_model(model_filepath) logger.info("------------------------------") logger.info("--- Generating predictions ---") @@ -93,7 +93,7 @@ if __name__ == "__main__": dataclient=dataclient, model=model, target=build_model_params["target"], - model_location=build_model_params["model_save_location"], + model_filepath=build_model_params["model_save_filepath"], test_data_filepath=generate_predictions_params["test_data_filepath"], predictions_output_filepath=generate_predictions_params[ "predictions_output_filepath" diff --git a/modules/ml-pipeline/src/pipeline/training/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/training/metrics/.gitignore new file mode 100644 index 0000000..fbadd1c --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/training/metrics/.gitignore @@ -0,0 +1 @@ +/metrics.json