diff --git a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py index 0bfa37f..32e8a1b 100644 --- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py @@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: Remove the directory where artefacts are stored """ - logger.info("---------------------") logger.info(f"--- Run Clean up ---") - logger.info("---------------------") - logger.info("-------------------------") logger.info(f"--- Delete artefacts ---") - logger.info("-------------------------") artefact_directory_path = Path(artefacts_directory) @@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: logger.info(f"Removing the directory: {artefacts_directory}") shutil.rmtree(artefact_directory_path) - logger.info("-----------------------") logger.info(f"--- Delete metrics ---") - logger.info("-----------------------") metrics_directory_path = Path(metrics_directory) @@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") run_cleanup( artefacts_directory=startup_cleanup_params["artefacts"], metrics_directory=startup_cleanup_params["metrics"], ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 32daa19..ed7e057 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory from core.FeatureProcessor import feature_processor_factory from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") input_dataclient_type = prepare_data_params["input_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"] @@ -49,9 +45,7 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) -logger.info("----------------------------------") logger.info(f"--- Initiate FeatureProcessor ---") -logger.info("----------------------------------") feature_processor = feature_processor_factory( feature_process_params["feature_processor_type"] @@ -76,15 +70,11 @@ def prepare_data( :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode """ - logger.info("--------------------") logger.info("--- Loading data ---") - logger.info("--------------------") data = input_dataclient.load_data(location=data_filepath, load_config={}) - logger.info("--------------------------") logger.info("--- Feature Processing ---") - logger.info("--------------------------") data = feature_processor.feature_process( data, @@ -93,9 +83,7 @@ def prepare_data( new_feature_funcs=new_feature_funcs, ) - logger.info("----------------------") logger.info("--- Splitting data ---") - logger.info("----------------------") if train_proportion == 1: train = data @@ -108,9 +96,7 @@ def prepare_data( train = train.reset_index(drop=True) - logger.info("-----------------------") logger.info("--- Outputting data ---") - logger.info("-----------------------") output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None @@ -126,13 +112,9 @@ def prepare_data( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("---------------------------") logger.info(f"--- Prepare Data Stage ---") - logger.info("---------------------------") prepare_data( input_dataclient=input_dataclient, @@ -147,6 +129,4 @@ if __name__ == "__main__": new_feature_funcs=new_feature_funcs, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index cae5cfd..f3504a7 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory from configs.post_prediction_logic import post_prediction_logic from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"] test_filepath = prepare_data_params["output_test_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Output of previous prepare data step, will be where the data is dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(model_type) -logger.info("-------------------------") logger.info(f"--- Initiate Metrics ---") -logger.info("-------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,9 +67,8 @@ def build_model( test_data: Union[pd.DataFrame, None] = None, pipeline_mode: bool = False, ): - logger.info("--------------------------------------") + logger.info("--- Loading Data for build process ---") - logger.info("--------------------------------------") if train_data is None: if train_filepath is None: @@ -89,9 +80,7 @@ def build_model( raise ValueError(f"Need {test_filepath} if no data supplied") test_data = dataclient.load_data(location=test_filepath, load_config=None) - logger.info("----------------------") logger.info("--- Training model ---") - logger.info("----------------------") model.train_model( data=train_data.drop(columns=identifier_columns), @@ -99,32 +88,24 @@ def build_model( model_hyperparameters=model_hyperparameters, ) - logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") - logger.info("----------------------------------") fit_predictions = model.predict( data=train_data, post_prediction_logic=post_prediction_logic ) - logger.info("------------------------------") logger.info("--- Generating fit metrics ---") - logger.info("------------------------------") metrics_output = metrics.generate_metrics( target=train_data[target], predictions=pd.Series(fit_predictions), ) - logger.info("--------------------") logger.info("--- Saving model ---") - logger.info("--------------------") model.save_model(path=Path(model_save_location)) - logger.info("--------------------------") logger.info("--- Saving fit metrics ---") - logger.info("--------------------------") dataclient.save_data( obj=metrics_output, location=fit_metrics_filepath, save_config=None @@ -133,13 +114,9 @@ def build_model( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("--------------------------") logger.info(f"--- Build Model Stage ---") - logger.info("--------------------------") build_model( dataclient=dataclient, @@ -154,6 +131,4 @@ if __name__ == "__main__": fit_metrics_filepath=fit_metrics_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index 9461392..acb9e99 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -10,9 +10,7 @@ from core.Logger import logger from config import settings from generate_predictions import generate_predictions -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_column_name = generate_predictions_params["predictions_column_name"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client @@ -59,13 +53,9 @@ output_dataclient = dataclient_factory( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("----------------------------------") logger.info(f"--- Generate Predictions Stage---") - logger.info("----------------------------------") generate_predictions( input_dataclient=input_dataclient, @@ -78,6 +68,4 @@ if __name__ == "__main__": predictions_column_name=predictions_column_name, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 7b115a2..ddcd3cc 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -35,16 +33,11 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi predictions_column_name = generate_predictions_params["predictions_column_name"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] - -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Use data client for input and output, as we use dvc to cache later to the cloud dataclient_type = generate_metrics_params["dataclient_type"] @@ -53,9 +46,7 @@ dataclient = dataclient_factory( dataclient_config=client_params[dataclient_type], ) -logger.info("---------------------------") logger.info(f"--- Initiate MLMetrics ---") -logger.info("---------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,34 +66,26 @@ def generate_metrics( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------------") logger.info("--- Loading predictions ---") - logger.info("---------------------------") predictions = input_dataclient.load_data( location=predictions_output_filepath, load_config=None ) - logger.info("--------------------------") logger.info("--- Generating metrics ---") - logger.info("--------------------------") metrics_output = metrics.generate_metrics( target=test_data[target], predictions=pd.Series(predictions[predictions_column_name]), ) - logger.info("----------------------") logger.info("--- Saving metrics ---") - logger.info("----------------------") output_dataclient.save_data( obj=metrics_output, location=metrics_output_filepath, save_config=None @@ -111,13 +94,9 @@ def generate_metrics( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("------------------------------") logger.info(f"--- Generate Metrics Stage---") - logger.info("------------------------------") generate_metrics( input_dataclient=dataclient, @@ -131,6 +110,4 @@ if __name__ == "__main__": metrics_output_filepath=metrics_output_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py index a0fc231..2194063 100644 --- a/modules/ml-pipeline/src/pipeline/core/Logger.py +++ b/modules/ml-pipeline/src/pipeline/core/Logger.py @@ -22,6 +22,8 @@ def setup_logger(): # Add the stream handler to the logger logger.addHandler(stream_handler) + logger.propagate = False + return logger