from pathlib import Path

import pandas as pd

from etl.epc.DataProcessor import EPCDataProcessor
from etl.epc.Pipeline import EPCPipeline

# Folder (next to this script) that holds one sub-directory per input batch.
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"


def main() -> None:
    """Run the EPC pipeline over every sub-directory of ``DATA_DIRECTORY``.

    Collects the immediate sub-directories of ``DATA_DIRECTORY`` and hands
    them to an ``EPCPipeline`` configured with ``use_parallel=True`` and an
    ``EPCDataProcessor`` created with ``run_mode="training"``, then calls
    ``run()`` on the pipeline.

    Raises:
        FileNotFoundError: If ``DATA_DIRECTORY`` does not exist (raised by
            ``Path.iterdir``).
    """
    # ``Path.iterdir`` yields entries in arbitrary, filesystem-dependent order.
    # Sorting keeps the input order (and any debug slice taken from it, such
    # as the one below) reproducible between runs and machines.
    directories = sorted(
        entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()
    )
    # directories = directories[235:275]

    epc_pipeline = EPCPipeline(
        directories=directories,
        use_parallel=True,
        epc_data_processor=EPCDataProcessor(run_mode="training"),
    )
    epc_pipeline.run()

    # For testing: dump the pipeline outputs to local parquet files.
    # dataset_df = epc_pipeline.compiled_dataset
    # dataset_df.to_parquet("refactor_datasets/dataset_with0perm_all.parquet")
    # pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows_with0perm_all.parquet")
    # pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages_with0perm_all.parquet")

    # from utils.s3 import read_dataframe_from_s3_parquet
    # dataset = read_dataframe_from_s3_parquet(
    #     bucket_name="retrofit-data-dev",
    #     file_key="sap_change_model/dataset_test.parquet",
    # )


if __name__ == "__main__":
    main()