Model/etl/epc/property_change_app.py
2025-11-01 15:34:45 +00:00

39 lines
1.2 KiB
Python

import pandas as pd
from pathlib import Path
from etl.epc.DataProcessor import EPCDataProcessor
from etl.epc.Pipeline import EPCPipeline
# Root folder, resolved relative to this module's location, whose immediate
# sub-directories are collected by main() and handed to the EPC pipeline.
DATA_DIRECTORY = Path(__file__).parent.joinpath(
    "local_data", "all-domestic-certificates"
)
def main():
    """
    Run the EPC pipeline over every sub-directory of ``DATA_DIRECTORY``.

    Gathers the immediate child directories of ``DATA_DIRECTORY``, builds an
    ``EPCPipeline`` with ``use_parallel=True`` and an ``EPCDataProcessor`` in
    ``"training"`` run mode, and calls its ``run()`` method.
    """
    certificate_dirs = [
        child for child in DATA_DIRECTORY.iterdir() if child.is_dir()
    ]
    # Debugging aid (disabled): restrict the run to a slice of the folders.
    # certificate_dirs = certificate_dirs[235:275]

    training_processor = EPCDataProcessor(run_mode="training")
    pipeline = EPCPipeline(
        directories=certificate_dirs,
        use_parallel=True,
        epc_data_processor=training_processor,
    )
    pipeline.run()

    # Testing aids (disabled): dump the compiled pipeline outputs to parquet.
    # dataset_df = pipeline.compiled_dataset
    # dataset_df.to_parquet("refactor_datasets/dataset_with0perm_all.parquet")
    # pd.DataFrame(pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows_with0perm_all.parquet")
    # pd.concat(pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages_with0perm_all.parquet")

    # Testing aid (disabled): read a dataset back from S3.
    # from utils.s3 import read_dataframe_from_s3_parquet
    # dataset = read_dataframe_from_s3_parquet(
    #     bucket_name="retrofit-data-dev",
    #     file_key="sap_change_model/dataset_test.parquet",
    # )
# Only kick off the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()