# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 39 lines, 1.2 KiB, Python
import pandas as pd
|
|
from pathlib import Path
|
|
from etl.epc.DataProcessor import EPCDataProcessor
|
|
from etl.epc.Pipeline import EPCPipeline
|
|
|
|
# Folder scanned by main(): "local_data/all-domestic-certificates", located
# relative to the directory containing this file.
DATA_DIRECTORY = Path(__file__).parent.joinpath(
    "local_data", "all-domestic-certificates"
)
|
|
|
|
|
|
def main():
    """
    Run the EPC pipeline in training mode over every data sub-directory.

    Gathers the immediate sub-directories of ``DATA_DIRECTORY``, builds an
    ``EPCPipeline`` with ``use_parallel=True`` and an ``EPCDataProcessor``
    created with ``run_mode="training"``, and then calls the pipeline's
    ``run()`` method. Nothing is returned.
    """
    # Only directories are kept; plain files next to them are skipped.
    directories = list(filter(Path.is_dir, DATA_DIRECTORY.iterdir()))
    # Uncomment to work on a subset. NOTE: iterdir() yields entries in
    # arbitrary order, so sort first if the slice must be reproducible.
    # directories = directories[235:275]

    training_processor = EPCDataProcessor(run_mode="training")
    epc_pipeline = EPCPipeline(
        directories=directories,
        use_parallel=True,
        epc_data_processor=training_processor,
    )
    epc_pipeline.run()

    # For testing: dump the pipeline's compiled outputs to local parquet files.
    # dataset_df = epc_pipeline.compiled_dataset
    # dataset_df.to_parquet("refactor_datasets/dataset_with0perm_all.parquet")
    # pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows_with0perm_all.parquet")
    # pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages_with0perm_all.parquet")

    # For testing: read a previously stored dataset back from S3.
    # from utils.s3 import read_dataframe_from_s3_parquet
    # dataset = read_dataframe_from_s3_parquet(
    #     bucket_name="retrofit-data-dev",
    #     file_key="sap_change_model/dataset_test.parquet",
    # )
|
|
|
|
|
|
# Run the orchestration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|