Fixed concat issue: build the all-equal-rows DataFrame with pd.DataFrame instead of pd.concat

This commit is contained in:
Michael Duong 2024-01-16 22:45:18 +00:00
parent 255bfc182d
commit df33ff93c3
3 changed files with 26 additions and 26 deletions

View file

@@ -135,6 +135,7 @@ class EPCDataProcessor:
self.fill_invalid_constituency_fields(ignore_step=ignore_step) self.fill_invalid_constituency_fields(ignore_step=ignore_step)
self.make_cleaning_averages(ignore_step=ignore_step) self.make_cleaning_averages(ignore_step=ignore_step)
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
# TODO: check if this has impact on training dataset # TODO: check if this has impact on training dataset
cleaned_data = self.apply_averages_cleaning( cleaned_data = self.apply_averages_cleaning(
@@ -152,7 +153,6 @@ class EPCDataProcessor:
self.data = self.data if cleaned_data is None else cleaned_data self.data = self.data if cleaned_data is None else cleaned_data
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step) self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower() self.cast_data_columns_to_lower()

View file

@@ -123,7 +123,7 @@ class EPCPipeline:
data = self.epc_data_processor.data data = self.epc_data_processor.data
epc_records = [EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient='records')] epc_records = [EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient='records')]
@@ -137,23 +137,23 @@ class EPCPipeline:
for directory in tqdm(self.directories): for directory in tqdm(self.directories):
self.process_directory(directory) self.process_directory(directory)
# save_dataframe_to_s3_parquet( save_dataframe_to_s3_parquet(
# df=self.compiled_dataset, df=self.compiled_dataset,
# bucket_name=self.epc_bucket_name, bucket_name=self.epc_bucket_name,
# file_key=self.epc_compiled_dataset_key, file_key=self.epc_compiled_dataset_key,
# ) )
# save_dataframe_to_s3_parquet( save_dataframe_to_s3_parquet(
# df=pd.concat(self.compiled_all_equal_rows), df=pd.DataFrame(self.compiled_all_equal_rows),
# bucket_name=self.epc_bucket_name, bucket_name=self.epc_bucket_name,
# file_key=self.epc_all_equal_rows_key, file_key=self.epc_all_equal_rows_key,
# ) )
# save_dataframe_to_s3_parquet( save_dataframe_to_s3_parquet(
# df=pd.concat(self.compiled_cleaning_averages), df=pd.concat(self.compiled_cleaning_averages),
# bucket_name=self.epc_bucket_name, bucket_name=self.epc_bucket_name,
# file_key=self.epc_cleaning_dataset_key, file_key=self.epc_cleaning_dataset_key,
# ) )
def process_directory(self, directory: Path): def process_directory(self, directory: Path):
""" """

View file

@@ -18,16 +18,16 @@ def main():
epc_pipeline.run() epc_pipeline.run()
# For testing # For testing
dataset_df = epc_pipeline.compiled_dataset # dataset_df = epc_pipeline.compiled_dataset
dataset_df.to_parquet("refactor_datasets/dataset.parquet") # dataset_df.to_parquet("refactor_datasets/dataset.parquet")
pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet") # pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet")
pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet") # pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet")
from utils.s3 import read_dataframe_from_s3_parquet # from utils.s3 import read_dataframe_from_s3_parquet
dataset = read_dataframe_from_s3_parquet( # dataset = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev", # bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_test.parquet", # file_key="sap_change_model/dataset_test.parquet",
) # )
if __name__ == "__main__": if __name__ == "__main__":