Fixed concat issue: compiled_all_equal_rows is now built with pd.DataFrame instead of pd.concat before saving

This commit is contained in:
Michael Duong 2024-01-16 22:45:18 +00:00
parent 255bfc182d
commit df33ff93c3
3 changed files with 26 additions and 26 deletions

View file

@@ -135,6 +135,7 @@ class EPCDataProcessor:
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
self.make_cleaning_averages(ignore_step=ignore_step)
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
# TODO: check if this has impact on training dataset
cleaned_data = self.apply_averages_cleaning(
@@ -152,7 +153,6 @@ class EPCDataProcessor:
self.data = self.data if cleaned_data is None else cleaned_data
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower()

View file

@@ -123,7 +123,7 @@ class EPCPipeline:
data = self.epc_data_processor.data
epc_records = [EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient='records')]
@@ -137,23 +137,23 @@ class EPCPipeline:
for directory in tqdm(self.directories):
self.process_directory(directory)
# save_dataframe_to_s3_parquet(
# df=self.compiled_dataset,
# bucket_name=self.epc_bucket_name,
# file_key=self.epc_compiled_dataset_key,
# )
save_dataframe_to_s3_parquet(
df=self.compiled_dataset,
bucket_name=self.epc_bucket_name,
file_key=self.epc_compiled_dataset_key,
)
# save_dataframe_to_s3_parquet(
# df=pd.concat(self.compiled_all_equal_rows),
# bucket_name=self.epc_bucket_name,
# file_key=self.epc_all_equal_rows_key,
# )
save_dataframe_to_s3_parquet(
df=pd.DataFrame(self.compiled_all_equal_rows),
bucket_name=self.epc_bucket_name,
file_key=self.epc_all_equal_rows_key,
)
# save_dataframe_to_s3_parquet(
# df=pd.concat(self.compiled_cleaning_averages),
# bucket_name=self.epc_bucket_name,
# file_key=self.epc_cleaning_dataset_key,
# )
save_dataframe_to_s3_parquet(
df=pd.concat(self.compiled_cleaning_averages),
bucket_name=self.epc_bucket_name,
file_key=self.epc_cleaning_dataset_key,
)
def process_directory(self, directory: Path):
"""

View file

@@ -18,16 +18,16 @@ def main():
epc_pipeline.run()
# For testing
dataset_df = epc_pipeline.compiled_dataset
dataset_df.to_parquet("refactor_datasets/dataset.parquet")
pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet")
pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet")
# dataset_df = epc_pipeline.compiled_dataset
# dataset_df.to_parquet("refactor_datasets/dataset.parquet")
# pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet")
# pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet")
from utils.s3 import read_dataframe_from_s3_parquet
dataset = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_test.parquet",
)
# from utils.s3 import read_dataframe_from_s3_parquet
# dataset = read_dataframe_from_s3_parquet(
# bucket_name="retrofit-data-dev",
# file_key="sap_change_model/dataset_test.parquet",
# )
if __name__ == "__main__":