mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
fixed concat issue
This commit is contained in:
parent
255bfc182d
commit
df33ff93c3
3 changed files with 26 additions and 26 deletions
|
|
@ -135,6 +135,7 @@ class EPCDataProcessor:
|
||||||
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
|
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
|
||||||
|
|
||||||
self.make_cleaning_averages(ignore_step=ignore_step)
|
self.make_cleaning_averages(ignore_step=ignore_step)
|
||||||
|
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
|
||||||
|
|
||||||
# TODO: check if this has impact on training dataset
|
# TODO: check if this has impact on training dataset
|
||||||
cleaned_data = self.apply_averages_cleaning(
|
cleaned_data = self.apply_averages_cleaning(
|
||||||
|
|
@ -152,7 +153,6 @@ class EPCDataProcessor:
|
||||||
|
|
||||||
self.data = self.data if cleaned_data is None else cleaned_data
|
self.data = self.data if cleaned_data is None else cleaned_data
|
||||||
|
|
||||||
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
|
|
||||||
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
||||||
self.cast_data_columns_to_lower()
|
self.cast_data_columns_to_lower()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -123,7 +123,7 @@ class EPCPipeline:
|
||||||
|
|
||||||
data = self.epc_data_processor.data
|
data = self.epc_data_processor.data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
epc_records = [EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient='records')]
|
epc_records = [EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient='records')]
|
||||||
|
|
||||||
|
|
@ -137,23 +137,23 @@ class EPCPipeline:
|
||||||
for directory in tqdm(self.directories):
|
for directory in tqdm(self.directories):
|
||||||
self.process_directory(directory)
|
self.process_directory(directory)
|
||||||
|
|
||||||
# save_dataframe_to_s3_parquet(
|
save_dataframe_to_s3_parquet(
|
||||||
# df=self.compiled_dataset,
|
df=self.compiled_dataset,
|
||||||
# bucket_name=self.epc_bucket_name,
|
bucket_name=self.epc_bucket_name,
|
||||||
# file_key=self.epc_compiled_dataset_key,
|
file_key=self.epc_compiled_dataset_key,
|
||||||
# )
|
)
|
||||||
|
|
||||||
# save_dataframe_to_s3_parquet(
|
save_dataframe_to_s3_parquet(
|
||||||
# df=pd.concat(self.compiled_all_equal_rows),
|
df=pd.DataFrame(self.compiled_all_equal_rows),
|
||||||
# bucket_name=self.epc_bucket_name,
|
bucket_name=self.epc_bucket_name,
|
||||||
# file_key=self.epc_all_equal_rows_key,
|
file_key=self.epc_all_equal_rows_key,
|
||||||
# )
|
)
|
||||||
|
|
||||||
# save_dataframe_to_s3_parquet(
|
save_dataframe_to_s3_parquet(
|
||||||
# df=pd.concat(self.compiled_cleaning_averages),
|
df=pd.concat(self.compiled_cleaning_averages),
|
||||||
# bucket_name=self.epc_bucket_name,
|
bucket_name=self.epc_bucket_name,
|
||||||
# file_key=self.epc_cleaning_dataset_key,
|
file_key=self.epc_cleaning_dataset_key,
|
||||||
# )
|
)
|
||||||
|
|
||||||
def process_directory(self, directory: Path):
|
def process_directory(self, directory: Path):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -18,16 +18,16 @@ def main():
|
||||||
epc_pipeline.run()
|
epc_pipeline.run()
|
||||||
|
|
||||||
# For testing
|
# For testing
|
||||||
dataset_df = epc_pipeline.compiled_dataset
|
# dataset_df = epc_pipeline.compiled_dataset
|
||||||
dataset_df.to_parquet("refactor_datasets/dataset.parquet")
|
# dataset_df.to_parquet("refactor_datasets/dataset.parquet")
|
||||||
pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet")
|
# pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet")
|
||||||
pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet")
|
# pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet")
|
||||||
|
|
||||||
from utils.s3 import read_dataframe_from_s3_parquet
|
# from utils.s3 import read_dataframe_from_s3_parquet
|
||||||
dataset = read_dataframe_from_s3_parquet(
|
# dataset = read_dataframe_from_s3_parquet(
|
||||||
bucket_name="retrofit-data-dev",
|
# bucket_name="retrofit-data-dev",
|
||||||
file_key="sap_change_model/dataset_test.parquet",
|
# file_key="sap_change_model/dataset_test.parquet",
|
||||||
)
|
# )
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue