Created a small dataset of UPRN and directory name to keep track of all-equal rows

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-18 10:14:52 +11:00
parent d5b2ff9c36
commit f0db6b69df

View file

@@ -411,8 +411,8 @@ def app():
dataset = []
cleaning_dataset = []
# Keep track of the number of all equals
all_equal_count = 0
# Keep track of the all equals
all_equal_rows = []
for directory in tqdm(directories):
@@ -521,8 +521,8 @@ def app():
)
if all_equal:
# Keep track of this for the moment
all_equal_count += 1
# Keep track of this for the moment so we can analyse
all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
continue
features = pd.concat([starting_record, ending_record])
@@ -622,7 +622,7 @@ def app():
save_dataframe_to_s3_parquet(
df=output,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_test.parquet",
file_key="sap_change_model/dataset.parquet",
)