From 4f02b86efb6cac069e887474415d501da3ae2ee6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Sep 2023 11:33:27 +0100 Subject: [PATCH] renamed app to cleaner_app and added in creation of cleaned_data --- model_data/{app.py => cleaner_app.py} | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) rename model_data/{app.py => cleaner_app.py} (80%) diff --git a/model_data/app.py b/model_data/cleaner_app.py similarity index 80% rename from model_data/app.py rename to model_data/cleaner_app.py index 45c4a4f0..9656557b 100644 --- a/model_data/app.py +++ b/model_data/cleaner_app.py @@ -32,8 +32,13 @@ def app(): :return: """ + cleaned_data = {} epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] for directory in tqdm(epc_directories): + directory_destructured = str(directory).split("/")[-1].split("-") + gss_code = directory_destructured[1] + local_authority = directory_destructured[2] + data = pd.read_csv(directory / "certificates.csv", low_memory=False) # Rename the columns to the same format as the api returns data.columns = [c.replace("_", "-").lower() for c in data.columns] @@ -45,17 +50,16 @@ def app(): # Incorporate input data into cleaning cleaner = EpcClean(data) - lighting_averages = cleaner.lighting_averages - # - # TODO: All of these outputs can be stored by constituency so we can reduce the amount - # of data we fetch - # - # TODO: WE need to store lighting_averages to a s3 - # We should also extend these averages so they're by more variables (property type, age band, - # constituency, - # etc) + cleaner.clean() - # TODO: cleaner.cleaned datasets to s3 + # Extended cleaned_data + for k, data in cleaner.cleaned.items(): + if k not in cleaned_data: + cleaned_data[k] = data + else: + existing_descriptions = [x["original_description"] for x in cleaned_data[k]] + new_data = [x for x in data if x["original_description"] not in existing_descriptions] + cleaned_data[k].extend(new_data) # TODO: Add property age band into this # uvalue_estimates = UvalueEstimations(data=data)