renamed app to cleaner_app and added in creation of cleaned_data

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-12 11:33:27 +01:00
parent 1a442b2ec5
commit 4f02b86efb

View file

@ -32,8 +32,13 @@ def app():
:return:
"""
cleaned_data = {}
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
for directory in tqdm(epc_directories):
directory_destructured = str(directory).split("/")[-1].split("-")
gss_code = directory_destructured[1]
local_authority = directory_destructured[2]
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to match the format the API returns
data.columns = [c.replace("_", "-").lower() for c in data.columns]
@ -45,17 +50,16 @@ def app():
# Incorporate input data into cleaning
cleaner = EpcClean(data)
lighting_averages = cleaner.lighting_averages
#
# TODO: All of these outputs can be stored by constituency so we can reduce the amount
# of data we fetch
#
# TODO: We need to store lighting_averages in S3.
# We should also extend these averages so they are broken down by more variables
# (property type, age band, constituency, etc.)
cleaner.clean()
# TODO: Store the cleaner.cleaned datasets in S3
# Extend cleaned_data, adding only entries whose original_description is not already present
for k, data in cleaner.cleaned.items():
if k not in cleaned_data:
cleaned_data[k] = data
else:
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
cleaned_data[k].extend(new_data)
# TODO: Add property age band into this
# uvalue_estimates = UvalueEstimations(data=data)