mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
renamed app to cleaner_app and added in creation of cleaned_data
This commit is contained in:
parent
1a442b2ec5
commit
4f02b86efb
1 changed file with 14 additions and 10 deletions
|
|
@ -32,8 +32,13 @@ def app():
|
|||
:return:
|
||||
"""
|
||||
|
||||
cleaned_data = {}
|
||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
for directory in tqdm(epc_directories):
|
||||
directory_destructured = str(directory).split("/")[-1].split("-")
|
||||
gss_code = directory_destructured[1]
|
||||
local_authority = directory_destructured[2]
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
# Rename the columns to match the format the API returns
|
||||
data.columns = [c.replace("_", "-").lower() for c in data.columns]
|
||||
|
|
@ -45,17 +50,16 @@ def app():
|
|||
|
||||
# Incorporate input data into cleaning
|
||||
cleaner = EpcClean(data)
|
||||
lighting_averages = cleaner.lighting_averages
|
||||
#
|
||||
# TODO: All of these outputs can be stored by constituency so we can reduce the amount
|
||||
# of data we fetch
|
||||
#
|
||||
# TODO: We need to store lighting_averages in S3
|
||||
# We should also extend these averages so they're broken down by more variables (property type, age band,
|
||||
# constituency,
|
||||
# etc)
|
||||
|
||||
cleaner.clean()
|
||||
# TODO: Store the cleaner.cleaned datasets in S3
|
||||
# Extend cleaned_data with entries whose original_description is not already present
|
||||
for k, data in cleaner.cleaned.items():
|
||||
if k not in cleaned_data:
|
||||
cleaned_data[k] = data
|
||||
else:
|
||||
existing_descriptions = [x["original_description"] for x in cleaned_data[k]]
|
||||
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
||||
cleaned_data[k].extend(new_data)
|
||||
|
||||
# TODO: Add property age band into this
|
||||
# uvalue_estimates = UvalueEstimations(data=data)
|
||||
Loading…
Add table
Reference in a new issue