import os
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE

# Absolute directory containing this module; the land registry files are
# addressed relative to it.
_MODULE_DIR = os.path.abspath(os.path.dirname(__file__))

# Absolute paths (as plain strings) of the local land registry price-paid CSVs.
LAND_REGISTRY_PATHS = [
    _MODULE_DIR + relative_path
    for relative_path in (
        "/model_data/local_data/pp-monthly-update-new-version.csv",
        "/model_data/local_data/pp-2022 (1).csv",
        "/model_data/local_data/pp-2021.csv",
        "/model_data/local_data/pp-2020.csv",
        "/model_data/local_data/pp-2019.csv",
        "/model_data/local_data/pp-2018.csv",
        "/model_data/local_data/pp-2017-part1.csv",
        "/model_data/local_data/pp-2017-part2.csv",
    )
]

# Folder holding one sub-directory per EPC bulk export, each expected to
# contain a "certificates.csv" file.
EPC_DIRECTORY = Path(__file__).parent.joinpath(
    "model_data", "simulation_system", "data", "all-domestic-certificates"
)


def app():
    """
    Run the EPC cleaning step over every sub-directory of EPC_DIRECTORY.

    For each sub-directory, "certificates.csv" is loaded, its column names are
    converted to the lower-case, hyphenated form used by the EPC API, rows
    lodged before EARLIEST_EPC_DATE are dropped, and the remaining rows are
    handed to EpcClean as a list of dictionaries before calling its clean()
    method.

    Nothing is persisted yet - see the TODO notes in the body.

    :return: None
    """
    # Start with a cleaner that holds no records; it is replaced by a freshly
    # constructed one for each directory processed below.
    cleaner = EpcClean([])

    certificate_dirs = [path for path in EPC_DIRECTORY.iterdir() if path.is_dir()]
    for certificate_dir in tqdm(certificate_dirs):
        frame = pd.read_csv(certificate_dir / "certificates.csv", low_memory=False)

        # Bring the column names in line with the format the API returns
        frame.columns = [column.replace("_", "-").lower() for column in frame.columns]

        # Keep only certificates lodged on or after the date threshold
        recent_mask = frame["lodgement-date"] >= EARLIEST_EPC_DATE
        recent = frame[recent_mask]

        # Convert to a list of dictionaries, as returned by the API
        records = recent.to_dict("records")

        # Feed this directory's records into the cleaner
        cleaner = EpcClean(records)

        # Not used yet (see TODOs); the attribute read is kept in case it
        # triggers computation inside EpcClean - TODO confirm.
        lighting_averages = cleaner.lighting_averages
        # TODO: All of these outputs can be stored by constituency so we can
        #  reduce the amount of data we fetch
        # TODO: We need to store lighting_averages to s3. We should also extend
        #  these averages so they're by more variables (property type, age
        #  band, constituency, etc)

        cleaner.clean()
        # TODO: cleaner.cleaned datasets to s3

        # TODO: Add property age band into this
        # uvalue_estimates = UvalueEstimations(data=data)
        # uvalue_estimates.get_estimates(cleaner=cleaner)
        #
        # # TODO: Store these to a s3
        # uvalue_estimates.walls
        # uvalue_estimates.floors
        # uvalue_estimates.roofs