"""Download EPC records for a fixed set of constituencies and run the modelling steps on them."""
import itertools
import os

from tqdm import tqdm

from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.analysis.SapModel import SapModel

# Directory (next to this file) that holds the local Land Registry CSV files.
# Built by plain concatenation so the resulting strings stay exactly as before.
_LOCAL_DATA_DIR = os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data"

# File names of the Land Registry extracts, newest first.
_LAND_REGISTRY_FILES = (
    "pp-monthly-update-new-version.csv",
    "pp-2022 (1).csv",
    "pp-2021.csv",
    "pp-2020.csv",
    "pp-2019.csv",
    "pp-2018.csv",
    "pp-2017-part1.csv",
    "pp-2017-part2.csv",
)

# Absolute paths to every Land Registry CSV listed above.
LAND_REGISTRY_PATHS = [_LOCAL_DATA_DIR + "/" + name for name in _LAND_REGISTRY_FILES]


def _download_epc_records(client, constituencies, property_types, floor_areas):
    """Download EPC records for every constituency / property type / floor-area combination.

    :param client: EPC API client handed to ``pagenated_epc_download``.
    :param constituencies: Constituency codes to query; one progress-bar tick each.
    :param property_types: Values sent as the ``property-type`` search parameter.
    :param floor_areas: Values sent as the ``floor-area`` search parameter.
    :return: A single list holding the results of all downloads, in query order.
    """
    records = []
    for constituency in tqdm(constituencies):
        # product() walks the pairs in the same order as two nested loops would.
        for property_type, floor_area in itertools.product(property_types, floor_areas):
            records.extend(
                pagenated_epc_download(
                    client=client,
                    params={
                        "constituency": constituency,
                        "property-type": property_type,
                        # Only request EPCs from August 2014 onwards (see app()).
                        "from-month": 8,
                        "from-year": 2014,
                        "floor-area": floor_area,
                    },
                    page_size=5000,
                    n_pages=10,
                )
            )
    return records


def app() -> None:
    """Download EPC data, clean it and run the U-value and SAP model steps.

    For a pre-defined list of constituencies and property types, we download EPC
    data from the API and produce a dataset of cleaned fields so that when we get
    new properties, we can quickly sanitise any description data.

    :return: None. Results are not persisted yet; see the TODOs below.
    """
    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    # A tuple rather than a set: iterating a set of strings can change order from
    # run to run (hash randomisation), which made the download order and the
    # order of the collected records non-reproducible.
    constituencies = (
        "E14000555",
        "E14000726",
        "E14000720",
        "E14000721",
        "E14000553",
        "E14000752",
    )
    property_types = ["bungalow", "flat", "house", "maisonette", "park home"]
    floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"]

    # We pull properties per constituency, by property type and floor-area band,
    # so that each query stays within the paging limits below.
    # NOTE(review): this comment used to say "up to 10k properties" per
    # combination; check that against page_size=5000 / n_pages=10.
    #
    # For particularly old EPC data we have inconsistent records, so we only
    # request EPCs from August 2014 onwards: SAP09 was introduced in 2009 and
    # later SAP12 was introduced in England and Wales from 31 July 2014.
    data = _download_epc_records(
        client=epc_client,
        constituencies=constituencies,
        property_types=property_types,
        floor_areas=floor_areas,
    )

    # Production of sample data for land registry (disabled, kept for reference)
    # address_meta = [
    #     {
    #         "postcode": x["postcode"].upper(),
    #         "address1": x["address1"].upper(),
    #         "address2": x["address2"].upper(),
    #         "address3": x["address3"].upper(),
    #         "address": x["address"],
    #         "uprn": x["uprn"]
    #     } for x in data
    # ]
    #
    # import pickle
    # with open("sample_addresses.pkl", "wb") as f:
    #     pickle.dump(address_meta, f)

    # Incorporate input data into cleaning
    cleaner = EpcClean(data)

    # Not used yet; the attribute access is kept in case it computes lazily.
    lighting_averages = cleaner.lighting_averages
    # TODO: We need to store lighting_averages to a db
    # We should also extend these averages so they're by more variables
    # (property type, age band, constituency, etc)

    cleaner.clean()
    # TODO: cleaner.cleaned datasets to a db

    # TODO: Add property age band into this
    uvalue_estimates = UvalueEstimations(data=data)
    uvalue_estimates.get_estimates(cleaner=cleaner)
    # TODO: Store these to a db

    sap_model = SapModel(data=data, cleaner=cleaner)
    sap_model.run()
    # TODO: Store outputs to db