import os

import pandas as pd
from tqdm import tqdm

from model_data.temp_inputs import input_data
from model_data.Property import Property
from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.OpenUprnClient import OpenUprnClient


def handler():
    """Gather the datasets used for modelling the input properties.

    The visible steps, in order:

    1. Build a ``Property`` for every entry in ``input_data`` and run its EPC
       address search.
    2. Read the OS Open UPRN file for the properties' UPRNs and ask each
       property to fetch its coordinates from it.
    3. Download EPC records for every distinct local authority of the input
       properties and run ``EpcClean`` over them.
    4. Load ``data.pkl`` and derive postcode / address metadata from it.
    5. Set up the Land Registry client and read the borehole DBF table.

    All local files are resolved relative to the directory containing this
    module.
    """
    # Directory of this file; every local data path below is anchored here.
    base_dir = os.path.abspath(os.path.dirname(__file__))

    # To begin with, the input data is a list of dictionaries, however we would read this file in
    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    input_properties = [
        Property(
            postcode=entry['postcode'],
            address1=entry['address1'],
            epc_client=epc_client,
        )
        for entry in input_data
    ]

    for p in input_properties:
        p.search_address_epc()

    uprns = [prop.data['uprn'] for prop in input_properties]

    open_uprn_path = base_dir + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv"
    open_uprn_client = OpenUprnClient(path=open_uprn_path, uprns=uprns)
    open_uprn_client.read()

    for p in input_properties:
        p.get_coordinates(open_uprn_client)

    # One EPC bulk download per distinct local authority.
    local_authorities = {prop.data['local-authority'] for prop in input_properties}

    data = []
    for la in tqdm(local_authorities):
        authority_records = pagenated_epc_download(
            client=epc_client,
            params={"local-authority": la},
            page_size=5000,
            n_pages=10,
        )
        data.extend(authority_records)

    cleaner = EpcClean(data)
    cleaner.clean()

    import pickle

    # NOTE(review): this rebinds `data`, so the records downloaded above are no
    # longer reachable through this name - confirm that is intended (it looks
    # like a development shortcut). pickle must only be used on trusted files.
    with open(base_dir + "/data.pkl", "rb") as pickle_file:
        data = pickle.load(pickle_file)

    postcodes = [record["postcode"].upper() for record in data]

    address_meta = [
        {
            "postcode": record["postcode"].upper(),
            "address1": record["address1"].upper(),
            "address2": record["address2"].upper(),
            "address3": record["address3"].upper(),
            "address": record["address"],
            "uprn": record["uprn"],
        }
        for record in data
    ]

    # For testing:
    # from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
    # from collections import Counter
    # count = Counter([x["main-fuel"] for x in data])
    # descriptions = {x["hotwater-description"] for x in data}
    # out = []
    # for description in descriptions:
    #     res = HotWaterAttributes(description).process()
    #     out.append(
    #         {
    #             "original_description": description,
    #             **res
    #         }
    #     )
    # df = pd.DataFrame(out)
    # df = df.sort_values("original_description")
    # df = df.reset_index(drop=True)
    #
    # import numpy as np
    # idx = 1
    # record = df[df.index == idx].to_dict("records")[0]
    # record = {k: v for k, v in record.items() if v not in [None, np.nan]}
    # from pprint import pprint
    # pprint(record)
    #
    # # Issues:
    # # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact
    # #    this is a time and temperature zone control, and we only pick up temperature zone
    # #    control at the moment. Can we capture this too?
    # # 2) 'Charging system linked to use of community heating, programmer and at least two room
    # #    stats' - what are room stats and how should we capture this?
    #
    # df.to_dict("records")

    # Land registry
    from model_data.LandRegistryClient import LandRegistryClient

    # Price-paid files, listed in the order they are handed to the client.
    land_registry_suffixes = (
        "/model_data/local_data/pp-monthly-update-new-version.csv",
        "/model_data/local_data/pp-2022 (1).csv",
        "/model_data/local_data/pp-2021.csv",
        "/model_data/local_data/pp-2020.csv",
        "/model_data/local_data/pp-2019.csv",
        "/model_data/local_data/pp-2018.csv",
        "/model_data/local_data/pp-2017-part1.csv",
        "/model_data/local_data/pp-2017-part2.csv",
    )
    land_registry_client = LandRegistryClient(
        paths=[base_dir + suffix for suffix in land_registry_suffixes],
        addresses=address_meta,
    )

    from dbfread import DBF

    borehole_file = base_dir + "/model_data/local_data/borehole/borehole.dbf"
    table = DBF(borehole_file)
    # Materialise every record of the table, with a progress bar.
    borehole_data = list(tqdm(table, total=len(table)))

    # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
    # entries in here if possible before we produce any form of comparison between our properties,
    # to infer the distance from the property to the nearest borehole

    # Let's take a sample
    borehole_sample = borehole_data[:1000]
    df = pd.DataFrame(borehole_sample)

    # for each property, find the nearest borehole

    # Data dictionary: This description is based on the information presented in the following
    # Geological articles:
    # https://nora.nerc.ac.uk/id/eprint/509366/1/IR04115.pdf
    # https://shop.bgs.ac.uk/Resources/Shop/doc/info/Borehole_Abbreviations.pdf?_ga=2.246788941.895115819.1686912089-542796874.1686912089
    # https://core.ac.uk/download/63732.pdf
    #
    #
    # QS - Borehole identifier information
    # NUMB - Borehole identifier information
    # BSUFF - Borehole identifier information
    # REGNO
    # RT - Borehole identifier information
    # GRID_REFER
    # EASTING - British National Grid coordinates
    # NORTHING - British National Grid coordinates
    # X - British National Grid coordinates - same as EASTING but has a float typing
    # Y - British National Grid coordinates - same as NORTHING but has a float typing
    # CONFIDENTI
    # STRTHEIGHT
    # NAME
    # LENGTH
    # BGS_ID
    # DATE_KNOWN
    # DATE_K_TYP
    # DATE_ENTER
    # AGS_LOG_UR