From b75fedf3ac79528d9e1001c6a9018856d89f6707 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 19 Jun 2023 09:28:13 +0100 Subject: [PATCH] tidying up app script --- model_data/BoreholeClient.py | 56 +++++++++++++ model_data/app.py | 157 +++++------------------------------ 2 files changed, 79 insertions(+), 134 deletions(-) create mode 100644 model_data/BoreholeClient.py diff --git a/model_data/BoreholeClient.py b/model_data/BoreholeClient.py new file mode 100644 index 00000000..39e281a6 --- /dev/null +++ b/model_data/BoreholeClient.py @@ -0,0 +1,56 @@ +import math +from tqdm import tqdm +from dbfread import DBF +from utils import setup_logger + +logger = setup_logger() + + +class BoreholeClient: + """ + Data dictionary: This description is based on the information presented in the following + Geological articles: + https://nora.nerc.ac.uk/id/eprint/509366/1/IR04115.pdf + https://shop.bgs.ac.uk/Resources/Shop/doc/info/Borehole_Abbreviations.pdf?_ga=2.246788941.895115819.1686912089 + -542796874.1686912089 + https://core.ac.uk/download/63732.pdf + + + QS - Borehole identifier information + NUMB - Borehole identifier information + BSUFF - Borehole identifier information + REGNO + RT - Borehole identifier information + GRID_REFER + EASTING - British National Grid coordinates + NORTHING - British National Grid coordinates + X - British National Grid coordinates - same as EASTING but has a float typing + Y - British National Grid coordinates - same as NORTHING but has a float typing + CONFIDENTI + STRTHEIGHT + NAME + LENGTH + BGS_ID + DATE_KNOWN + DATE_K_TYP + DATE_ENTER + AGS_LOG_UR + """ + + def __init__(self, path): + self.path: str = path + self.data = None + + def read(self): + logger.info("Reading in borehole data") + table = DBF(self.path) + borehole_data = [x for x in tqdm(table, total=len(table))] + + self.data = borehole_data + + @staticmethod + def distance_between_bng_coords(x1_bng, y1_bng, x2_bng, y2_bng): + # Calculate the Euclidean distance between the points + distance_m = math.sqrt((x2_bng - x1_bng) ** 2 + (y2_bng - y1_bng) ** 2) + distance_km = distance_m / 1000 # convert meters to kilometers + return distance_m, distance_km diff --git a/model_data/app.py b/model_data/app.py index 8c5fc488..26584123 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -1,6 +1,8 @@ import pandas as pd from tqdm import tqdm import os +from BoreholeClient import BoreholeClient +from model_data.LandRegistryClient import LandRegistryClient from model_data.temp_inputs import input_data from model_data.Property import Property @@ -10,6 +12,17 @@ from model_data.downloader import pagenated_epc_download from model_data.EpcClean import EpcClean from model_data.OpenUprnClient import OpenUprnClient +LAND_REGISTRY_PATHS = [ + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv", + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv", +] + def handler(): # To begin with, the input data is a list of dictionaries, however we would read this file in @@ -54,11 +67,6 @@ def handler(): cleaner.clean() - import pickle - with open(os.path.abspath(os.path.dirname(__file__)) + "/data.pkl", "rb") as f: - data = pickle.load(f) - - postcodes = [x["postcode"].upper() for x in data] address_meta = [ { "postcode": x["postcode"].upper(), @@ -70,151 +78,32 @@ def handler(): } for x in data ] - # For testing: - # from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes - # from collections import Counter - # count = Counter([x["main-fuel"] for x in data]) - # descriptions = {x["hotwater-description"] for x in data} - # out = [] - # for description in descriptions: - # res = HotWaterAttributes(description).process() - # out.append( - # { - # "original_description": description, - # **res - # } - # ) - # df = pd.DataFrame(out) - # df = df.sort_values("original_description") - # df = df.reset_index(drop=True) - # - # import numpy as np - # idx = 1 - # record = df[df.index == idx].to_dict("records")[0] - # record = {k: v for k, v in record.items() if v not in [None, np.nan]} - # from pprint import pprint - # pprint(record) - # - # # Issues: - # # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and - # # temperature zone control - # # and we only pick up temperature zone control at the moment. Can we capture this too - # # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room - # # stats and how should - # # we capture this? - # - # df.to_dict("records") - # Land registry - from model_data.LandRegistryClient import LandRegistryClient - land_registry_client = LandRegistryClient( - paths=[ - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv", - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv", - ], + paths=LAND_REGISTRY_PATHS, addresses=address_meta ) + land_registry_client.read() - from dbfread import DBF - - borehole_file = os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/borehole/borehole.dbf" - table = DBF(borehole_file) - borehole_data = [x for x in tqdm(table, total=len(table))] + # Borehole + borehole_client = BoreholeClient( + path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/borehole/borehole.dbf" + ) + borehole_client.read() # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of # entries in here if possible before we produce any form of comparison between our properties, to infer # the distance from the property to the nearest borehole # Let's take a sample - borehole_sample = borehole_data[:1000] - df = pd.DataFrame(borehole_sample) + borehold_compare_to = borehole_client.data[0] + property = input_properties[0] # for each property, find the nearest borehole - # Data dictionary: This description is based on the information presented in the following - # Geological articles: - # https://nora.nerc.ac.uk/id/eprint/509366/1/IR04115.pdf - # https://shop.bgs.ac.uk/Resources/Shop/doc/info/Borehole_Abbreviations.pdf?_ga=2.246788941.895115819.1686912089 - # -542796874.1686912089 - # https://core.ac.uk/download/63732.pdf - # - # - # QS - Borehole identifier information - # NUMB - Borehole identifier information - # BSUFF - Borehole identifier information - # REGNO - # RT - Borehole identifier information - # GRID_REFER - # EASTING - British National Grid coordinates - # NORTHING - British National Grid coordinates - # X - British National Grid coordinates - same as EASTING but has a float typing - # Y - British National Grid coordinates - same as NORTHING but has a float typing - # CONFIDENTI - # STRTHEIGHT - # NAME - # LENGTH - # BGS_ID - # DATE_KNOWN - # DATE_K_TYP - # DATE_ENTER - # AGS_LOG_UR - - from pyproj import Proj, transform, Geod - - def distance_between_coords(longitude, latitude, x_bng, y_bng): - # Define the projections - wgs84 = Proj(init='epsg:4326') # WGS84 (longitude, latitude) - bng = Proj(init='epsg:27700') # British National Grid - - # Convert (longitude, latitude) to BNG coordinates - x, y = transform(wgs84, bng, longitude, latitude) - - # Define a geographic measure object - g = Geod(ellps='WGS84') - - # Calculate the distance between the points - # Note: Pyproj's 'Geod.inv' method returns azimuths and distance. - # We're only interested in distance here, so we only keep the third result - _, _, distance = g.inv(x, y, x_bng, y_bng) - - return distance - - def distance_between_bng_coords(x1_bng, y1_bng, x2_bng, y2_bng): - # Define a geographic measure object - g = Geod(ellps='airy') # Airy ellipsoid is used by the British National Grid - - # Calculate the distance between the points - # Note: Pyproj's 'Geod.inv' method returns azimuths and distance. - # We're only interested in distance here, so we only keep the third result - _, _, distance = g.inv(x1_bng, y1_bng, x2_bng, y2_bng) - - return distance - - import math - - import math - - def distance_between_bng_coords(x1_bng, y1_bng, x2_bng, y2_bng): - # Calculate the Euclidean distance between the points - distance_m = math.sqrt((x2_bng - x1_bng) ** 2 + (y2_bng - y1_bng) ** 2) - distance_km = distance_m / 1000 # convert meters to kilometers - return distance_m, distance_km - - property = input_properties[0] - - borehold_compare_to = borehole_data[0] - - dist_m, dist_km = distance_between_bng_coords( + dist_m, dist_km = borehole_client.distance_between_bng_coords( x1_bng=property.coordinates["x_coordinate"], y1_bng=property.coordinates["y_coordinate"], x2_bng=borehold_compare_to["X"], y2_bng=borehold_compare_to["Y"], ) - # ground source heat pump.