import os
import tempfile
from enum import Enum
from typing import Optional

import boto3
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
from datatypes.datatypes import OpenUprnCoordinateData

logger = setup_logger()


def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    A shapefile is made up of several sibling files (.shp, .shx, .dbf, ...), so every object stored in
    the same S3 "folder" as the requested key is downloaded into a temporary directory before the
    requested file is opened.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The S3 key of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: If S3 lists no objects under the folder of s3_file_key
    """
    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")

    # Terminate the prefix with "/" so that only this folder is listed; a bare "a/b" prefix would
    # also match sibling folders such as "a/b_old/".
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    # The temporary directory (and everything downloaded into it) is removed when the block exits
    with tempfile.TemporaryDirectory() as tmpdirname:
        s3_client = boto3.client('s3')

        logger.info("Creating temporary directory at %s" % tmpdirname)

        # List all files in the given S3 folder. The 'Contents' key is absent when nothing matches.
        # NOTE: a single list_objects call returns at most 1000 keys.
        response = s3_client.list_objects(Bucket=bucket_name, Prefix=prefix)
        s3_objects = response.get('Contents', [])
        if not s3_objects:
            raise FileNotFoundError(
                "No objects found in bucket '%s' under prefix '%s'" % (bucket_name, prefix)
            )

        # Download each file to the temporary directory
        for s3_object in s3_objects:
            file_key = s3_object['Key']
            file_name = os.path.basename(file_key)
            if not file_name:
                # Keys ending in "/" are folder placeholders and have no file content to download
                continue

            local_file_path = os.path.join(tmpdirname, file_name)
            with open(local_file_path, 'wb') as tmpfile:
                s3_client.download_fileobj(bucket_name, file_key, tmpfile)

        # Read the shapefile from the temporary directory into a GeoDataFrame
        shapefile_path = os.path.join(tmpdirname, shape_file_key)
        gdf = gpd.read_file(shapefile_path)

    return gdf


class ConservationAreaClient:
    """
    Class to interact with and manipulate conservation area data.

    The Historic England data can be found at the following location:
    https://opendata-historicengland.hub.arcgis.com/datasets/historicengland::conservation-areas/about

    We also use a separate government conservation area dataset which can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area
    """

    # Sources supported by calculate_distance_to_nearest_conservation_area
    SOURCES = ["historic_england"]

    # Possible conservation area statuses
    IN_CONSERVATION_AREA = True
    NOT_IN_CONSERVATION_AREA = False
    UNKNOWN = None

    def __init__(self, historic_england_path, gov_path, bucket):
        """
        :param historic_england_path: S3 key of the Historic England shapefile
        :param gov_path: S3 key of the government conservation area geojson
        :param bucket: Name of the S3 bucket holding both datasets
        """
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
        self.bucket = bucket

        # Populated by read()
        self.historic_england_data = None
        self.gov_data = None

    def read(self):
        """
        Read both datasets from S3 into GeoDataFrames
        """
        logger.info("Reading in historic england conservation area shapefile")
        self.historic_england_data = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_path
        )

        logger.info("Reading in Govenment conservation area geojson")
        self.gov_data = gpd.read_file(
            read_io_from_s3(
                bucket_name=self.bucket, file_key=self.gov_path
            )
        )
        self.gov_data = self.gov_data.drop(columns=["dataset"])
        # Convert the gov data to british national grid co-ordinates
        self.gov_data = self.gov_data.to_crs("EPSG:27700")

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
        """
        Check if a property is in a conservation area.

        The Historic England data is checked first; the government data is only consulted when
        Historic England has no published data for the matched area.

        :param coordinates: object exposing X_COORDINATE, Y_COORDINATE, LONGITUDE and LATITUDE
        :return: IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
        :raises ValueError: If coordinates is falsy
        """
        if not coordinates:
            raise ValueError("Coordinates have not been set, run get_coordinates() first")

        status = self.is_in_conservation_area_historic_england(
            x_bng=coordinates.X_COORDINATE, y_bng=coordinates.Y_COORDINATE
        )

        # UNKNOWN is the None sentinel, so it must be tested by identity rather than against a string
        if status is not self.UNKNOWN:
            return status

        # We double check the secondary data source
        backup = self.is_in_conservation_area_historic_gov(
            longitude=coordinates.LONGITUDE, latitude=coordinates.LATITUDE
        )
        if backup:
            return self.IN_CONSERVATION_AREA
        return self.UNKNOWN

    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Classify many properties at once using spatial joins.

        A 'conservation_status' column is added to uprn_gdf in place, and the same frame is returned.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column. Presumably point geometries in the same
            coordinate system as the conservation area data - confirm against the caller.
        :return: uprn_gdf with the 'conservation_status' column set
        """
        joined_gdf_he = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")

        # Identify where we have definitive information (not "unknown")
        in_conservation_he = ~joined_gdf_he.index_right.isna() & (
            joined_gdf_he["NAME"] != "No data available for publication by HE"
        )
        uprn_in_conservation_he = joined_gdf_he[in_conservation_he]["UPRN"].unique()
        # The right index will be missing when we don't have a match so the uprn is not in a conservation
        # area
        uprn_not_in_conservation_he = joined_gdf_he.loc[
            ~joined_gdf_he["UPRN"].isin(uprn_in_conservation_he) & joined_gdf_he.index_right.isna(), "UPRN"
        ].unique()

        # For unknowns, check against government data
        # NOTE: this selects every UPRN without a positive Historic England match, so it also covers
        # the UPRNs classed as "not in" above; a government match overrides that status below.
        unknown_uprns = uprn_gdf.loc[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]["UPRN"]
        unknown_gdf = uprn_gdf[uprn_gdf["UPRN"].isin(unknown_uprns)]

        joined_gdf_gov = gpd.sjoin(unknown_gdf, self.gov_data, how="left", predicate="within")
        uprn_in_conservation_gov = joined_gdf_gov.loc[~joined_gdf_gov.index_right.isna(), "UPRN"].unique()

        # Start from UNKNOWN, then apply the statuses in increasing order of precedence
        uprn_gdf['conservation_status'] = self.UNKNOWN
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_he), 'conservation_status'
        ] = self.IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), 'conservation_status'
        ] = self.NOT_IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), 'conservation_status'
        ] = self.IN_CONSERVATION_AREA

        return uprn_gdf

    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> Optional[bool]:
        """
        Check if a property is in a conservation area using the Historic England data

        :param x_bng: x coordinate in british national grid coordinates
        :param y_bng: y coordinate in british national grid coordinates
        :return: IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA, or UNKNOWN when every area containing
            the point is a "No data available for publication by HE" placeholder
        """
        point = Point(x_bng, y_bng)
        within_areas = self.historic_england_data.contains(point)

        if not within_areas.any():
            return self.NOT_IN_CONSERVATION_AREA

        names = self.historic_england_data.loc[within_areas, "NAME"]
        # We want to deduce if we actually have data on this area
        if all(names.values == "No data available for publication by HE"):
            return self.UNKNOWN
        return self.IN_CONSERVATION_AREA

    def is_in_conservation_area_historic_gov(self, longitude: float, latitude: float) -> bool:
        """
        Check if a property is in a conservation area using the government data

        :param longitude: longitude coordinate (assumed to be WGS84 / EPSG:4326 - confirm with caller)
        :param latitude: latitude coordinate (assumed to be WGS84 / EPSG:4326 - confirm with caller)
        :return: True if any government conservation area contains the point
        """
        point = Point(longitude, latitude)

        # read() reprojects the government data to british national grid, so the longitude/latitude
        # point has to be moved into the data's coordinate system before testing for containment.
        data_crs = self.gov_data.crs
        if data_crs is not None:
            point = gpd.GeoSeries([point], crs="EPSG:4326").to_crs(data_crs).iloc[0]

        return bool(self.gov_data.contains(point).any())

    def calculate_distance_to_nearest_conservation_area(self, x: float, y: float, source: str) -> Optional[float]:
        """
        Calculate the distance from a point to the nearest conservation area of the given source.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param source: Dataset to measure against; only "historic_england" is handled.
        :return: The distance to the nearest area, or None when the source is not handled.
        """
        if source == "historic_england":
            return self._distance_to_nearest_conservation_area_historic_england(x, y, self.historic_england_data)
        # Any other source is not supported
        return None

    @staticmethod
    def _distance_to_nearest_conservation_area_historic_england(
        x: float, y: float, conservation_areas: gpd.GeoDataFrame
    ) -> float:
        """
        Calculate the distance from a given point to the nearest conservation area.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param conservation_areas: A GeoDataFrame containing the conservation areas polygons.
        :return: The distance in the same units as the coordinate system of the conservation areas.
        """
        # Convert the point coordinates to a Shapely Point object
        point_geom = Point(x, y)

        # Calculate the distance between the point and the conservation areas
        distances = conservation_areas.geometry.distance(point_geom)

        # Find the minimum distance. Since the data uses british national grid, the units are meters.
        distance_meters = distances.min()

        return distance_meters


class InConservationArea(Enum):
    """
    Enum view of the conservation area statuses defined on ConservationAreaClient
    """
    IN_CONSERVATION_AREA = ConservationAreaClient.IN_CONSERVATION_AREA
    NOT_IN_CONSERVATION_AREA = ConservationAreaClient.NOT_IN_CONSERVATION_AREA
    UNKNOWN = ConservationAreaClient.UNKNOWN