# Model/etl/spatial/ConservationAreaClient.py
#
# 234 lines
# 8.8 KiB
# Python

import os
import tempfile
from enum import Enum
from typing import Optional

import boto3
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from datatypes.datatypes import OpenUprnCoordinateData
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
# Module-level logger used by the S3 download helper and the client class below.
logger = setup_logger()
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    A shapefile is opened from a local path, so every object stored directly inside the
    same S3 "folder" as ``s3_file_key`` is first downloaded into a temporary directory
    (this brings the sibling files that sit next to the ``.shp`` along with it).

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The S3 key of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: If the shape file is not present in its S3 folder
    """
    s3_folder_key, _, shape_file_name = s3_file_key.rpartition("/")
    # The trailing slash stops the prefix from also matching sibling folders that merely
    # share a name prefix (e.g. "shapes" would otherwise match "shapes_old/...").
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    s3_client = boto3.client('s3')
    # A paginator is used because a single list call returns at most 1000 keys, and a page
    # with no matches simply has no "Contents" entry.
    paginator = s3_client.get_paginator("list_objects_v2")

    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Creating temporary directory at %s", tmpdirname)

        # Delimiter="/" limits the listing to direct children of the folder, so files in
        # nested folders cannot overwrite a sibling file that shares their base name.
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/"):
            for s3_object in page.get("Contents", []):
                file_key = s3_object["Key"]
                file_name = os.path.basename(file_key)
                if not file_name:
                    # Keys ending in "/" are folder placeholders; there is no file to write
                    continue
                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, "wb") as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)

        shapefile_path = os.path.join(tmpdirname, shape_file_name)
        if not os.path.isfile(shapefile_path):
            raise FileNotFoundError(
                f"Shapefile s3://{bucket_name}/{s3_file_key} was not found"
            )

        # The file has to be read while the temporary directory still exists
        return gpd.read_file(shapefile_path)
class ConservationAreaClient:
    """
    Class to interact with and manipulate conservation area data. The Historic England data
    can be found at the following location:
    https://opendata-historicengland.hub.arcgis.com/datasets/historicengland::conservation-areas/about
    We also use a separate government conservation area dataset which can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    Every lookup resolves to one of three states, exposed as class constants:
    IN_CONSERVATION_AREA (True), NOT_IN_CONSERVATION_AREA (False) or UNKNOWN (None).
    """

    SOURCES = ["historic_england"]
    IN_CONSERVATION_AREA = True
    NOT_IN_CONSERVATION_AREA = False
    UNKNOWN = None

    # NAME value carried by Historic England polygons for which no data is published;
    # a point that only falls inside such polygons has an unknown status.
    NO_DATA_NAME = "No data available for publication by HE"
    # CRS assumed for longitude/latitude inputs (the GeoJSON default) - TODO confirm
    # against the producer of the coordinates.
    LON_LAT_CRS = "EPSG:4326"

    def __init__(self, historic_england_path, gov_path, bucket):
        """
        :param historic_england_path: S3 key of the Historic England shapefile
        :param gov_path: S3 key of the government conservation area file
        :param bucket: Name of the S3 bucket holding both files
        """
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
        self.bucket = bucket
        # Both datasets stay None until read() has been called
        self.historic_england_data = None
        self.gov_data = None

    def read(self):
        """
        Load both datasets from S3 into ``historic_england_data`` and ``gov_data``.
        """
        logger.info("Reading in historic england conservation area shapefile")
        self.historic_england_data = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_path
        )

        logger.info("Reading in Govenment conservation area geojson")
        gov_data = gpd.read_file(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.gov_path
            )
        )
        gov_data = gov_data.drop(columns=["dataset"])
        # Convert the gov data to british national grid co-ordinates
        self.gov_data = gov_data.to_crs("EPSG:27700")

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData) -> Optional[bool]:
        """
        Check if a property is in a conservation area.

        Historic England data is consulted first; the government dataset is only used as a
        fallback when Historic England has no publishable data for the location.

        :param coordinates: object exposing X_COORDINATE, Y_COORDINATE, LONGITUDE and
            LATITUDE attributes (OpenUprnCoordinateData)
        :return: IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
        :raises ValueError: If no coordinates are supplied
        """
        if not coordinates:
            raise ValueError("Coordinates have not been set, run get_coordinates() first")

        status = self.is_in_conservation_area_historic_england(
            x_bng=coordinates.X_COORDINATE,
            y_bng=coordinates.Y_COORDINATE
        )
        # UNKNOWN is None, so it must be detected by identity. Comparing against the
        # string "unknown" never matches and would make the fallback below unreachable.
        if status is not self.UNKNOWN:
            return status

        # We double check the secondary data source
        in_gov_area = self.is_in_conservation_area_historic_gov(
            longitude=coordinates.LONGITUDE,
            latitude=coordinates.LATITUDE
        )
        return self.IN_CONSERVATION_AREA if in_gov_area else self.UNKNOWN

    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Classify every row of a GeoDataFrame in bulk using spatial joins.

        NOTE(review): unlike is_in_conservation_area, a match in the government data here
        also overrides a "not in" result from Historic England, because every UPRN that is
        not positively matched by Historic England is re-checked. Confirm which of the two
        behaviours is intended.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column; its geometry is presumably in
            the same CRS as the loaded datasets (verify against caller)
        :return: The same GeoDataFrame, modified in place with a "conservation_status"
            column holding IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
        """
        joined_he = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
        # index_right is missing for rows that fell inside no Historic England polygon
        matched_he = joined_he.index_right.notna()

        # Identify where we have definitive information (not "unknown")
        has_he_data = joined_he["NAME"] != self.NO_DATA_NAME
        uprn_in_conservation_he = joined_he.loc[matched_he & has_he_data, "UPRN"].unique()

        # No match at all means the uprn is not in a conservation area
        uprn_not_in_conservation_he = joined_he.loc[
            ~matched_he & ~joined_he["UPRN"].isin(uprn_in_conservation_he),
            "UPRN"
        ].unique()

        # Everything without a positive Historic England match is checked against the
        # government data
        remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]
        joined_gov = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
        uprn_in_conservation_gov = joined_gov.loc[joined_gov.index_right.notna(), "UPRN"].unique()

        # Assignment order matters: later assignments overwrite earlier ones
        uprn_gdf['conservation_status'] = self.UNKNOWN
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_he), 'conservation_status'
        ] = self.IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), 'conservation_status'
        ] = self.NOT_IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), 'conservation_status'
        ] = self.IN_CONSERVATION_AREA
        return uprn_gdf

    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> Optional[bool]:
        """
        Check if a property is in a conservation area according to Historic England.

        :param x_bng: x coordinate in british national grid coordinates
        :param y_bng: y coordinate in british national grid coordinates
        :return: NOT_IN_CONSERVATION_AREA when no polygon contains the point, UNKNOWN when
            every containing polygon is a "no data" placeholder, otherwise
            IN_CONSERVATION_AREA
        """
        point = Point(x_bng, y_bng)
        containing = self.historic_england_data.contains(point)
        if not containing.any():
            return self.NOT_IN_CONSERVATION_AREA

        # We want to deduce if we actually have data on this area
        names = self.historic_england_data.loc[containing, "NAME"]
        if (names == self.NO_DATA_NAME).all():
            return self.UNKNOWN
        return self.IN_CONSERVATION_AREA

    def is_in_conservation_area_historic_gov(self, longitude: float, latitude: float) -> bool:
        """
        Check if a property is in a conservation area according to the government data.

        :param longitude: longitude coordinate
        :param latitude: latitude coordinate
        :return: True if any government conservation area polygon contains the point
        """
        point = Point(longitude, latitude)
        gov_crs = self.gov_data.crs
        if gov_crs is not None:
            # read() reprojects the government data to british national grid, so the
            # longitude/latitude point has to be moved into that CRS before testing
            # containment; otherwise degrees are compared against metres.
            point = gpd.GeoSeries([point], crs=self.LON_LAT_CRS).to_crs(gov_crs).iloc[0]
        return bool(self.gov_data.contains(point).any())

    def calculate_distance_to_nearest_conservation_area(self, x: float, y: float, source: str) -> float:
        """
        Calculate the distance from a point to the nearest conservation area of a source.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param source: Name of the dataset to measure against, one of SOURCES.
        :return: The distance to the nearest conservation area.
        :raises ValueError: If the source is not supported.
        """
        if source == "historic_england":
            return self._distance_to_nearest_conservation_area_historic_england(x, y, self.historic_england_data)
        # Fail loudly instead of silently returning None from a float-returning method
        raise ValueError(f"Unsupported source {source!r}; expected one of {self.SOURCES}")

    @staticmethod
    def _distance_to_nearest_conservation_area_historic_england(
        x: float, y: float, conservation_areas: gpd.GeoDataFrame
    ) -> float:
        """
        Calculate the distance from a given point to the nearest conservation area.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param conservation_areas: A GeoDataFrame containing the conservation areas polygons.
        :return: The distance in the same units as the coordinate system of the conservation areas.
        """
        # Convert the point coordinates to a Shapely Point object
        point_geom = Point(x, y)
        # Calculate the distance between the point and the conservation areas
        distances = conservation_areas.geometry.distance(point_geom)
        # Find the minimum distance. Since the data uses british national grid, the units are meters.
        return distances.min()
class InConservationArea(Enum):
    """
    Enum form of the three conservation area states.

    Each member takes its value from the ConservationAreaClient class constant of the
    same name, so the two definitions cannot drift apart.
    """

    # The point lies inside a conservation area
    IN_CONSERVATION_AREA = ConservationAreaClient.IN_CONSERVATION_AREA
    # The point lies outside every conservation area
    NOT_IN_CONSERVATION_AREA = ConservationAreaClient.NOT_IN_CONSERVATION_AREA
    # No usable data is available for the point
    UNKNOWN = ConservationAreaClient.UNKNOWN