mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
234 lines
8.8 KiB
Python
234 lines
8.8 KiB
Python
import os
import tempfile
from enum import Enum
from typing import Optional

import boto3
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from datatypes.datatypes import OpenUprnCoordinateData
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
|
|
|
|
# Module-level logger used by the S3 shapefile helper and the conservation area client.
logger = setup_logger()
|
|
|
|
|
|
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    A shapefile is stored as several sibling files, so every object that sits
    directly in the same S3 "folder" as ``s3_file_key`` is downloaded into a
    temporary directory before the requested file is opened from there.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The file path of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: If ``s3_file_key`` is not found in its S3 folder
    """
    s3_folder_key, _, shape_file_name = s3_file_key.rpartition("/")
    # A trailing slash stops the prefix "a/b" from also matching "a/bc/...".
    prefix = s3_folder_key + "/" if s3_folder_key else ""

    s3_client = boto3.client("s3")

    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Creating temporary directory at %s", tmpdirname)

        # Paginate so folders holding more than 1000 objects are fully listed.
        # The delimiter restricts the listing to direct children of the folder.
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/")

        downloaded = set()
        for page in pages:
            # "Contents" is absent from the response when nothing matches.
            for s3_object in page.get("Contents", []):
                file_key = s3_object["Key"]
                file_name = file_key[len(prefix):]
                # Skip the folder placeholder itself and anything nested deeper,
                # which would otherwise collide once flattened to a basename.
                if not file_name or "/" in file_name:
                    continue

                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, "wb") as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
                downloaded.add(file_name)

        if shape_file_name not in downloaded:
            raise FileNotFoundError(
                "Shapefile %s was not found in bucket %s" % (s3_file_key, bucket_name)
            )

        # Read while the temporary directory still exists; it is removed on exit.
        shapefile_path = os.path.join(tmpdirname, shape_file_name)
        gdf = gpd.read_file(shapefile_path)

    return gdf
|
|
|
|
|
|
class ConservationAreaClient:
    """
    Class to interact with and manipulate conservation area data. The Historic England
    data can be found at the following location:
    https://opendata-historicengland.hub.arcgis.com/datasets/historicengland::conservation-areas/about

    We also use a separate government conservation area dataset which can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    Results are expressed with the class constants ``IN_CONSERVATION_AREA`` (True),
    ``NOT_IN_CONSERVATION_AREA`` (False) and ``UNKNOWN`` (None).
    """

    SOURCES = ["historic_england"]
    IN_CONSERVATION_AREA = True
    NOT_IN_CONSERVATION_AREA = False
    UNKNOWN = None

    # NAME value on Historic England polygons for which no data has been published;
    # a match against only such polygons is treated as UNKNOWN.
    _NO_DATA_NAME = "No data available for publication by HE"

    def __init__(self, historic_england_path, gov_path, bucket):
        """
        Store the S3 locations of both datasets. Nothing is loaded until read() is called.

        :param historic_england_path: S3 key of the Historic England shapefile
        :param gov_path: S3 key of the government conservation area file
        :param bucket: name of the S3 bucket holding both files
        """
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
        self.bucket = bucket

        self.historic_england_data = None
        self.gov_data = None

    def read(self):
        """
        Read both datasets from S3 into ``historic_england_data`` and ``gov_data``.

        The government data has its "dataset" column dropped and is reprojected to
        British National Grid (EPSG:27700).
        """
        logger.info("Reading in historic england conservation area shapefile")
        self.historic_england_data = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_path
        )

        logger.info("Reading in Govenment conservation area geojson")
        gov_data = gpd.read_file(
            read_io_from_s3(bucket_name=self.bucket, file_key=self.gov_path)
        )
        gov_data = gov_data.drop(columns=["dataset"])
        # Convert the gov data to british national grid co-ordinates
        self.gov_data = gov_data.to_crs("EPSG:27700")

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData) -> Optional[bool]:
        """
        Check if a property is in a conservation area.

        Historic England data is consulted first. Only when it cannot give a definitive
        answer is the government dataset used as a fallback.

        :param coordinates: coordinates of the property, in the OpenUprnCoordinateData format
        :return: IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
        :raises ValueError: If no coordinates are supplied
        """
        if not coordinates:
            raise ValueError("Coordinates have not been set, run get_coordinates() first")

        status = self.is_in_conservation_area_historic_england(
            x_bng=coordinates.X_COORDINATE,
            y_bng=coordinates.Y_COORDINATE,
        )

        # UNKNOWN is None, so it must be tested by identity. Comparing against the
        # string "unknown" never matched and left the fallback below unreachable.
        if status is not self.UNKNOWN:
            return status

        # We double check the secondary data source
        backup = self.is_in_conservation_area_historic_gov(
            longitude=coordinates.LONGITUDE,
            latitude=coordinates.LATITUDE,
        )
        if backup:
            return self.IN_CONSERVATION_AREA
        return self.UNKNOWN

    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Classify every row of ``uprn_gdf`` using spatial joins against both datasets.

        A ``conservation_status`` column is written onto ``uprn_gdf`` itself (the input
        is modified in place) and the same object is returned.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometries
            (assumed to be in the same CRS as both datasets - TODO confirm with callers)
        :return: ``uprn_gdf`` with the added ``conservation_status`` column
        """
        joined_gdf_he = gpd.sjoin(
            uprn_gdf, self.historic_england_data, how="left", predicate="within"
        )
        he_unmatched = joined_gdf_he["index_right"].isna()

        # Identify where we have definitive information (not "unknown")
        in_conservation_he = ~he_unmatched & (joined_gdf_he["NAME"] != self._NO_DATA_NAME)
        uprn_in_conservation_he = joined_gdf_he.loc[in_conservation_he, "UPRN"].unique()

        # The right index will be missing when we don't have a match so the uprn is not
        # in a conservation area
        uprn_not_in_conservation_he = joined_gdf_he.loc[
            ~joined_gdf_he["UPRN"].isin(uprn_in_conservation_he) & he_unmatched,
            "UPRN",
        ].unique()

        # Check everything not confirmed by Historic England against government data.
        # NOTE(review): this also re-checks rows Historic England classed as "not in",
        # and a government match overrides them to "in"; the single-property
        # is_in_conservation_area() only falls back for UNKNOWN. Confirm which is intended.
        unknown_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]

        joined_gdf_gov = gpd.sjoin(unknown_gdf, self.gov_data, how="left", predicate="within")
        uprn_in_conservation_gov = joined_gdf_gov.loc[
            ~joined_gdf_gov["index_right"].isna(), "UPRN"
        ].unique()

        # Later assignments take precedence over earlier ones.
        uprn_gdf["conservation_status"] = self.UNKNOWN
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_he), "conservation_status"
        ] = self.IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), "conservation_status"
        ] = self.NOT_IN_CONSERVATION_AREA
        uprn_gdf.loc[
            uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), "conservation_status"
        ] = self.IN_CONSERVATION_AREA

        return uprn_gdf

    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> Optional[bool]:
        """
        Check if a property is in a conservation area using the Historic England data.

        :param x_bng: x coordinate in british national grid coordinates
        :param y_bng: y coordinate in british national grid coordinates
        :return: NOT_IN_CONSERVATION_AREA when no polygon contains the point, UNKNOWN when
            every containing polygon is a "no data" placeholder, otherwise IN_CONSERVATION_AREA
        """
        point = Point(x_bng, y_bng)
        within_areas = self.historic_england_data.contains(point)

        if not within_areas.any():
            return self.NOT_IN_CONSERVATION_AREA

        # We want to deduce if we actually have data on this area
        names = self.historic_england_data.loc[within_areas, "NAME"]
        if (names == self._NO_DATA_NAME).all():
            return self.UNKNOWN

        return self.IN_CONSERVATION_AREA

    def is_in_conservation_area_historic_gov(self, longitude: float, latitude: float) -> bool:
        """
        Check if a property is in a conservation area using the government data.

        :param longitude: longitude coordinate (treated as WGS84 / EPSG:4326)
        :param latitude: latitude coordinate (treated as WGS84 / EPSG:4326)
        :return: True when any government polygon contains the point
        """
        point = Point(longitude, latitude)

        # read() reprojects the government data to EPSG:27700, so a raw lon/lat point
        # would never fall inside it. Move the point into the dataset's own CRS first.
        gov_crs = self.gov_data.crs
        if gov_crs is not None:
            point = gpd.GeoSeries([point], crs="EPSG:4326").to_crs(gov_crs).iloc[0]

        return bool(self.gov_data.contains(point).any())

    def calculate_distance_to_nearest_conservation_area(
        self, x: float, y: float, source: str
    ) -> Optional[float]:
        """
        Calculate the distance from a point to the nearest conservation area.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param source: Dataset to measure against; only "historic_england" is handled.
        :return: The distance to the nearest area, or None for any other ``source``.
        """
        if source == "historic_england":
            return self._distance_to_nearest_conservation_area_historic_england(
                x, y, self.historic_england_data
            )
        return None

    @staticmethod
    def _distance_to_nearest_conservation_area_historic_england(
        x: float, y: float, conservation_areas: gpd.GeoDataFrame
    ) -> float:
        """
        Calculate the distance from a given point to the nearest conservation area.

        :param x: The x-coordinate of the point.
        :param y: The y-coordinate of the point.
        :param conservation_areas: A GeoDataFrame containing the conservation areas polygons.
        :return: The minimum distance, in the units of the conservation areas' coordinate
            system (meters if the data uses British National Grid).
        """
        # Convert the point coordinates to a Shapely Point object
        point_geom = Point(x, y)

        # Distance from the point to every conservation area; keep the smallest.
        distances = conservation_areas.geometry.distance(point_geom)
        return distances.min()
|
|
|
|
|
|
class InConservationArea(Enum):
    """
    Enumeration of the possible conservation area results.

    Each member's value is the matching constant on ConservationAreaClient, so the
    enum and the client always agree on the underlying value.
    """

    IN_CONSERVATION_AREA = ConservationAreaClient.IN_CONSERVATION_AREA
    NOT_IN_CONSERVATION_AREA = ConservationAreaClient.NOT_IN_CONSERVATION_AREA
    UNKNOWN = ConservationAreaClient.UNKNOWN
|