Added s3 download for conservation area data

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-29 10:19:13 +01:00
parent 642a224a7b
commit 64b6b67499
6 changed files with 142 additions and 28 deletions

3
.gitignore vendored
View file

@ -261,3 +261,6 @@ model_data/simulation_system/predictions/
.idea/Model.iml
.idea/misc.iml
adhoc
adhoc/*

View file

@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
if sap_points <= 0 or sap_points > 100:
raise ValueError("SAP points should be between 1 and 100.")
if sap_points > 91:
if sap_points >= 92:
return "A"
elif sap_points > 80:
elif sap_points >= 81:
return "B"
elif sap_points > 69:
elif sap_points >= 69:
return "C"
elif sap_points > 55:
elif sap_points >= 55:
return "D"
elif sap_points > 39:
elif sap_points >= 39:
return "E"
elif sap_points > 21:
elif sap_points >= 21:
return "F"
else:
return "G"
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
elif epc == "B":
return 81
elif epc == "C":
return 70
return 69
elif epc == "D":
return 56
return 55
elif epc == "E":
return 40
return 39
elif epc == "F":
return 22
return 21
elif epc == "G":
return 1
else:

View file

@ -1,12 +1,55 @@
from enum import Enum
import boto3
import os
import tempfile
import pandas as pd
import geopandas as gpd
from enum import Enum
from shapely.geometry import Point
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
from datatypes.datatypes import OpenUprnCoordinateData
logger = setup_logger()
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    A shapefile is stored as several sibling files sharing one base name
    (.shp, .shx, .dbf, .prj, ...). Every object in the same S3 folder whose key
    starts with that base name is downloaded into a temporary directory, and the
    .shp file is then opened from there.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The key of the .shp file inside the bucket
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: If no object in the bucket matches the shapefile's base name
    """
    shape_file_name = os.path.basename(s3_file_key)

    # "folder/name.shp" -> "folder/name." so that only the components of this
    # shapefile are listed, rather than everything that happens to share the
    # folder prefix (or the whole bucket when the key has no folder).
    stem, extension = os.path.splitext(s3_file_key)
    component_prefix = stem + "." if extension else s3_file_key

    s3_client = boto3.client('s3')
    # list_objects_v2 returns at most 1000 keys per call; the paginator follows
    # the continuation tokens for us.
    paginator = s3_client.get_paginator('list_objects_v2')

    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Downloading shapefile components to temporary directory %s", tmpdirname)

        downloaded_count = 0
        for page in paginator.paginate(Bucket=bucket_name, Prefix=component_prefix):
            # 'Contents' is absent from the response when nothing matches
            for s3_object in page.get('Contents', []):
                file_key = s3_object['Key']

                # Skip nested keys and "folder placeholder" keys: they are not
                # shapefile components and would collide once flattened.
                if "/" in file_key[len(component_prefix):]:
                    continue

                local_file_path = os.path.join(tmpdirname, os.path.basename(file_key))
                with open(local_file_path, 'wb') as local_file:
                    s3_client.download_fileobj(bucket_name, file_key, local_file)
                downloaded_count += 1

        if downloaded_count == 0:
            raise FileNotFoundError(
                f"No shapefile components found at s3://{bucket_name}/{component_prefix}*"
            )

        # Must be read before the temporary directory is cleaned up
        gdf = gpd.read_file(os.path.join(tmpdirname, shape_file_name))

    return gdf
class ConservationAreaClient:
"""
Class to interact with and manipulate conservation area data. The historic england data
@ -22,9 +65,10 @@ class ConservationAreaClient:
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
UNKNOWN = "unknown"
def __init__(self, historic_england_path, gov_path):
def __init__(self, historic_england_path, gov_path, bucket):
self.historic_england_path = historic_england_path
self.gov_path = gov_path
self.bucket = bucket
self.historic_england_data = None
self.gov_data = None
@ -34,10 +78,18 @@ class ConservationAreaClient:
Read the data
"""
logger.info("Reading in historic england conservation area shapefile")
self.historic_england_data = gpd.read_file(self.historic_england_path)
self.historic_england_data = read_shapefile_from_s3(
bucket_name=self.bucket, s3_file_key=self.historic_england_path
)
logger.info("Reading in Govenment conservation area geojson")
self.gov_data = gpd.read_file(self.gov_path)
self.gov_data = gpd.read_file(
read_io_from_s3(
bucket_name=self.bucket,
file_key=self.gov_path
)
)
self.gov_data = self.gov_data.drop(columns=["dataset"])
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):

View file

@ -3,19 +3,49 @@ This application reads in the open uprn data from a static location and loads it
our database for querying from other services
"""
import os
from conservation_areas.ConservationAreaClient import ConservationAreaClient
from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
from datatypes.datatypes import OpenUprnCoordinateData
BUCKET = "retrofit-data-dev"
HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
def app():
# TODO: Store the input data in S3 [x]
# Read the input data from S3 [ ]
# Document the data source and where to find it [x]
# Write the outputs to S3
"""
This application uses the conservation area datasets to determine if a UPRN is
in a conservation area or not
We use two sources of data for determining if homes are in conservation areas.
The first is the Historic England dataset, which is a shapefile containing
polygons of conservation areas. The second is the gov.uk dataset, which is a
geojson file containing polygons of conservation areas.
The Historic England dataset can be found here:
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
The listed building dataset is also found at Historic England at:
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
The heritage buildings dataset is also found at Historic England at:
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
The Gov.uk dataset can be found here:
https://www.planning.data.gov.uk/dataset/conservation-area
For the moment, these data sources are downloaded manually and uploaded to S3.
This application then processes those files and writes the results to s3
"""
conservation_area_client = ConservationAreaClient(
historic_england_path=os.path.abspath(
os.path.dirname(__file__)
) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
gov_path=os.path.abspath(
os.path.dirname(__file__)
) + "/model_data/local_data/gov-conservation-area.geojson"
historic_england_path=HISTORIC_ENGLAND_PATHNAME,
gov_path=GOV_PATHNAME,
bucket=BUCKET
)
conservation_area_client.read()
@ -39,6 +69,11 @@ def app():
'LONGITUDE': -0.0823165}
]
open_uprn_data = [
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
'LONGITUDE': -0.0823165}
]
result = [
{
"uprn": coordinates["UPRN"],

View file

@ -2,6 +2,9 @@ import boto3
from io import BytesIO
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
import pandas as pd
from utils.logger import setup_logger
logger = setup_logger()
def read_from_s3(bucket_name, s3_file_name):
@ -46,6 +49,27 @@ def save_data_to_s3(data, bucket_name, s3_file_name):
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
def read_io_from_s3(bucket_name, file_key):
    """
    Fetch an S3 object and hand it back as an in-memory BytesIO buffer.

    The buffer is format-agnostic, so the caller decides how to parse it
    (for example as parquet or geojson).

    :param bucket_name: The name of the S3 bucket
    :param file_key: The key of the object to fetch from the bucket
    :return: BytesIO buffer holding the full contents of the object
    """
    s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_key)

    # Pull the whole body into memory so the caller gets a seekable buffer
    payload = s3_object['Body'].read()
    return BytesIO(payload)
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
"""
Save a pandas DataFrame to S3 as a Parquet file.
@ -75,14 +99,14 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key):
:return: A pandas DataFrame.
"""
# Create the boto3 client
client = boto3.client('s3')
if not file_key.endswith(".parquet"):
raise logger.warning("This file doesn't look like a parquet file")
# Get the Parquet file from S3
response = client.get_object(Bucket=bucket_name, Key=file_key)
parquet_buffer = read_io_from_s3(
bucket_name=bucket_name,
file_key=file_key
)
# Read the file into a pandas DataFrame
parquet_buffer = BytesIO(response['Body'].read())
df = pd.read_parquet(parquet_buffer)
return df