From 64b6b67499c15eedbffa4f581600f8328936e05f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Sep 2023 10:19:13 +0100 Subject: [PATCH] Added s3 download for conservation area data --- .gitignore | 3 + backend/app/utils.py | 20 +++---- .../ConservationAreaClient.py | 60 +++++++++++++++++-- .../conservation_areas}/app.py | 51 +++++++++++++--- .../conservation_areas}/requirements.txt | 0 utils/s3.py | 36 +++++++++-- 6 files changed, 142 insertions(+), 28 deletions(-) rename {conservation_areas => etl/conservation_areas}/ConservationAreaClient.py (72%) rename {conservation_areas => etl/conservation_areas}/app.py (50%) rename {conservation_areas => etl/conservation_areas}/requirements.txt (100%) diff --git a/.gitignore b/.gitignore index 98db3e9a..36067acf 100644 --- a/.gitignore +++ b/.gitignore @@ -261,3 +261,6 @@ model_data/simulation_system/predictions/ .idea/Model.iml .idea/misc.iml + +adhoc +adhoc/* \ No newline at end of file diff --git a/backend/app/utils.py b/backend/app/utils.py index 7099eba1..b4ba1bb9 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int): if sap_points <= 0 or sap_points > 100: raise ValueError("SAP points should be between 1 and 100.") - if sap_points > 91: + if sap_points >= 92: return "A" - elif sap_points > 80: + elif sap_points >= 81: return "B" - elif sap_points > 69: + elif sap_points >= 69: return "C" - elif sap_points > 55: + elif sap_points >= 55: return "D" - elif sap_points > 39: + elif sap_points >= 39: return "E" - elif sap_points > 21: + elif sap_points >= 21: return "F" else: return "G" @@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str): elif epc == "B": return 81 elif epc == "C": - return 70 + return 69 elif epc == "D": - return 56 + return 55 elif epc == "E": - return 40 + return 39 elif epc == "F": - return 22 + return 21 elif epc == "G": return 1 else: diff --git a/conservation_areas/ConservationAreaClient.py 
b/etl/conservation_areas/ConservationAreaClient.py similarity index 72% rename from conservation_areas/ConservationAreaClient.py rename to etl/conservation_areas/ConservationAreaClient.py index 164042f9..fbf72704 100644 --- a/conservation_areas/ConservationAreaClient.py +++ b/etl/conservation_areas/ConservationAreaClient.py @@ -1,12 +1,55 @@ -from enum import Enum +import boto3 +import os +import tempfile +import pandas as pd import geopandas as gpd +from enum import Enum from shapely.geometry import Point from utils.logger import setup_logger +from utils.s3 import read_io_from_s3 from datatypes.datatypes import OpenUprnCoordinateData logger = setup_logger() +def read_shapefile_from_s3(bucket_name, s3_file_key): + """ + Read a shapefile from S3 into a GeoDataFrame. + + A shapefile is a set of sibling files (.shp, .shx, .dbf, ...) that share one + base name, so every object sharing the base name of s3_file_key is downloaded. + + :param bucket_name: The name of the S3 bucket + :param s3_file_key: The file path of the shape file + :return: GeoDataFrame containing the shapefile data + :raises FileNotFoundError: If s3_file_key is not found in the bucket + """ + + # Match on the base name plus "." so unrelated objects in the same folder are skipped + s3_prefix = "%s." % os.path.splitext(s3_file_key)[0] + shape_file_name = os.path.basename(s3_file_key) + s3_client = boto3.client('s3') + # Paginate the listing: a single list call returns at most 1000 objects + pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=s3_prefix) + + with tempfile.TemporaryDirectory() as tmpdirname: + logger.info("Creating temporary directory at %s" % tmpdirname) + for page in pages: + # 'Contents' is missing from the response when nothing matches the prefix + for s3_object in page.get('Contents', []): + file_key = s3_object['Key'] + local_file_path = os.path.join(tmpdirname, os.path.basename(file_key)) + s3_client.download_file(bucket_name, file_key, local_file_path) + + shapefile_path = os.path.join(tmpdirname, shape_file_name) + if not os.path.exists(shapefile_path): + raise FileNotFoundError("s3://%s/%s not found" % (bucket_name, s3_file_key)) + # Read while the temporary directory still exists + gdf = gpd.read_file(shapefile_path) + + return gdf + + 
class ConservationAreaClient: """ Class to interact and manupulate convervation area data. The historic england data @@ -22,9 +65,10 @@ class ConservationAreaClient: NOT_IN_CONSERVATION_AREA = "not_in_conservation_area" UNKNOWN = "unknown" - def __init__(self, historic_england_path, gov_path): + def __init__(self, historic_england_path, gov_path, bucket): self.historic_england_path = historic_england_path self.gov_path = gov_path + self.bucket = bucket self.historic_england_data = None self.gov_data = None @@ -34,10 +78,18 @@ class ConservationAreaClient: Read the data """ logger.info("Reading in historic england conservation area shapefile") - self.historic_england_data = gpd.read_file(self.historic_england_path) + self.historic_england_data = read_shapefile_from_s3( + bucket_name=self.bucket, s3_file_key=self.historic_england_path + ) logger.info("Reading in Govenment conservation area geojson") - self.gov_data = gpd.read_file(self.gov_path) + + self.gov_data = gpd.read_file( + read_io_from_s3( + bucket_name=self.bucket, + file_key=self.gov_path + ) + ) self.gov_data = self.gov_data.drop(columns=["dataset"]) def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData): diff --git a/conservation_areas/app.py b/etl/conservation_areas/app.py similarity index 50% rename from conservation_areas/app.py rename to etl/conservation_areas/app.py index 1038bcfe..dddaede6 100644 --- a/conservation_areas/app.py +++ b/etl/conservation_areas/app.py @@ -3,19 +3,49 @@ This application reads in the open uprn data from a static location and loads it our database for querying from other services """ -import os -from conservation_areas.ConservationAreaClient import ConservationAreaClient +from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient from datatypes.datatypes import OpenUprnCoordinateData +BUCKET = "retrofit-data-dev" +HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp" +GOV_PATHNAME = 
"spatial/gov-conservation-area.geojson" + def app(): + # TODO: Store the input data in S3 [x] + # Read the input data from S3 [ ] + # Document the data source and where to find it [x] + # Write the outputs to S3 + + """ + This application uses the conservation area datasets to determine if a UPRN is + in a conservation area or not + + We use two sources of data for determining if homes are in conservation areas. + The first is the Historic England dataset, which is a shapefile containing + polygons of conservation areas. The second is the gov.uk dataset, which is a + geojson file containing polygons of conservation areas. + + The Historic England dataset can be found here: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The listed building dataset is also found at Historic England at: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The heritage buildings dataset is also found at Historic England at: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The Gov.uk dataset can be found here: + https://www.planning.data.gov.uk/dataset/conservation-area + + For the moment, these data sources are downloaded manually and uploaded to S3. 
+ This application then processes those files and writes the results to s3 + """ + conservation_area_client = ConservationAreaClient( - historic_england_path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp", - gov_path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/gov-conservation-area.geojson" + historic_england_path=HISTORIC_ENGLAND_PATHNAME, + gov_path=GOV_PATHNAME, + bucket=BUCKET ) conservation_area_client.read() @@ -39,6 +69,11 @@ def app(): 'LONGITUDE': -0.0823165} ] + open_uprn_data = [ + {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309, + 'LONGITUDE': -0.0823165} + ] + result = [ { "uprn": coordinates["UPRN"], diff --git a/conservation_areas/requirements.txt b/etl/conservation_areas/requirements.txt similarity index 100% rename from conservation_areas/requirements.txt rename to etl/conservation_areas/requirements.txt diff --git a/utils/s3.py b/utils/s3.py index 8d24d6c0..be0aa008 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -2,6 +2,9 @@ import boto3 from io import BytesIO from botocore.exceptions import NoCredentialsError, PartialCredentialsError import pandas as pd +from utils.logger import setup_logger + +logger = setup_logger() def read_from_s3(bucket_name, s3_file_name): @@ -46,6 +49,27 @@ def save_data_to_s3(data, bucket_name, s3_file_name): print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}') +def read_io_from_s3(bucket_name, file_key): + """ + Read a file from S3 into a BytesIO object. 
This can be used by other methods to parse the response + + The whole object is read into memory before it is returned + + :param bucket_name: The name of the S3 bucket + :param file_key: The key of the file in S3 + :return: BytesIO buffer to be parsed by another method + """ + client = boto3.client('s3') + + # Get the object from S3 + response = client.get_object(Bucket=bucket_name, Key=file_key) + + # Read the file into an io object + buffer = BytesIO(response['Body'].read()) + + return buffer + + def save_dataframe_to_s3_parquet(df, bucket_name, file_key): """ Save a pandas DataFrame to S3 as a Parquet file. @@ -75,14 +99,14 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key): :return: A pandas DataFrame. """ - # Create the boto3 client - client = boto3.client('s3') + if not file_key.endswith(".parquet"): + raise ValueError("This file doesn't look like a parquet file") - # Get the Parquet file from S3 - response = client.get_object(Bucket=bucket_name, Key=file_key) + parquet_buffer = read_io_from_s3( + bucket_name=bucket_name, + file_key=file_key + ) - # Read the file into a pandas DataFrame - parquet_buffer = BytesIO(response['Body'].read()) df = pd.read_parquet(parquet_buffer) return df