Added s3 download for conservation area data

2026-07-27 23:35:01 +00:00 · 2023-09-29 10:19:13 +01:00 · 2023-09-29 10:19:13 +01:00 · 64b6b67499
commit 64b6b67499
parent 642a224a7b
6 changed files with 142 additions and 28 deletions
--- a/.gitignore
+++ b/.gitignore
@ -261,3 +261,6 @@ model_data/simulation_system/predictions/

 .idea/Model.iml
 .idea/misc.iml
+
+adhoc
+adhoc/*
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
    if sap_points <= 0 or sap_points > 100:
        raise ValueError("SAP points should be between 1 and 100.")

-    if sap_points > 91:
+    if sap_points >= 92:
        return "A"
-    elif sap_points > 80:
+    elif sap_points >= 81:
        return "B"
-    elif sap_points > 69:
+    elif sap_points >= 69:
        return "C"
-    elif sap_points > 55:
+    elif sap_points >= 55:
        return "D"
-    elif sap_points > 39:
+    elif sap_points >= 39:
        return "E"
-    elif sap_points > 21:
+    elif sap_points >= 21:
        return "F"
    else:
        return "G"
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
    elif epc == "B":
        return 81
    elif epc == "C":
-        return 70
+        return 69
    elif epc == "D":
-        return 56
+        return 55
    elif epc == "E":
-        return 40
+        return 39
    elif epc == "F":
-        return 22
+        return 21
    elif epc == "G":
        return 1
    else:
--- a/etl/conservation_areas/ConservationAreaClient.py
+++ b/etl/conservation_areas/ConservationAreaClient.py
@ -1,12 +1,55 @@
-from enum import Enum
+import boto3
+import os
+import tempfile
+import pandas as pd
 import geopandas as gpd
+from enum import Enum
 from shapely.geometry import Point
 from utils.logger import setup_logger
+from utils.s3 import read_io_from_s3
 from datatypes.datatypes import OpenUprnCoordinateData

 logger = setup_logger()


+def read_shapefile_from_s3(bucket_name, s3_file_key):
+    """
+    Read a shapefile from S3 into a GeoDataFrame.
+
+    A shapefile is made up of several sibling files (.shp, .shx, .dbf, ...), so every
+    object stored directly in the same S3 "folder" as the .shp file is downloaded into
+    a temporary directory before the .shp file is opened from there.
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_key: The S3 key of the .shp file
+    :return: GeoDataFrame containing the shapefile data
+    :raises FileNotFoundError: If the .shp file is not found under its folder in the bucket
+    """
+
+    s3_folder_key, _, shape_file_name = s3_file_key.rpartition("/")
+    # The trailing slash stops the prefix from also matching sibling folders whose
+    # names merely start with the same characters
+    folder_prefix = s3_folder_key + "/" if s3_folder_key else ""
+
+    s3_client = boto3.client('s3')
+    # A single list call returns at most 1000 keys, so page through the listing
+    paginator = s3_client.get_paginator('list_objects_v2')
+
+    # The directory is created on entry and removed, with its contents, on exit
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        logger.info("Creating temporary directory at %s" % tmpdirname)
+
+        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
+            # 'Contents' is absent from the response when nothing matches the prefix
+            for s3_object in page.get('Contents', []):
+                file_key = s3_object['Key']
+                file_name = file_key[len(folder_prefix):]
+                # Skip the folder placeholder object (empty name) and anything in nested folders
+                if not file_name or "/" in file_name:
+                    continue
+
+                local_file_path = os.path.join(tmpdirname, file_name)
+                with open(local_file_path, 'wb') as tmpfile:
+                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
+
+        shapefile_path = os.path.join(tmpdirname, shape_file_name)
+        if not os.path.isfile(shapefile_path):
+            raise FileNotFoundError(
+                "Shapefile s3://%s/%s was not found" % (bucket_name, s3_file_key)
+            )
+
+        # Read the shapefile before the temporary directory is cleaned up
+        gdf = gpd.read_file(shapefile_path)
+
+    return gdf
+
+
 class ConservationAreaClient:
    """
    Class to interact and manupulate convervation area data. The historic england data
@ -22,9 +65,10 @@ class ConservationAreaClient:
    NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
    UNKNOWN = "unknown"

-    def __init__(self, historic_england_path, gov_path):
+    def __init__(self, historic_england_path, gov_path, bucket):
        self.historic_england_path = historic_england_path
        self.gov_path = gov_path
+        self.bucket = bucket

        self.historic_england_data = None
        self.gov_data = None
@ -34,10 +78,18 @@ class ConservationAreaClient:
        Read the data
        """
        logger.info("Reading in historic england conservation area shapefile")
-        self.historic_england_data = gpd.read_file(self.historic_england_path)
+        self.historic_england_data = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_path
+        )

        logger.info("Reading in Govenment conservation area geojson")
-        self.gov_data = gpd.read_file(self.gov_path)
+
+        self.gov_data = gpd.read_file(
+            read_io_from_s3(
+                bucket_name=self.bucket,
+                file_key=self.gov_path
+            )
+        )
        self.gov_data = self.gov_data.drop(columns=["dataset"])

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
--- a/etl/conservation_areas/app.py
+++ b/etl/conservation_areas/app.py
@ -3,19 +3,49 @@ This application reads in the open uprn data from a static location and loads it
 our database for querying from other services
 """

-import os
-from conservation_areas.ConservationAreaClient import ConservationAreaClient
+from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
 from datatypes.datatypes import OpenUprnCoordinateData

+BUCKET = "retrofit-data-dev"
+HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
+GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
+

 def app():
+    # TODO: Store the input data in S3 [x]
+    #       Read the input data from S3 [ ]
+    #       Document the data source and where to find it [x]
+    #       Write the outputs to S3
+
+    """
+    This application uses the conservation area datasets to determine if a UPRN is
+    in a conservation area or not
+
+    We use two sources of data for determining if homes are in conservation areas.
+    The first is the Historic England dataset, which is a shapefile containing
+    polygons of conservation areas. The second is the gov.uk dataset, which is a
+    geojson file containing polygons of conservation areas.
+
+    The Historic England dataset can be found here:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The listed building dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The heritage buildings dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The Gov.uk dataset can be found here:
+    https://www.planning.data.gov.uk/dataset/conservation-area
+
+    For the moment, these data sources are downloaded manually and uploaded to S3.
+    This application then processes those files and writes the results to s3
+    """
+
    conservation_area_client = ConservationAreaClient(
-        historic_england_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
-        gov_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/gov-conservation-area.geojson"
+        historic_england_path=HISTORIC_ENGLAND_PATHNAME,
+        gov_path=GOV_PATHNAME,
+        bucket=BUCKET
    )
    conservation_area_client.read()

@ -39,6 +69,11 @@ def app():
         'LONGITUDE': -0.0823165}
    ]

+    open_uprn_data = [
+        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
+         'LONGITUDE': -0.0823165}
+    ]
+
    result = [
        {
            "uprn": coordinates["UPRN"],
--- a/etl/conservation_areas/requirements.txt
+++ b/etl/conservation_areas/requirements.txt
--- a/utils/s3.py
+++ b/utils/s3.py
@ -2,6 +2,9 @@ import boto3
 from io import BytesIO
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 import pandas as pd
+from utils.logger import setup_logger
+
+logger = setup_logger()


 def read_from_s3(bucket_name, s3_file_name):
@ -46,6 +49,27 @@ def save_data_to_s3(data, bucket_name, s3_file_name):
        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')


+def read_io_from_s3(bucket_name, file_key):
+    """
+    Fetch an object from S3 and return its contents as an in-memory BytesIO buffer.
+
+    The whole object body is read into memory, and the returned buffer can be handed
+    to any parser that accepts a file-like object.
+
+    :param bucket_name: The name of the S3 bucket
+    :param file_key: The key of the object in S3
+    :return: BytesIO buffer holding the object's bytes, to be parsed by another method
+    """
+    s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_key)
+
+    # Pull the full body out of the response and wrap it in a seekable buffer
+    payload = s3_object['Body'].read()
+
+    return BytesIO(payload)
+
+
 def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
    """
    Save a pandas DataFrame to S3 as a Parquet file.
@ -75,14 +99,14 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key):
    :return: A pandas DataFrame.
    """

-    # Create the boto3 client
-    client = boto3.client('s3')
+    if not file_key.endswith(".parquet"):
+        raise logger.warning("This file doesn't look like a parquet file")

-    # Get the Parquet file from S3
-    response = client.get_object(Bucket=bucket_name, Key=file_key)
+    parquet_buffer = read_io_from_s3(
+        bucket_name=bucket_name,
+        file_key=file_key
+    )

-    # Read the file into a pandas DataFrame
-    parquet_buffer = BytesIO(response['Body'].read())
    df = pd.read_parquet(parquet_buffer)

    return df