From 64b6b67499c15eedbffa4f581600f8328936e05f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 29 Sep 2023 10:19:13 +0100
Subject: [PATCH] Added s3 download for conservation area data

---
 .gitignore                                    |  3 +
 backend/app/utils.py                          | 20 +++----
 .../ConservationAreaClient.py                 | 60 +++++++++++++++++--
 .../conservation_areas}/app.py                | 51 +++++++++++++---
 .../conservation_areas}/requirements.txt      |  0
 utils/s3.py                                   | 36 +++++++++--
 6 files changed, 142 insertions(+), 28 deletions(-)
 rename {conservation_areas => etl/conservation_areas}/ConservationAreaClient.py (72%)
 rename {conservation_areas => etl/conservation_areas}/app.py (50%)
 rename {conservation_areas => etl/conservation_areas}/requirements.txt (100%)

diff --git a/.gitignore b/.gitignore
index 98db3e9a..36067acf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -261,3 +261,6 @@ model_data/simulation_system/predictions/
 
 .idea/Model.iml
 .idea/misc.iml
+
+adhoc
+adhoc/*
\ No newline at end of file
diff --git a/backend/app/utils.py b/backend/app/utils.py
index 7099eba1..b4ba1bb9 100644
--- a/backend/app/utils.py
+++ b/backend/app/utils.py
@@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
     if sap_points <= 0 or sap_points > 100:
         raise ValueError("SAP points should be between 1 and 100.")
 
-    if sap_points > 91:
+    if sap_points >= 92:
         return "A"
-    elif sap_points > 80:
+    elif sap_points >= 81:
         return "B"
-    elif sap_points > 69:
+    elif sap_points >= 69:
         return "C"
-    elif sap_points > 55:
+    elif sap_points >= 55:
         return "D"
-    elif sap_points > 39:
+    elif sap_points >= 39:
         return "E"
-    elif sap_points > 21:
+    elif sap_points >= 21:
         return "F"
     else:
         return "G"
@@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
     elif epc == "B":
         return 81
     elif epc == "C":
-        return 70
+        return 69
     elif epc == "D":
-        return 56
+        return 55
     elif epc == "E":
-        return 40
+        return 39
     elif epc == "F":
-        return 22
+        return 21
     elif epc == "G":
         return 1
     else:
diff --git a/conservation_areas/ConservationAreaClient.py b/etl/conservation_areas/ConservationAreaClient.py
similarity index 72%
rename from conservation_areas/ConservationAreaClient.py
rename to etl/conservation_areas/ConservationAreaClient.py
index 164042f9..fbf72704 100644
--- a/conservation_areas/ConservationAreaClient.py
+++ b/etl/conservation_areas/ConservationAreaClient.py
@@ -1,12 +1,55 @@
-from enum import Enum
+import boto3
+import os
+import tempfile
+import pandas as pd
 import geopandas as gpd
+from enum import Enum
 from shapely.geometry import Point
 from utils.logger import setup_logger
+from utils.s3 import read_io_from_s3
 from datatypes.datatypes import OpenUprnCoordinateData
 
 logger = setup_logger()
 
 
+def read_shapefile_from_s3(bucket_name, s3_file_key):
+    """
+    Read a shapefile from S3 into a GeoDataFrame.
+
+    Every object under the .shp file's S3 "folder" is downloaded to a temporary
+    directory first, so the shapefile's companion files sit beside it on disk.
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_key: The S3 key of the .shp file
+    :return: GeoDataFrame containing the shapefile data
+    :raises FileNotFoundError: If s3_file_key is not among the listed objects
+    """
+    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")
+    # The trailing "/" stops the prefix matching sibling folders with longer names
+    prefix = s3_folder_key + "/" if s3_folder_key else ""
+    s3_client = boto3.client('s3')
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        logger.info("Creating temporary directory at %s", tmpdirname)
+        found = False
+        # Paginate (1000 keys per call); a page has no "Contents" when nothing matches
+        pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)
+        for page in pages:
+            for s3_object in page.get('Contents', []):
+                file_key = s3_object['Key']
+                file_name = os.path.basename(file_key)
+                if not file_name:
+                    continue  # "folder" placeholder key ending in "/" has no file name
+                found = found or file_key == s3_file_key
+                with open(os.path.join(tmpdirname, file_name), 'wb') as tmpfile:
+                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
+        if not found:
+            raise FileNotFoundError("s3://%s/%s was not found" % (bucket_name, s3_file_key))
+        # Read inside the with-block: the downloaded files are deleted on exit
+        gdf = gpd.read_file(os.path.join(tmpdirname, shape_file_key))
+
+    return gdf
+
+
 class ConservationAreaClient:
     """
     Class to interact and manupulate convervation area data. The historic england data
@@ -22,9 +65,10 @@ class ConservationAreaClient:
     NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
     UNKNOWN = "unknown"
 
-    def __init__(self, historic_england_path, gov_path):
+    def __init__(self, historic_england_path, gov_path, bucket):
         self.historic_england_path = historic_england_path
         self.gov_path = gov_path
+        self.bucket = bucket
 
         self.historic_england_data = None
         self.gov_data = None
@@ -34,10 +78,18 @@ class ConservationAreaClient:
         Read the data
         """
         logger.info("Reading in historic england conservation area shapefile")
-        self.historic_england_data = gpd.read_file(self.historic_england_path)
+        self.historic_england_data = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_path
+        )
 
         logger.info("Reading in Govenment conservation area geojson")
-        self.gov_data = gpd.read_file(self.gov_path)
+
+        self.gov_data = gpd.read_file(
+            read_io_from_s3(
+                bucket_name=self.bucket,
+                file_key=self.gov_path
+            )
+        )
         self.gov_data = self.gov_data.drop(columns=["dataset"])
 
     def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
diff --git a/conservation_areas/app.py b/etl/conservation_areas/app.py
similarity index 50%
rename from conservation_areas/app.py
rename to etl/conservation_areas/app.py
index 1038bcfe..dddaede6 100644
--- a/conservation_areas/app.py
+++ b/etl/conservation_areas/app.py
@@ -3,19 +3,49 @@ This application reads in the open uprn data from a static location and loads it
 our database for querying from other services
 """
 
-import os
-from conservation_areas.ConservationAreaClient import ConservationAreaClient
+from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
 from datatypes.datatypes import OpenUprnCoordinateData
 
+BUCKET = "retrofit-data-dev"
+HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
+GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
+
 
 def app():
+    # TODO: Store the input data in S3 [x]
+    #       Read the input data from S3 [ ]
+    #       Document the data source and where to find it [x]
+    #       Write the outputs to S3
+
+    """
+    This application uses the conservation area datasets to determine if a UPRN is
+    in a conservation area or not
+
+    We use two sources of data for determining if homes are in conservation areas.
+    The first is the Historic England dataset, which is a shapefile containing
+    polygons of conservation areas. The second is the gov.uk dataset, which is a
+    geojson file containing polygons of conservation areas.
+
+    The Historic England dataset can be found here:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The listed building dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The heritage buildings dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The Gov.uk dataset can be found here:
+    https://www.planning.data.gov.uk/dataset/conservation-area
+
+    For the moment, these data sources are downloaded manually and uploaded to S3.
+    This application then processes those files and writes the results to s3
+    """
+
     conservation_area_client = ConservationAreaClient(
-        historic_england_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
-        gov_path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/gov-conservation-area.geojson"
+        historic_england_path=HISTORIC_ENGLAND_PATHNAME,
+        gov_path=GOV_PATHNAME,
+        bucket=BUCKET
     )
     conservation_area_client.read()
 
@@ -39,6 +69,11 @@ def app():
          'LONGITUDE': -0.0823165}
     ]
 
+    open_uprn_data = [
+        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
+         'LONGITUDE': -0.0823165}
+    ]
+
     result = [
         {
             "uprn": coordinates["UPRN"],
diff --git a/conservation_areas/requirements.txt b/etl/conservation_areas/requirements.txt
similarity index 100%
rename from conservation_areas/requirements.txt
rename to etl/conservation_areas/requirements.txt
diff --git a/utils/s3.py b/utils/s3.py
index 8d24d6c0..be0aa008 100644
--- a/utils/s3.py
+++ b/utils/s3.py
@@ -2,6 +2,9 @@ import boto3
 from io import BytesIO
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 import pandas as pd
+from utils.logger import setup_logger
+
+logger = setup_logger()
 
 
 def read_from_s3(bucket_name, s3_file_name):
@@ -46,6 +49,27 @@ def save_data_to_s3(data, bucket_name, s3_file_name):
         print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
 
 
+def read_io_from_s3(bucket_name, file_key):
+    """
+    Download a single S3 object and hand it back as an in-memory binary stream.
+
+    The whole object body is read into memory; the caller chooses how to parse
+    the returned stream.
+
+    :param bucket_name: The name of the S3 bucket
+    :param file_key: The key of the object to fetch
+    :return: BytesIO holding the object's bytes
+    """
+    # Fetch the object; its "Body" entry is read in full below
+    s3_object = boto3.client('s3').get_object(
+        Bucket=bucket_name,
+        Key=file_key,
+    )
+    payload = s3_object['Body'].read()
+
+    return BytesIO(payload)
+
+
 def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
     """
     Save a pandas DataFrame to S3 as a Parquet file.
@@ -75,14 +99,14 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key):
     :return: A pandas DataFrame.
     """
 
-    # Create the boto3 client
-    client = boto3.client('s3')
+    if not file_key.endswith(".parquet"):  # fail fast on keys that are not parquet files
+        raise ValueError(f"This file doesn't look like a parquet file: {file_key}")
 
-    # Get the Parquet file from S3
-    response = client.get_object(Bucket=bucket_name, Key=file_key)
+    parquet_buffer = read_io_from_s3(
+        bucket_name=bucket_name,
+        file_key=file_key
+    )
 
-    # Read the file into a pandas DataFrame
-    parquet_buffer = BytesIO(response['Body'].read())
     df = pd.read_parquet(parquet_buffer)
 
     return df