From f94cbd438554fd8953701365c0379748c1035321 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 5 Oct 2023 14:27:00 +0100
Subject: [PATCH] Add spatial README, write UPRN partitions as parquet, remove SAP model deploy workflow

---
 .github/workflows/deploy_sap_model_lambda.yml | 81 -------------------
 etl/properties/requirements.txt               |  0
 etl/spatial/OpenUprnClient.py                 | 31 ++++++-
 etl/spatial/README.md                         | 48 +++++++++++
 etl/spatial/app.py                            | 54 +------------
 5 files changed, 78 insertions(+), 136 deletions(-)
 delete mode 100644 .github/workflows/deploy_sap_model_lambda.yml
 create mode 100644 etl/properties/requirements.txt
 create mode 100644 etl/spatial/README.md

diff --git a/.github/workflows/deploy_sap_model_lambda.yml b/.github/workflows/deploy_sap_model_lambda.yml
deleted file mode 100644
index fb4b8dde..00000000
--- a/.github/workflows/deploy_sap_model_lambda.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Sap Model Deploy
-
-on:
-  push:
-    branches: [ dev, prod ]
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.10.12
-
-      - name: Install Serverless and plugins
-        run: |
-          npm install -g serverless
-          npm install -g serverless-domain-manager
-
-      - name: AWS credentials for dev
-        if: github.ref == 'refs/heads/dev'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: AWS credentials for prod
-        if: github.ref == 'refs/heads/prod'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Set domain name
-        id: set_domain
-        run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
-
-      - name: Set ECR credentials
-        id: set_ecr_credentials
-        run: |
-          echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
-
-      - name: Setup Docker
-        uses: docker/setup-buildx-action@v1
-
-      - name: Login to ECR
-        run: |
-          aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-
-      # Building and pushing Docker image with caching
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v3
-        with:
-          context: ./model_data/simulation_system
-          file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
-          push: true
-          tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          platform: linux/amd64
-          provenance: false
-
-      - name: Deploy to AWS Lambda via Serverless
-        env:
-          RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
-          MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
-          PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
-          DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
-          DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
-          ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-          GITHUB_SHA: ${{ github.sha }}
-        run: |
-          # Deploy to AWS Lambda via Serverless
-          sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
diff --git a/etl/properties/requirements.txt b/etl/properties/requirements.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/etl/spatial/OpenUprnClient.py b/etl/spatial/OpenUprnClient.py
index 90d78e8c..7392c4ac 100644
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@@ -1,8 +1,9 @@
+import os
 from tqdm import tqdm
 import pandas as pd
 import geopandas as gpd
 from utils.logger import setup_logger
-from utils.s3 import read_io_from_s3
+from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
 
 logger = setup_logger()
 
@@ -71,21 +72,21 @@ class OpenUprnClient:
         for partition, group in tqdm(self.data.groupby('partition')):
             min_uprn = group['UPRN'].min()
             max_uprn = group['UPRN'].max()
-            self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
+            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"
 
         self.data['filename'] = self.data['partition'].map(self.filenames)
 
     @staticmethod
     def find_filename_for_uprn(uprn, filenames):
         for filename in filenames:
-            min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
+            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
             if min_uprn <= uprn <= max_uprn:
                 return filename
         return None
 
     @staticmethod
     def convert_bng_data_to_gpd(df):
-        
+
         gpd_data = gpd.GeoDataFrame(
             df,
             geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
@@ -93,3 +94,25 @@ class OpenUprnClient:
         )
 
         return gpd_data
+
+    def save_filenames_to_s3(self, bucket_name):
+        """
+        Save the partition filenames and their UPRN bounds to s3.
+
+        Writes one row per partition file to spatial/filename_meta.parquet with
+        the columns filenames, lower (min UPRN) and upper (max UPRN).
+        :param bucket_name: name of the s3 bucket the metadata is written to
+        :return: None
+        """
+        # S3 keys always use "/"; os.path.join would use a backslash on Windows.
+        file_key = "spatial/filename_meta.parquet"
+        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
+        # Raw-string regex; it only matches "<digits>_<digits>", so no suffix strip.
+        bounds = filenames["filenames"].str.extract(r"(\d+)_(\d+)")
+        # Explicit int64: 12-digit UPRNs overflow a 32-bit platform default int.
+        filenames[["lower", "upper"]] = bounds.astype("int64")
+
+        logger.info("Saving filenames to s3 at {}".format(file_key))
+        save_dataframe_to_s3_parquet(
+            df=filenames, file_key=file_key, bucket_name=bucket_name
+        )
diff --git a/etl/spatial/README.md b/etl/spatial/README.md
new file mode 100644
index 00000000..ab68fcd5
--- /dev/null
+++ b/etl/spatial/README.md
@@ -0,0 +1,48 @@
+# Spatial - Geospatial Data Processing Service
+
+## Overview
+
+The Spatial service is designed to read, process, and analyze geospatial data related to
+conservation areas and special buildings. It uses datasets from Historic England and the
+UK government to determine whether a given UPRN (Unique Property Reference Number) is within
+a conservation area or is a special building (listed or heritage at risk). The processed data
+is saved back to an S3 bucket in Parquet format for easy retrieval and further analysis.
+
+## Dependencies
+
+Dependencies are listed in requirements.txt. To install them, run:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Data Sources
+
+1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
+2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
+3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
+4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
+5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
+
+## Files
+
+- app.py: Main application file that orchestrates the data processing flow.
+- ConservationAreaClient.py: Handles reading and processing of conservation area data.
+- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
+- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
+- requirements.txt: Lists all Python package dependencies.
+
+## How to Run
+
+1. Make sure you have all the required packages installed.
+2. Update the S3 bucket and file path constants in app.py.
+3. Run app.py.
+
+## Workflow
+
+1. Read the datasets for conservation areas and special buildings.
+2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
+3. For each partition:
+    - Convert UPRN data to geopandas DataFrame.
+    - Check if each UPRN is within a conservation area or is a special building.
+    - Save the processed data back to S3 in parquet format.
\ No newline at end of file
diff --git a/etl/spatial/app.py b/etl/spatial/app.py
index 39fe1434..d58509dd 100644
--- a/etl/spatial/app.py
+++ b/etl/spatial/app.py
@@ -9,12 +9,11 @@ import pandas as pd
 from etl.spatial.ConservationAreaClient import ConservationAreaClient
 from etl.spatial.OpenUprnClient import OpenUprnClient
 from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
-from datatypes.datatypes import OpenUprnCoordinateData
 from utils.logger import setup_logger
 from utils.s3 import save_dataframe_to_s3_parquet
 
 BUCKET = "retrofit-datalake-dev"
-OUTPUT_BUCKET = "retrofit-dev-dev"
+OUTPUT_BUCKET = "retrofit-data-dev"
 HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
 GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
 OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
@@ -27,13 +26,6 @@ logger = setup_logger()
 
 
 def app():
-    # TODO: Store the input data in S3 [x]
-    #       Read the input data from S3 [x]
-    #       Document the data source and where to find it [x]
-    #       Incorportate listed buildings [x]
-    #       Incorporate heritage buildings [x]
-    #       Write the outputs to S3 [ ]
-
     """
     This application uses the conservation area datasets to determine if a UPRN is
     in a conservation area or now
@@ -79,30 +71,16 @@ def app():
     )
     special_buildings_client.read()
 
-    # Local version
-    OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \
-                         "/osopenuprn_202306_csv/osopenuprn_202305.csv"
     open_uprn_client = OpenUprnClient(
         path=OPEN_UPRN_PATHNAME,
         bucket=BUCKET
     )
     open_uprn_client.read()
-    open_uprn_client.read_local()
 
     # We want to sort the data and split it into filenames on UPRN.
     # We'll split the data into chunks of 50,000
     open_uprn_client.create_file_partitions()
 
-    # special_buildings_client = SpecialBuildingsClient(
-    #     historic_england_listed_buildings_path=None,
-    #     historic_england_heritage_buildings_path=None,
-    #     bucket=None
-    # )
-    # special_buildings_client.historic_england_listed_buildings = \
-    # special_buildings_client2.historic_england_listed_buildings
-    # special_buildings_client.historic_england_heritage_buildings = \
-    #     special_buildings_client2.historic_england_heritage_buildings
-
     logger.info("Extracting spatial data for uprn partitions")
     to_loop_over = open_uprn_client.data.groupby("filename")
 
@@ -121,31 +99,5 @@ def app():
             df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
         )
 
-    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
-    open_uprn_data = [
-        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
-         'LONGITUDE': -0.0540506},
-        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
-         'LONGITUDE': -0.0498772},
-        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
-         'LONGITUDE': -0.226392},
-        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
-         'LONGITUDE': -0.0468833},
-        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
-         'LONGITUDE': -0.1362513},
-        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-         'LONGITUDE': -0.0823165}
-    ]
-
-    result = [
-        {
-            "uprn": coordinates["UPRN"],
-            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
-                OpenUprnCoordinateData(**coordinates))
-        } for coordinates in
-        open_uprn_data
-    ]
+    # Finally, save the partition filenames to S3
+    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)