added spatial readme

2026-07-27 23:35:01 +00:00 · 2023-10-05 14:27:00 +01:00 · 2023-10-05 14:27:00 +01:00 · f94cbd4385
commit f94cbd4385
parent c267496353
5 changed files with 78 additions and 136 deletions
--- a/.github/workflows/deploy_sap_model_lambda.yml
+++ b/.github/workflows/deploy_sap_model_lambda.yml
@ -1,81 +0,0 @@
-name: Sap Model Deploy
-
-on:
-  push:
-    branches: [ dev, prod ]
-
-jobs:
-  deploy:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.10.12
-
-      - name: Install Serverless and plugins
-        run: |
-          npm install -g serverless
-          npm install -g serverless-domain-manager
-
-      - name: AWS credentials for dev
-        if: github.ref == 'refs/heads/dev'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: AWS credentials for prod
-        if: github.ref == 'refs/heads/prod'
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Set domain name
-        id: set_domain
-        run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
-
-      - name: Set ECR credentials
-        id: set_ecr_credentials
-        run: |
-          echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
-
-      - name: Setup Docker
-        uses: docker/setup-buildx-action@v1
-
-      - name: Login to ECR
-        run: |
-          aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-
-      # Building and pushing Docker image with caching
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v3
-        with:
-          context: ./model_data/simulation_system
-          file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
-          push: true
-          tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          platform: linux/amd64
-          provenance: false
-
-      - name: Deploy to AWS Lambda via Serverless
-        env:
-          RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
-          MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
-          PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
-          DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
-          DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
-          ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
-          GITHUB_SHA: ${{ github.sha }}
-        run: |
-          # Deploy to AWS Lambda via Serverless
-          sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
--- a/etl/properties/requirements.txt
+++ b/etl/properties/requirements.txt
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@ -1,8 +1,9 @@
+import os
 from tqdm import tqdm
 import pandas as pd
 import geopandas as gpd
 from utils.logger import setup_logger
-from utils.s3 import read_io_from_s3
+from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet

 logger = setup_logger()

@ -71,21 +72,21 @@ class OpenUprnClient:
        for partition, group in tqdm(self.data.groupby('partition')):
            min_uprn = group['UPRN'].min()
            max_uprn = group['UPRN'].max()
-            self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
+            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"

        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        for filename in filenames:
-            min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
+            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
-        
+
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
@ -93,3 +94,25 @@ class OpenUprnClient:
        )

        return gpd_data
+
+    def save_filenames_to_s3(self, bucket_name):
+        """
+        Save the partition filenames and their lower/upper UPRN bounds to s3 as parquet
+        :param bucket_name: bucket passed through to save_dataframe_to_s3_parquet
+        :return: None
+        """
+        file_key = os.path.join("spatial", "filename_meta.parquet")
+
+        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
+        # Raw-string pattern avoids the invalid "\d" escape warning. The two digit runs are
+        # captured directly, so no regex-ambiguous str.replace('.parquet', '') is needed.
+        filenames[['lower', 'upper']] = filenames['filenames'].str.extract(
+            r'(\d+)_(\d+)'
+        )
+        filenames['lower'] = filenames['lower'].astype(int)
+        filenames['upper'] = filenames['upper'].astype(int)
+
+        logger.info("Saving filenames to s3 at {}".format(file_key))
+        save_dataframe_to_s3_parquet(
+            df=filenames, file_key=file_key, bucket_name=bucket_name
+        )
--- a/etl/spatial/README.md
+++ b/etl/spatial/README.md
@ -0,0 +1,48 @@
+# Spatial - Geospatial Data Processing Service
+
+## Overview
+
+The Spatial service is designed to read, process, and analyze geospatial data related to
+conservation areas and special buildings. It uses datasets from Historic England and the
+UK government to determine whether a given UPRN (Unique Property Reference Number) is within
+a conservation area or is a listed building. The processed data is saved back to an S3 bucket
+in Parquet format for easy retrieval and further analysis.
+
+## Dependencies
+
+Dependencies are listed in requirements.txt. To install them, run:
+
+```
+pip install -r requirements.txt
+```
+
+## Data Sources
+
+1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
+2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
+3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
+4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
+5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
+
+## Files
+
+- app.py: Main application file that orchestrates the data processing flow.
+- ConservationAreaClient.py: Handles reading and processing of conservation area data.
+- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
+- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
+- requirements.txt: Lists all Python package dependencies.
+
+## How to Run
+
+1. Make sure you have all the required packages installed.
+2. Update the S3 bucket and file path constants in app.py.
+3. Run app.py.
+
+## Workflow
+
+1. Read the datasets for conservation areas and special buildings.
+2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
+3. For each partition:
+    - Convert UPRN data to geopandas DataFrame.
+    - Check if each UPRN is within a conservation area or is a special building.
+    - Save the processed data back to S3 in parquet format.
+4. Save the partition filename metadata (`spatial/filename_meta.parquet`) to S3.
--- a/etl/spatial/app.py
+++ b/etl/spatial/app.py
@ -9,12 +9,11 @@ import pandas as pd
 from etl.spatial.ConservationAreaClient import ConservationAreaClient
 from etl.spatial.OpenUprnClient import OpenUprnClient
 from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
-from datatypes.datatypes import OpenUprnCoordinateData
 from utils.logger import setup_logger
 from utils.s3 import save_dataframe_to_s3_parquet

 BUCKET = "retrofit-datalake-dev"
-OUTPUT_BUCKET = "retrofit-dev-dev"
+OUTPUT_BUCKET = "retrofit-data-dev"
 HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
 GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
 OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
@ -27,13 +26,6 @@ logger = setup_logger()


 def app():
-    # TODO: Store the input data in S3 [x]
-    #       Read the input data from S3 [x]
-    #       Document the data source and where to find it [x]
-    #       Incorportate listed buildings [x]
-    #       Incorporate heritage buildings [x]
-    #       Write the outputs to S3 [ ]
-
    """
    This application uses the conservation area datasets to determine if a UPRN is
     in a conservation area or not
@ -79,30 +71,16 @@ def app():
    )
    special_buildings_client.read()

-    # Local version
-    OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \
-                         "/osopenuprn_202306_csv/osopenuprn_202305.csv"
    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()
-    open_uprn_client.read_local()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

-    # special_buildings_client = SpecialBuildingsClient(
-    #     historic_england_listed_buildings_path=None,
-    #     historic_england_heritage_buildings_path=None,
-    #     bucket=None
-    # )
-    # special_buildings_client.historic_england_listed_buildings = \
-    # special_buildings_client2.historic_england_listed_buildings
-    # special_buildings_client.historic_england_heritage_buildings = \
-    #     special_buildings_client2.historic_england_heritage_buildings
-
    logger.info("Extracting spatial data for uprn partitions")
    to_loop_over = open_uprn_client.data.groupby("filename")

@ -121,31 +99,5 @@ def app():
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

-    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
-    open_uprn_data = [
-        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
-         'LONGITUDE': -0.0540506},
-        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
-         'LONGITUDE': -0.0498772},
-        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
-         'LONGITUDE': -0.226392},
-        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
-         'LONGITUDE': -0.0468833},
-        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
-         'LONGITUDE': -0.1362513},
-        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-         'LONGITUDE': -0.0823165}
-    ]
-
-    result = [
-        {
-            "uprn": coordinates["UPRN"],
-            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
-                OpenUprnCoordinateData(**coordinates))
-        } for coordinates in
-        open_uprn_data
-    ]
+    # Finally, we save the filenames to s3
+    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)