From f94cbd438554fd8953701365c0379748c1035321 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Oct 2023 14:27:00 +0100 Subject: [PATCH] added spatial readme --- .github/workflows/deploy_sap_model_lambda.yml | 81 ------------------- etl/properties/requirements.txt | 0 etl/spatial/OpenUprnClient.py | 31 ++++++- etl/spatial/README.md | 48 +++++++++++ etl/spatial/app.py | 54 +------------ 5 files changed, 78 insertions(+), 136 deletions(-) delete mode 100644 .github/workflows/deploy_sap_model_lambda.yml create mode 100644 etl/properties/requirements.txt create mode 100644 etl/spatial/README.md diff --git a/.github/workflows/deploy_sap_model_lambda.yml b/.github/workflows/deploy_sap_model_lambda.yml deleted file mode 100644 index fb4b8dde..00000000 --- a/.github/workflows/deploy_sap_model_lambda.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: Sap Model Deploy - -on: - push: - branches: [ dev, prod ] - -jobs: - deploy: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.10.12 - - - name: Install Serverless and plugins - run: | - npm install -g serverless - npm install -g serverless-domain-manager - - - name: AWS credentials for dev - if: github.ref == 'refs/heads/dev' - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - - - name: AWS credentials for prod - if: github.ref == 'refs/heads/prod' - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - - - name: Set domain name - id: set_domain - run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}" - - - name: Set ECR credentials - id: 
set_ecr_credentials - run: | - echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}" - - - name: Setup Docker - uses: docker/setup-buildx-action@v1 - - - name: Login to ECR - run: | - aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }} - - # Building and pushing Docker image with caching - - name: Build and push Docker image - uses: docker/build-push-action@v3 - with: - context: ./model_data/simulation_system - file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda - push: true - tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }} - cache-from: type=gha - cache-to: type=gha,mode=max - platform: linux/amd64 - provenance: false - - - name: Deploy to AWS Lambda via Serverless - env: - RUNTIME_ENVIRONMENT: ${{ github.ref_name }} - MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}' - PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}' - DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}' - DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }} - ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }} - GITHUB_SHA: ${{ github.sha }} - run: | - # Deploy to AWS Lambda via Serverless - sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose diff --git a/etl/properties/requirements.txt b/etl/properties/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/etl/spatial/OpenUprnClient.py b/etl/spatial/OpenUprnClient.py index 90d78e8c..7392c4ac 100644 --- a/etl/spatial/OpenUprnClient.py +++ b/etl/spatial/OpenUprnClient.py @@ -1,8 +1,9 @@ +import os from tqdm import tqdm import pandas as pd import geopandas as gpd from utils.logger import setup_logger -from utils.s3 import read_io_from_s3 +from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet logger = setup_logger() @@ -71,21 +72,21 @@ class 
OpenUprnClient: for partition, group in tqdm(self.data.groupby('partition')): min_uprn = group['UPRN'].min() max_uprn = group['UPRN'].max() - self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv" + self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet" self.data['filename'] = self.data['partition'].map(self.filenames) @staticmethod def find_filename_for_uprn(uprn, filenames): for filename in filenames: - min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_")) + min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_")) if min_uprn <= uprn <= max_uprn: return filename return None @staticmethod def convert_bng_data_to_gpd(df): - + gpd_data = gpd.GeoDataFrame( df, geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE), @@ -93,3 +94,25 @@ class OpenUprnClient: ) return gpd_data + + def save_filenames_to_s3(self, bucket_name): + """ + Save the filenames to s3 + :param bucket_name: + :return: + """ + file_key = os.path.join("spatial", "filename_meta.parquet") + + filenames = pd.DataFrame({"filenames": list(self.filenames.values())}) + filenames[['lower', 'upper']] = filenames['filenames'].str.replace('.parquet', '').str.extract( + '(\d+)_(\d+)' + ) + filenames['lower'] = filenames['lower'].astype(int) + filenames['upper'] = filenames['upper'].astype(int) + + logger.info("Saving filenames to s3 at {}".format(file_key)) + save_dataframe_to_s3_parquet( + df=filenames, + file_key=file_key, + bucket_name=bucket_name + ) diff --git a/etl/spatial/README.md b/etl/spatial/README.md new file mode 100644 index 00000000..ab68fcd5 --- /dev/null +++ b/etl/spatial/README.md @@ -0,0 +1,48 @@ +# Spatial - Geospatial Data Processing Service + +## Overview + +The Spatial service is designed to read, process, and analyze geospatial data related to +conservation areas and special buildings. 
It uses datasets from Historic England and the +UK government to determine whether a given UPRN (Unique Property Reference Number) is within +a conservation area or is a listed building. The processed data is saved back to an S3 bucket +in a parquet format for easy retrieval and further analysis. + +## Dependencies + +Dependencies are listed in requirements.txt. To install them, run: + +``` +pip install -r requirements.txt +``` + +## Data Sources + +1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas. +2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas. +3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data. +4. **Historic England Listed Buildings**: Shapefile with information on listed buildings. +5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk. + +## Files + +- app.py: Main application file that orchestrates the data processing flow. +- ConservationAreaClient.py: Handles reading and processing of conservation area data. +- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data. +- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings. +- requirements.txt: Lists all Python package dependencies. + +## How to Run + +1. Make sure you have all the required packages installed. +2. Update the S3 bucket and file path constants in app.py. +3. Run app.py. + +## Workflow + +1. Read the datasets for conservation areas and special buildings. +2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN. +3. For each partition: + - Convert UPRN data to geopandas DataFrame. + - Check if each UPRN is within a conservation area or is a special building. + - Save the processed data back to S3 in parquet format. 
\ No newline at end of file diff --git a/etl/spatial/app.py b/etl/spatial/app.py index 39fe1434..d58509dd 100644 --- a/etl/spatial/app.py +++ b/etl/spatial/app.py @@ -9,12 +9,11 @@ import pandas as pd from etl.spatial.ConservationAreaClient import ConservationAreaClient from etl.spatial.OpenUprnClient import OpenUprnClient from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient -from datatypes.datatypes import OpenUprnCoordinateData from utils.logger import setup_logger from utils.s3 import save_dataframe_to_s3_parquet BUCKET = "retrofit-datalake-dev" -OUTPUT_BUCKET = "retrofit-dev-dev" +OUTPUT_BUCKET = "retrofit-data-dev" HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp" GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson" OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv" @@ -27,13 +26,6 @@ logger = setup_logger() def app(): - # TODO: Store the input data in S3 [x] - # Read the input data from S3 [x] - # Document the data source and where to find it [x] - # Incorportate listed buildings [x] - # Incorporate heritage buildings [x] - # Write the outputs to S3 [ ] - """ This application uses the conservation area datasets to determine if a UPRN is in a conservation area or now @@ -79,30 +71,16 @@ def app(): ) special_buildings_client.read() - # Local version - OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \ - "/osopenuprn_202306_csv/osopenuprn_202305.csv" open_uprn_client = OpenUprnClient( path=OPEN_UPRN_PATHNAME, bucket=BUCKET ) open_uprn_client.read() - open_uprn_client.read_local() # We want to sort the data and split it into filenames on UPRN. 
# We'll split the data into chunks of 50,000 open_uprn_client.create_file_partitions() - # special_buildings_client = SpecialBuildingsClient( - # historic_england_listed_buildings_path=None, - # historic_england_heritage_buildings_path=None, - # bucket=None - # ) - # special_buildings_client.historic_england_listed_buildings = \ - # special_buildings_client2.historic_england_listed_buildings - # special_buildings_client.historic_england_heritage_buildings = \ - # special_buildings_client2.historic_england_heritage_buildings - logger.info("Extracting spatial data for uprn partitions") to_loop_over = open_uprn_client.data.groupby("filename") @@ -121,31 +99,5 @@ def app(): df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET ) - # We need to iterate through the open uprn data and check if the coordinates are in a conservation area - open_uprn_data = [ - {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407, - 'LONGITUDE': -0.0540506}, - {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492, - 'LONGITUDE': -0.0498772}, - {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579, - 'LONGITUDE': -0.226392}, - {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, - 'LONGITUDE': -0.0792445}, - {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, - 'LONGITUDE': -0.0792445}, - {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385, - 'LONGITUDE': -0.0468833}, - {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908, - 'LONGITUDE': -0.1362513}, - {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309, - 'LONGITUDE': -0.0823165} - ] - - result = [ - { - "uprn": coordinates["UPRN"], - "is_in_conservation_area": 
conservation_area_client.is_in_conservation_area( - OpenUprnCoordinateData(**coordinates)) - } for coordinates in - open_uprn_data - ] + # Finally, save the partition filenames to S3 + open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)