added spatial readme

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-05 14:27:00 +01:00
parent c267496353
commit f94cbd4385
5 changed files with 78 additions and 136 deletions

View file

@ -1,81 +0,0 @@
name: Sap Model Deploy
on:
push:
branches: [ dev, prod ]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.10.12
- name: Install Serverless and plugins
run: |
npm install -g serverless
npm install -g serverless-domain-manager
- name: AWS credentials for dev
if: github.ref == 'refs/heads/dev'
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-2
- name: AWS credentials for prod
if: github.ref == 'refs/heads/prod'
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-2
- name: Set domain name
  id: set_domain
  # Publishes the branch-specific domain secret ("<branch>_DOMAIN_NAME") as the
  # step output `domain`. The value is appended to $GITHUB_OUTPUT because the
  # `::set-output` workflow command is deprecated.
  run: echo "domain=${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}" >> "$GITHUB_OUTPUT"
- name: Set ECR credentials
  id: set_ecr_credentials
  # Publishes the branch-specific ECR repository URI secret
  # ("<branch>_SAP_MODEL_ECR_URI") as the step output `ecr_uri`, which later steps
  # read for the Docker login, the image tag, and the Serverless deploy.
  # Written via $GITHUB_OUTPUT because `::set-output` is deprecated.
  run: |
    echo "ecr_uri=${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}" >> "$GITHUB_OUTPUT"
- name: Setup Docker
uses: docker/setup-buildx-action@v1
- name: Login to ECR
run: |
aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
# Building and pushing Docker image with caching
- name: Build and push Docker image
  uses: docker/build-push-action@v3
  with:
    context: ./model_data/simulation_system
    file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
    push: true
    # Tag the image with the commit SHA so each deploy references an exact build.
    tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
    # Reuse layers between runs through the GitHub Actions cache backend.
    cache-from: type=gha
    cache-to: type=gha,mode=max
    # The action's input is `platforms` (plural); `platform` is not a recognised
    # input, so the target platform was not being applied.
    platforms: linux/amd64
    provenance: false
- name: Deploy to AWS Lambda via Serverless
env:
RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
GITHUB_SHA: ${{ github.sha }}
run: |
# Deploy to AWS Lambda via Serverless
sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose

View file

View file

@ -1,8 +1,9 @@
import os
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
logger = setup_logger()
@ -71,21 +72,21 @@ class OpenUprnClient:
for partition, group in tqdm(self.data.groupby('partition')):
min_uprn = group['UPRN'].min()
max_uprn = group['UPRN'].max()
self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"
self.data['filename'] = self.data['partition'].map(self.filenames)
@staticmethod
def find_filename_for_uprn(uprn, filenames):
for filename in filenames:
min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
if min_uprn <= uprn <= max_uprn:
return filename
return None
@staticmethod
def convert_bng_data_to_gpd(df):
gpd_data = gpd.GeoDataFrame(
df,
geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
@ -93,3 +94,25 @@ class OpenUprnClient:
)
return gpd_data
def save_filenames_to_s3(self, bucket_name):
    """
    Save the partition filename index to S3 as a parquet file.

    Builds a table with one row per value in ``self.filenames``: the filename
    itself plus the ``lower`` and ``upper`` integer bounds parsed from its
    ``<lower>_<upper>.parquet`` name. The table is written to
    ``spatial/filename_meta.parquet`` in the given bucket.

    :param bucket_name: Name of the S3 bucket the metadata file is written to.
    :return: None
    """
    # S3 keys always use "/" as the delimiter, so the key is spelled out rather
    # than joined with the OS-specific path separator.
    file_key = "spatial/filename_meta.parquet"

    filenames = pd.DataFrame({"filenames": list(self.filenames.values())})

    # The raw-string pattern captures both bounds directly from the filename.
    # Stripping the ".parquet" suffix first is unnecessary, and doing it with
    # Series.str.replace would depend on the pandas-version-specific `regex` default.
    filenames[['lower', 'upper']] = filenames['filenames'].str.extract(r'(\d+)_(\d+)')
    filenames['lower'] = filenames['lower'].astype(int)
    filenames['upper'] = filenames['upper'].astype(int)

    logger.info("Saving filenames to s3 at {}".format(file_key))
    save_dataframe_to_s3_parquet(
        df=filenames,
        file_key=file_key,
        bucket_name=bucket_name,
    )

48
etl/spatial/README.md Normal file
View file

@ -0,0 +1,48 @@
# Spatial - Geospatial Data Processing Service
## Overview
The Spatial service is designed to read, process, and analyze geospatial data related to
conservation areas and special buildings. It uses datasets from Historic England and the
UK government to determine whether a given UPRN (Unique Property Reference Number) is within
a conservation area or is a special building (listed, or heritage at risk). The processed data is saved back to an S3 bucket
in Parquet format for easy retrieval and further analysis.
## Dependencies
Dependencies are listed in requirements.txt. To install them, run:
```
pip install -r requirements.txt
```
## Data Sources
1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
## Files
- app.py: Main application file that orchestrates the data processing flow.
- ConservationAreaClient.py: Handles reading and processing of conservation area data.
- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
- requirements.txt: Lists all Python package dependencies.
## How to Run
1. Make sure you have all the required packages installed.
2. Update the S3 bucket and file path constants in app.py.
3. Run app.py.
## Workflow
1. Read the datasets for conservation areas and special buildings.
2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
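3. For each partition:
   - Convert the UPRN data to a geopandas GeoDataFrame.
   - Check if each UPRN is within a conservation area or is a special building.
   - Save the processed data back to S3 in parquet format.
4. Save the partition filename metadata (`spatial/filename_meta.parquet`, with the lower and upper UPRN of each file) to S3.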

View file

@ -9,12 +9,11 @@ import pandas as pd
from etl.spatial.ConservationAreaClient import ConservationAreaClient
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
from datatypes.datatypes import OpenUprnCoordinateData
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-dev-dev"
OUTPUT_BUCKET = "retrofit-data-dev"
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
@ -27,13 +26,6 @@ logger = setup_logger()
def app():
# TODO: Store the input data in S3 [x]
# Read the input data from S3 [x]
# Document the data source and where to find it [x]
# Incorportate listed buildings [x]
# Incorporate heritage buildings [x]
# Write the outputs to S3 [ ]
"""
This application uses the conservation area datasets to determine if a UPRN is
in a conservation area or not
@ -79,30 +71,16 @@ def app():
)
special_buildings_client.read()
# Local version
OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \
"/osopenuprn_202306_csv/osopenuprn_202305.csv"
open_uprn_client = OpenUprnClient(
path=OPEN_UPRN_PATHNAME,
bucket=BUCKET
)
open_uprn_client.read()
open_uprn_client.read_local()
# We want to sort the data and split it into filenames on UPRN.
# We'll split the data into chunks of 50,000
open_uprn_client.create_file_partitions()
# special_buildings_client = SpecialBuildingsClient(
# historic_england_listed_buildings_path=None,
# historic_england_heritage_buildings_path=None,
# bucket=None
# )
# special_buildings_client.historic_england_listed_buildings = \
# special_buildings_client2.historic_england_listed_buildings
# special_buildings_client.historic_england_heritage_buildings = \
# special_buildings_client2.historic_england_heritage_buildings
logger.info("Extracting spatial data for uprn partitions")
to_loop_over = open_uprn_client.data.groupby("filename")
@ -121,31 +99,5 @@ def app():
df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
)
# We need to iterate through the open uprn data and check if the coordinates are in a conservation area
open_uprn_data = [
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
'LONGITUDE': -0.0540506},
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
'LONGITUDE': -0.0498772},
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
'LONGITUDE': -0.226392},
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
'LONGITUDE': -0.0468833},
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
'LONGITUDE': -0.1362513},
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
'LONGITUDE': -0.0823165}
]
result = [
{
"uprn": coordinates["UPRN"],
"is_in_conservation_area": conservation_area_client.is_in_conservation_area(
OpenUprnCoordinateData(**coordinates))
} for coordinates in
open_uprn_data
]
# Finally, we save the filenames to s3
open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)