mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added spatial readme
This commit is contained in:
parent
c267496353
commit
f94cbd4385
5 changed files with 78 additions and 136 deletions
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
|
|
@ -1,81 +0,0 @@
|
|||
name: Sap Model Deploy
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ dev, prod ]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.10.12
|
||||
|
||||
- name: Install Serverless and plugins
|
||||
run: |
|
||||
npm install -g serverless
|
||||
npm install -g serverless-domain-manager
|
||||
|
||||
- name: AWS credentials for dev
|
||||
if: github.ref == 'refs/heads/dev'
|
||||
uses: aws-actions/configure-aws-credentials@v1
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: eu-west-2
|
||||
|
||||
- name: AWS credentials for prod
|
||||
if: github.ref == 'refs/heads/prod'
|
||||
uses: aws-actions/configure-aws-credentials@v1
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: eu-west-2
|
||||
|
||||
- name: Set domain name
|
||||
id: set_domain
|
||||
run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
|
||||
|
||||
- name: Set ECR credentials
|
||||
id: set_ecr_credentials
|
||||
run: |
|
||||
echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
|
||||
|
||||
- name: Setup Docker
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Login to ECR
|
||||
run: |
|
||||
aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
||||
|
||||
# Building and pushing Docker image with caching
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
context: ./model_data/simulation_system
|
||||
file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
|
||||
push: true
|
||||
tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
platform: linux/amd64
|
||||
provenance: false
|
||||
|
||||
- name: Deploy to AWS Lambda via Serverless
|
||||
env:
|
||||
RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
|
||||
MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
|
||||
PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
|
||||
DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
|
||||
DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
|
||||
ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
||||
GITHUB_SHA: ${{ github.sha }}
|
||||
run: |
|
||||
# Deploy to AWS Lambda via Serverless
|
||||
sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
|
||||
0
etl/properties/requirements.txt
Normal file
0
etl/properties/requirements.txt
Normal file
|
|
@ -1,8 +1,9 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_io_from_s3
|
||||
from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
|
@ -71,21 +72,21 @@ class OpenUprnClient:
|
|||
for partition, group in tqdm(self.data.groupby('partition')):
|
||||
min_uprn = group['UPRN'].min()
|
||||
max_uprn = group['UPRN'].max()
|
||||
self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
|
||||
self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"
|
||||
|
||||
self.data['filename'] = self.data['partition'].map(self.filenames)
|
||||
|
||||
@staticmethod
|
||||
def find_filename_for_uprn(uprn, filenames):
    """
    Find the partition file whose UPRN range contains the given UPRN.

    Each filename is expected to have the form "<min_uprn>_<max_uprn>.<ext>"
    (for example "1_50000.parquet"). Both bounds are inclusive.

    :param uprn: The UPRN to look up
    :param filenames: Iterable of partition filenames to search
    :return: The first filename whose range contains the UPRN, or None if no range matches
    """
    for filename in filenames:
        # Drop the extension (whatever it is) so that only the "<min>_<max>" stem is parsed;
        # this handles ".parquet" files as well as any older ".csv" ones
        stem = filename.rsplit(".", 1)[0]
        min_uprn, max_uprn = map(int, stem.split("_"))
        if min_uprn <= uprn <= max_uprn:
            return filename
    return None
|
||||
|
||||
@staticmethod
|
||||
def convert_bng_data_to_gpd(df):
|
||||
|
||||
|
||||
gpd_data = gpd.GeoDataFrame(
|
||||
df,
|
||||
geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
|
||||
|
|
@ -93,3 +94,25 @@ class OpenUprnClient:
|
|||
)
|
||||
|
||||
return gpd_data
|
||||
|
||||
def save_filenames_to_s3(self, bucket_name):
    """
    Save the partition filenames, with their UPRN bounds, to S3 as a parquet file.

    Builds a table with one row per entry in ``self.filenames`` and three columns:
    ``filenames`` (the filename itself) plus integer ``lower`` and ``upper`` bounds
    parsed from the "<lower>_<upper>" part of the filename.

    :param bucket_name: Name of the S3 bucket to write the table to
    :return: None
    """
    file_key = os.path.join("spatial", "filename_meta.parquet")

    filenames = pd.DataFrame({"filenames": list(self.filenames.values())})

    # Extract the two bounds directly with a raw-string pattern. Stripping the
    # extension first is unnecessary (the pattern only captures the digits) and
    # Series.str.replace treats '.' differently depending on the pandas version.
    bounds = filenames["filenames"].str.extract(r"(\d+)_(\d+)")
    filenames["lower"] = bounds[0].astype(int)
    filenames["upper"] = bounds[1].astype(int)

    logger.info("Saving filenames to s3 at %s", file_key)
    save_dataframe_to_s3_parquet(
        df=filenames,
        file_key=file_key,
        bucket_name=bucket_name
    )
|
||||
|
|
|
|||
48
etl/spatial/README.md
Normal file
48
etl/spatial/README.md
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# Spatial - Geospatial Data Processing Service
|
||||
|
||||
## Overview
|
||||
|
||||
The Spatial service is designed to read, process, and analyze geospatial data related to
|
||||
conservation areas and special buildings. It uses datasets from Historic England and the
|
||||
UK government to determine whether a given UPRN (Unique Property Reference Number) is within
|
||||
a conservation area or is a listed building. The processed data is saved back to an S3 bucket
|
||||
in a parquet format for easy retrieval and further analysis.
|
||||
|
||||
## Dependencies
|
||||
|
||||
Dependencies are listed in requirements.txt. To install them, run:
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Data Sources
|
||||
|
||||
1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
|
||||
2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
|
||||
3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
|
||||
4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
|
||||
5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
|
||||
|
||||
## Files
|
||||
|
||||
- app.py: Main application file that orchestrates the data processing flow.
|
||||
- ConservationAreaClient.py: Handles reading and processing of conservation area data.
|
||||
- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
|
||||
- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
|
||||
- requirements.txt: Lists all Python package dependencies.
|
||||
|
||||
## How to Run
|
||||
|
||||
1. Make sure you have all the required packages installed.
|
||||
2. Update the S3 bucket and file path constants in app.py.
|
||||
3. Run app.py.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read the datasets for conservation areas and special buildings.
|
||||
2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
|
||||
3. For each partition:
|
||||
- Convert UPRN data to geopandas DataFrame.
|
||||
- Check if each UPRN is within a conservation area or is a special building.
|
||||
- Save the processed data back to S3 in parquet format.
|
||||
|
|
@ -9,12 +9,11 @@ import pandas as pd
|
|||
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
||||
from etl.spatial.OpenUprnClient import OpenUprnClient
|
||||
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
|
||||
BUCKET = "retrofit-datalake-dev"
|
||||
OUTPUT_BUCKET = "retrofit-dev-dev"
|
||||
OUTPUT_BUCKET = "retrofit-data-dev"
|
||||
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
|
||||
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
|
||||
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
|
||||
|
|
@ -27,13 +26,6 @@ logger = setup_logger()
|
|||
|
||||
|
||||
def app():
|
||||
# TODO: Store the input data in S3 [x]
|
||||
# Read the input data from S3 [x]
|
||||
# Document the data source and where to find it [x]
|
||||
# Incorporate listed buildings [x]
|
||||
# Incorporate heritage buildings [x]
|
||||
# Write the outputs to S3 [ ]
|
||||
|
||||
"""
|
||||
This application uses the conservation area datasets to determine if a UPRN is
|
||||
in a conservation area or not
|
||||
|
|
@ -79,30 +71,16 @@ def app():
|
|||
)
|
||||
special_buildings_client.read()
|
||||
|
||||
# Local version
|
||||
OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \
|
||||
"/osopenuprn_202306_csv/osopenuprn_202305.csv"
|
||||
open_uprn_client = OpenUprnClient(
|
||||
path=OPEN_UPRN_PATHNAME,
|
||||
bucket=BUCKET
|
||||
)
|
||||
open_uprn_client.read()
|
||||
open_uprn_client.read_local()
|
||||
|
||||
# We want to sort the data and split it into filenames on UPRN.
|
||||
# We'll split the data into chunks of 50,000
|
||||
open_uprn_client.create_file_partitions()
|
||||
|
||||
# special_buildings_client = SpecialBuildingsClient(
|
||||
# historic_england_listed_buildings_path=None,
|
||||
# historic_england_heritage_buildings_path=None,
|
||||
# bucket=None
|
||||
# )
|
||||
# special_buildings_client.historic_england_listed_buildings = \
|
||||
# special_buildings_client2.historic_england_listed_buildings
|
||||
# special_buildings_client.historic_england_heritage_buildings = \
|
||||
# special_buildings_client2.historic_england_heritage_buildings
|
||||
|
||||
logger.info("Extracting spatial data for uprn partitions")
|
||||
to_loop_over = open_uprn_client.data.groupby("filename")
|
||||
|
||||
|
|
@ -121,31 +99,5 @@ def app():
|
|||
df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
|
||||
)
|
||||
|
||||
# We need to iterate through the open uprn data and check if the coordinates are in a conservation area
|
||||
open_uprn_data = [
|
||||
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
|
||||
'LONGITUDE': -0.0540506},
|
||||
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
|
||||
'LONGITUDE': -0.0498772},
|
||||
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
|
||||
'LONGITUDE': -0.226392},
|
||||
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
|
||||
'LONGITUDE': -0.0468833},
|
||||
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
|
||||
'LONGITUDE': -0.1362513},
|
||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
||||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
result = [
|
||||
{
|
||||
"uprn": coordinates["UPRN"],
|
||||
"is_in_conservation_area": conservation_area_client.is_in_conservation_area(
|
||||
OpenUprnCoordinateData(**coordinates))
|
||||
} for coordinates in
|
||||
open_uprn_data
|
||||
]
|
||||
# Finally, we save the filenames to S3
|
||||
open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue