Model/etl/spatial/app.py
Khalim Conn-Kowlessar f94cbd4385 added spatial readme
2023-10-05 14:27:00 +01:00

103 lines
4.3 KiB
Python

"""
This application reads the Open UPRN data from a static location and loads it into
our database so that it can be queried by other services.
"""
import os
import posixpath

import pandas as pd
from tqdm import tqdm

from etl.spatial.ConservationAreaClient import ConservationAreaClient
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
# Bucket handed to the clients that read the source datasets.
BUCKET = "retrofit-datalake-dev"
# Bucket the processed partitions and the filename index are written to.
OUTPUT_BUCKET = "retrofit-data-dev"

# Conservation area polygons (Historic England shapefile and gov.uk geojson).
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = (
    "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
)
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"

# Open UPRN extract.
# NOTE(review): the folder is stamped 202309 but the csv 202308 - confirm this is intended.
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"

# Listed building and heritage-at-risk polygons from Historic England.
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = (
    "spatial/National_Heritage_List_for_England_(NHLE)/Listed_Building_polygons.shp"
)
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = (
    "spatial/Historic_England_Heritage_at_Risk_Register_2022/"
    "Historic_England_Heritage_at_Risk_Register_2022.shp"
)

logger = setup_logger()
def app():
    """
    Enrich every Open UPRN partition with spatial flags and write it to S3.

    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to S3:
    one parquet file per UPRN partition under the ``spatial/`` prefix of
    ``OUTPUT_BUCKET``, followed by the list of partition filenames.

    Takes no arguments and returns nothing; all inputs come from the module-level
    bucket and pathname constants.
    """
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET,
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET,
    )
    special_buildings_client.read()

    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET,
    )
    open_uprn_client.read()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Group once and reuse the same object for the progress-bar total and the
    # iteration, rather than running the groupby twice.
    uprn_partitions = open_uprn_client.data.groupby("filename")
    for filename, uprn_df in tqdm(uprn_partitions, total=len(uprn_partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)
        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        # S3 object keys always use "/" as the separator, so build the key with
        # posixpath instead of the platform-dependent os.path.
        save_dataframe_to_s3_parquet(
            df=uprn_gdf,
            file_key=posixpath.join("spatial", filename),
            bucket_name=OUTPUT_BUCKET,
        )

    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)