Model/etl/spatial/app.py

151 lines
6.7 KiB
Python

"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
import os
from tqdm import tqdm
import pandas as pd
from etl.spatial.ConservationAreaClient import ConservationAreaClient
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
from datatypes.datatypes import OpenUprnCoordinateData
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
# S3 buckets: BUCKET is handed to the dataset clients, OUTPUT_BUCKET to the parquet writer.
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-dev-dev"

# Conservation area polygons (Historic England shapefile and gov.uk geojson).
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = (
    "spatial/Historic_England_Conservation_Areas"
    "/Conservation_Areas.shp"
)
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"

# OS Open UPRN coordinate extract.
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"

# Historic England listed-building and heritage-at-risk polygons.
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = (
    "spatial/National_Heritage_List_for_England_(NHLE)"
    "/Listed_Building_polygons.shp"
)
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = (
    "spatial/Historic_England_Heritage_at_Risk_Register_2022"
    "/Historic_England_Heritage_at_Risk_Register_2022.shp"
)
# Module-level logger; app() uses it to report progress of the partition loop.
logger = setup_logger()
def app():
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not, and flags listed and heritage-at-risk buildings.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area
    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN
    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files one UPRN partition at a time and
    hands each processed partition to ``save_dataframe_to_s3_parquet`` under the
    ``spatial/`` key prefix of ``OUTPUT_BUCKET``.

    Takes no arguments and returns ``None``.
    """
    # TODO: Store the input data in S3 [x]
    # Read the input data from S3 [x]
    # Document the data source and where to find it [x]
    # Incorporate listed buildings [x]
    # Incorporate heritage buildings [x]
    # Write the outputs to S3 [ ]

    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET,
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET,
    )
    special_buildings_client.read()

    # Local version.
    # NOTE(review): this developer-machine path is what the client actually receives;
    # the module-level OPEN_UPRN_PATHNAME is not used here. It is held in its own local
    # name so it no longer shadows that constant. Confirm whether the S3 pathname
    # should be used instead before running this outside a local checkout.
    local_open_uprn_pathname = (
        "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data"
        "/osopenuprn_202306_csv/osopenuprn_202305.csv"
    )
    open_uprn_client = OpenUprnClient(
        path=local_open_uprn_pathname,
        bucket=BUCKET,
    )
    # NOTE(review): both loaders are invoked back to back, as in the original;
    # presumably only one of them is needed for a given path - confirm against OpenUprnClient.
    open_uprn_client.read()
    open_uprn_client.read_local()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Build the grouping once and reuse it for both the progress-bar total and the iteration.
    partitions = open_uprn_client.data.groupby("filename")
    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)
        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
         'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
         'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
         'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
         'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
         'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165},
    ]
    # Per-record check on the hand-picked sample above.
    # NOTE(review): nothing in the visible code consumes `result` - confirm whether it
    # should be returned, logged, or dropped.
    result = [
        {
            "uprn": coordinates["UPRN"],
            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
                OpenUprnCoordinateData(**coordinates)
            ),
        }
        for coordinates in open_uprn_data
    ]