""" This application reads in the open uprn data from a static location and loads it into our database for querying from other services """ import os from tqdm import tqdm import pandas as pd import geopandas as gpd from etl.spatial.ConservationAreaClient import ConservationAreaClient from etl.spatial.OpenUprnClient import OpenUprnClient from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient from utils.logger import setup_logger from utils.s3 import save_dataframe_to_s3_parquet BUCKET = "retrofit-datalake-dev" OUTPUT_BUCKET = "retrofit-data-dev" HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp" GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson" OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv" HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \ "NHLE)/Listed_Building_polygons.shp" HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \ "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp" logger = setup_logger() def convert_bng_data_to_gpd(df): gpd_data = gpd.GeoDataFrame( df, geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE), crs="EPSG:27700" # British National Grid ) return gpd_data def app(): """ This application uses the conservation area datasets to determine if a UPRN is in a conservation area or now We use two sources of data for determining if homes are in conservation areas. The first is the Historic England dataset, which is a shapefile containing polygons of conservation areas. The second is the gov.uk dataset, which is a geojson file containing polygons of conservation areas. The Historic England dataset can be found here: https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e The listed building dataset is also found at Historic England at: https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e The hertitige buildings dataset is also found at Historic England at: https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e The Gov.uk dataset can be found here: https://www.planning.data.gov.uk/dataset/conservation-area The open UPRN data can be found here: https://osdatahub.os.uk/downloads/open/OpenUPRN The Office for National Statistics Postcode Lookup can be found here: https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about For the moment, these data sources are downloaded manually and uploaded to S3. This application then processes those files and writes the results to s3 """ conservation_area_client = ConservationAreaClient( historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME, gov_path=GOV_CONSERVARION_AREAS_PATHNAME, bucket=BUCKET ) conservation_area_client.read() special_buildings_client = SpecialBuildingsClient( historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME, historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME, bucket=BUCKET ) special_buildings_client.read() open_uprn_client = OpenUprnClient( path=OPEN_UPRN_PATHNAME, bucket=BUCKET ) open_uprn_client.read() # We want to sort the data and split it into filenames on UPRN. # We'll split the data into chunks of 50,000 open_uprn_client.create_file_partitions() logger.info("Extracting spatial data for uprn partitions") to_loop_over = open_uprn_client.data.groupby("filename") for filename, uprn_df in tqdm(open_uprn_client.data.groupby("filename"), total=len(to_loop_over)): uprn_gdf = convert_bng_data_to_gpd(uprn_df) uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf) uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf) uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf) # Convert back to a regular dataframe uprn_gdf = uprn_gdf.drop(columns=["geometry"]) uprn_gdf = pd.DataFrame(uprn_gdf) save_dataframe_to_s3_parquet( df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET ) # We finally save the filesnames to s3 open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)