mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
114 lines
4.5 KiB
Python
114 lines
4.5 KiB
Python
"""
|
|
This application reads in the Open UPRN data from a static location and loads it into
our database for querying from other services.
|
|
"""
|
|
|
|
import os
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
import geopandas as gpd
|
|
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
|
from etl.spatial.OpenUprnClient import OpenUprnClient
|
|
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
|
from utils.logger import setup_logger
|
|
from utils.s3 import save_dataframe_to_s3_parquet
|
|
|
|
# S3 bucket the raw spatial inputs are read from, and the bucket results are written to.
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-data-dev"

# Conservation-area polygons (Historic England shapefile and gov.uk GeoJSON).
# The "CONSERVARION" spelling is kept as-is so existing references keep working.
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = (
    "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
)
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"

# Open UPRN CSV (one row per UPRN with its coordinates).
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"

# Historic England listed-building and heritage-at-risk shapefiles.
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = (
    "spatial/National_Heritage_List_for_England_("
    "NHLE)/Listed_Building_polygons.shp"
)
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = (
    "spatial/Historic_England_Heritage_at_Risk_Register_2022/"
    "Historic_England_Heritage_at_Risk_Register_2022.shp"
)
|
|
|
|
# Module-level logger; app() uses it to report progress.
logger = setup_logger()
|
|
|
|
|
|
def convert_bng_data_to_gpd(df):
    """
    Wrap a dataframe of UPRN coordinates in a GeoDataFrame.

    Point geometries are built from the ``X_COORDINATE`` and ``Y_COORDINATE``
    columns of ``df`` and tagged with the British National Grid CRS.

    :param df: dataframe exposing ``X_COORDINATE`` and ``Y_COORDINATE`` columns
    :return: GeoDataFrame holding the columns of ``df`` plus a point ``geometry``
    """
    points = gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE)

    # EPSG:27700 is the British National Grid
    return gpd.GeoDataFrame(df, geometry=points, crs="EPSG:27700")
|
|
|
|
|
|
def app():
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not. It also flags listed buildings and heritage
    buildings at risk, and writes the enriched UPRN data to S3 as parquet files.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3

    Steps:
      1. Read the conservation-area, special-buildings and Open UPRN datasets
         from ``BUCKET``.
      2. Partition the UPRN data by its ``filename`` column.
      3. For each partition, run the three spatial checks, drop the geometry
         column and save the result under ``spatial/<filename>`` in
         ``OUTPUT_BUCKET``.
      4. Save the list of partition filenames to ``OUTPUT_BUCKET``.

    :return: None
    """

    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")

    # Group once and reuse the same grouped object for both the progress-bar
    # total and the iteration, rather than running the groupby twice.
    uprn_partitions = open_uprn_client.data.groupby("filename")

    for filename, uprn_df in tqdm(uprn_partitions, total=len(uprn_partitions)):
        uprn_gdf = convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        # NOTE(review): os.path.join uses the host path separator, so the S3 key
        # is only "spatial/<filename>" when this runs on a POSIX host.
        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
|