mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
151 lines
6.7 KiB
Python
151 lines
6.7 KiB
Python
"""
|
|
This application reads in the open uprn data from a static location and loads it into
|
|
our database for querying from other services
|
|
"""
|
|
|
|
import os
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
|
from etl.spatial.OpenUprnClient import OpenUprnClient
|
|
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
|
from datatypes.datatypes import OpenUprnCoordinateData
|
|
from utils.logger import setup_logger
|
|
from utils.s3 import save_dataframe_to_s3_parquet
|
|
|
|
BUCKET = "retrofit-datalake-dev"
|
|
OUTPUT_BUCKET = "retrofit-dev-dev"
|
|
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
|
|
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
|
|
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
|
|
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
|
|
"NHLE)/Listed_Building_polygons.shp"
|
|
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
|
|
"spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
|
|
|
|
# Module-level logger obtained from the project's setup_logger helper.
logger = setup_logger()
|
|
|
|
|
|
def app():
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3

    Processing steps, in order:
      1. Read the conservation area and special buildings datasets.
      2. Read the Open UPRN data and split it into partitions keyed by the
         "filename" column.
      3. For each partition, run the conservation area, listed building and
         heritage-at-risk checks, drop the geometry column and save the
         result as parquet under "spatial/<filename>" in OUTPUT_BUCKET.
      4. Run a spot check of a few hard-coded UPRNs against the conservation
         area data and log the outcome.

    Returns:
        None.
    """
    # TODO: Store the input data in S3 [x]
    #  Read the input data from S3 [x]
    #  Document the data source and where to find it [x]
    #  Incorporate listed buildings [x]
    #  Incorporate heritage buildings [x]
    #  Write the outputs to S3 [ ]

    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET,
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET,
    )
    special_buildings_client.read()

    # Local version
    # NOTE(review): this developer-machine path is what the client receives; the
    # module-level OPEN_UPRN_PATHNAME is not used here. It is held in its own
    # local name so it no longer shadows that constant. Confirm whether both
    # read() and read_local() are meant to run.
    local_open_uprn_path = (
        "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data"
        "/osopenuprn_202306_csv/osopenuprn_202305.csv"
    )
    open_uprn_client = OpenUprnClient(
        path=local_open_uprn_path,
        bucket=BUCKET,
    )
    open_uprn_client.read()
    open_uprn_client.read_local()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Group once and reuse the same object for both the progress-bar total and
    # the iteration, instead of grouping the full frame twice.
    partitions = open_uprn_client.data.groupby("filename")

    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        # NOTE(review): os.path.join uses the host OS separator; S3 keys expect
        # "/", so this assumes the job runs on a POSIX host.
        save_dataframe_to_s3_parquet(
            df=uprn_gdf,
            file_key=os.path.join("spatial", filename),
            bucket_name=OUTPUT_BUCKET,
        )

    # Spot check: look up a handful of known UPRNs one at a time with the
    # non-vectorised conservation area check.
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
         'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
         'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
         'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
         'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
         'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165},
    ]

    result = [
        {
            "uprn": coordinates["UPRN"],
            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
                OpenUprnCoordinateData(**coordinates)
            ),
        }
        for coordinates in open_uprn_data
    ]
    # The results used to be built and then discarded; log them so the spot
    # check is visible in the run output.
    logger.info("Conservation area spot check results: %s", result)
|