vectorised conservation area, heritage building and listed building check

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-02 22:54:15 +01:00
parent 64b6b67499
commit c267496353
11 changed files with 410 additions and 161 deletions

View file

@ -1,86 +0,0 @@
"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
from datatypes.datatypes import OpenUprnCoordinateData
BUCKET = "retrofit-data-dev"
HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
def app():
    """
    Check whether a hard-coded sample of UPRNs falls inside a conservation area.

    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3
    """
    # TODO: Store the input data in S3 [x]
    #       Read the input data from S3 [ ]
    #       Document the data source and where to find it [x]
    #       Write the outputs to S3
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_PATHNAME,
        gov_path=GOV_PATHNAME,
        bucket=BUCKET,
    )
    conservation_area_client.read()

    # We need to iterate through the open uprn data and check if the coordinates are in a
    # conservation area. Only this single sample record was ever used: the larger sample
    # list that used to precede it was overwritten before being read, so it has been dropped.
    open_uprn_data = [
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165},
    ]

    # NOTE: `result` is not consumed yet; see the TODO below.
    result = [
        {
            "uprn": coordinates["UPRN"],
            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
                OpenUprnCoordinateData(**coordinates)
            ),
        }
        for coordinates in open_uprn_data
    ]
    # TODO: Add a method to write to the database

View file

@ -1,8 +1,8 @@
import boto3
import os
import tempfile
import pandas as pd
import geopandas as gpd
import numpy as np
from enum import Enum
from shapely.geometry import Point
from utils.logger import setup_logger
@ -61,9 +61,9 @@ class ConservationAreaClient:
"""
SOURCES = ["historic_england"]
IN_CONSERVATION_AREA = "in_conservation_area"
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
UNKNOWN = "unknown"
IN_CONSERVATION_AREA = True
NOT_IN_CONSERVATION_AREA = False
UNKNOWN = None
def __init__(self, historic_england_path, gov_path, bucket):
self.historic_england_path = historic_england_path
@ -91,6 +91,8 @@ class ConservationAreaClient:
)
)
self.gov_data = self.gov_data.drop(columns=["dataset"])
# Convert the gov data to british national grid co-ordinates
self.gov_data = self.gov_data.to_crs("EPSG:27700")
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
@ -123,6 +125,43 @@ class ConservationAreaClient:
else:
return ConservationAreaClient.UNKNOWN
def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Label every UPRN in ``uprn_gdf`` with its conservation area status.

    The points are spatially joined against the Historic England polygons first. A
    match counts as definitive only when the polygon's NAME is not the
    "No data available for publication by HE" placeholder. Every UPRN that Historic
    England did not positively place in a conservation area is then joined against
    the gov.uk polygons, and a gov.uk match marks the UPRN as in a conservation area
    (this also overrides a Historic England "not in" result).

    :param uprn_gdf: GeoDataFrame of points with a "UPRN" column. It is modified in
        place: a 'conservation_status' column is added.
    :return: the same ``uprn_gdf``, with 'conservation_status' set to one of the
        class constants IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
    """
    he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")

    # A left join leaves index_right empty for points that fell inside no polygon
    he_matched = he_join["index_right"].notna()
    he_published = he_join["NAME"] != "No data available for publication by HE"
    he_in_uprns = he_join.loc[he_matched & he_published, "UPRN"].unique()

    # No Historic England polygon at all (and not matched on another row): not in an area
    he_out_mask = ~he_matched & ~he_join["UPRN"].isin(he_in_uprns)
    he_out_uprns = he_join.loc[he_out_mask, "UPRN"].unique()

    # Everything Historic England did not positively identify is checked against gov.uk
    needs_gov_check = ~uprn_gdf["UPRN"].isin(he_in_uprns)
    gov_join = gpd.sjoin(uprn_gdf[needs_gov_check], self.gov_data, how="left", predicate="within")
    gov_in_uprns = gov_join.loc[gov_join["index_right"].notna(), "UPRN"].unique()

    # Later entries win, so the order below is the precedence of the sources
    uprn_gdf['conservation_status'] = self.UNKNOWN
    for uprns, status in (
        (he_in_uprns, self.IN_CONSERVATION_AREA),
        (he_out_uprns, self.NOT_IN_CONSERVATION_AREA),
        (gov_in_uprns, self.IN_CONSERVATION_AREA),
    ):
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), 'conservation_status'] = status

    return uprn_gdf
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
"""
Check if a property is in a conservation area

View file

@ -0,0 +1,95 @@
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
logger = setup_logger()
class OpenUprnClient:
    """
    This client reads in the Open UPRN data from s3 which can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN
    This dataset contains a lookup of UPRNs to coordinates.
    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key (for ``read``) or local file path (for ``read_local``) of the CSV
        :param bucket: S3 bucket name used by ``read``
        :param uprns: optional iterable of UPRNs to keep; each is converted with ``int``.
            When empty or None, no filtering is applied.
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None
        self.data = None
        # This will be stored in S3 and will be the complete list of filenames
        # We'll then use this to determine which file the UPRN's data is contained in
        self.filenames = None

    def _filter_to_requested_uprns(self, df):
        """Return ``df`` restricted to ``self.uprns``, or unchanged when no UPRNs were requested."""
        if self.uprns:
            df = df[df["UPRN"].isin(self.uprns)]
        return df

    def read(self):
        """
        Read the Open UPRN CSV from S3 into ``self.data``.
        This methodology is placeholder, while data sits locally
        :return:
        """
        logger.info("Reading in open uprn data")
        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self.data = self._filter_to_requested_uprns(df)

    def read_local(self):
        """
        Read the Open UPRN CSV from the local path ``self.path`` into ``self.data``.
        For local testing
        :return:
        """
        logger.info("Reading in open uprn data")
        df = pd.read_csv(self.path)
        self.data = self._filter_to_requested_uprns(df)

    @staticmethod
    def _partition_labels(df, partition_size):
        """
        Return one partition number per row of ``df``, based on row position.

        The position is used rather than the index labels because ``sort_values`` keeps
        the original labels; dividing those would group rows by their pre-sort order.
        :raises ValueError: if ``partition_size`` is not positive
        """
        if partition_size <= 0:
            raise ValueError("partition_size must be a positive integer")
        return pd.RangeIndex(len(df)).to_numpy() // partition_size

    def create_file_partitions(self, partition_size=50000):
        """
        Sort ``self.data`` by UPRN and split it into consecutive partitions.

        Adds a 'partition' and a 'filename' column to ``self.data`` and stores the
        partition -> filename mapping in ``self.filenames``. Each filename is
        "<min uprn>_<max uprn>.csv" for the rows of that partition.

        :param partition_size: number of rows per partition
        :raises ValueError: if ``partition_size`` is not positive
        """
        logger.info("Sorting data by UPRN ascending")
        self.data = self.data.sort_values("UPRN", ascending=True)

        logger.info("Creating partitions")
        self.data['partition'] = self._partition_labels(self.data, partition_size)
        self.filenames = {}
        for partition, group in tqdm(self.data.groupby('partition')):
            min_uprn = group['UPRN'].min()
            max_uprn = group['UPRN'].max()
            self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains ``uprn``.

        :param uprn: the UPRN to look up
        :param filenames: iterable of filename strings of the form "<min>_<max>.csv"
            (for the mapping built by ``create_file_partitions`` pass its values)
        :return: the first matching filename, or None when no range contains ``uprn``
        """
        for filename in filenames:
            min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Wrap ``df`` in a GeoDataFrame of points built from its X_COORDINATE and
        Y_COORDINATE columns, tagged as EPSG:27700.
        """
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )
        return gpd_data

View file

@ -0,0 +1,114 @@
import geopandas as gpd
from shapely.geometry import Point
from utils.logger import setup_logger
from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
from datatypes.datatypes import OpenUprnCoordinateData
logger = setup_logger()
class SpecialBuildingsClient:
    """
    This class reads in data from Historic England, which can be used to determine if specific buildings are
    listed or heritage buildings
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        """
        :param historic_england_listed_buildings_path: S3 key of the listed buildings shapefile
        :param historic_england_heritage_buildings_path: S3 key of the heritage buildings shapefile
        :param bucket: S3 bucket holding both shapefiles
        """
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
        self.bucket = bucket

        # Populated by read()
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Read both shapefiles from S3 and make sure they are in British National Grid
        (EPSG:27700), the coordinate system of the points they are compared against.
        """
        logger.info("Reading in historic england listed buildings shapefile")
        self.historic_england_listed_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
        )
        # Reproject the listed buildings too, so both layers share one CRS. This is skipped
        # when the layer carries no CRS, because to_crs cannot reproject in that case.
        if self.historic_england_listed_buildings.crs is not None:
            self.historic_england_listed_buildings = self.historic_england_listed_buildings.to_crs("EPSG:27700")

        logger.info("Reading in historic england heritage buildings shapefile")
        self.historic_england_heritage_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
        )
        # Convert the heritage buildings data to british national grid co-ordinates
        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")

    @staticmethod
    def _names_of_containing_polygons(polygons, coordinates, name_column):
        """
        Return the ``name_column`` values of the polygons containing the given point,
        or None when no polygon contains it.
        """
        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        contains_point = polygons.contains(point)
        if not contains_point.any():
            return None
        return polygons.loc[contains_point, name_column]

    @staticmethod
    def _flag_uprns_within(uprn_gdf, polygons, column):
        """
        Add a boolean ``column`` to ``uprn_gdf`` (in place) that is True for every UPRN
        with at least one point lying within one of ``polygons``; return ``uprn_gdf``.
        """
        joined = gpd.sjoin(uprn_gdf, polygons, how="left", predicate="within")
        # A left join leaves index_right empty where a point matched no polygon
        matched_uprns = joined.loc[joined["index_right"].notna(), "UPRN"].unique()
        uprn_gdf[column] = False
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(matched_uprns), column] = True
        return uprn_gdf

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.
        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData format)
        :return: True if the location is within a listed building polygon, False otherwise
        """
        names = self._names_of_containing_polygons(
            self.historic_england_listed_buildings, coordinates, "Name"
        )
        if names is None:
            return False
        logger.info(f"The location is within the following listed buildings: {names.values}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN in ``uprn_gdf`` that lies within a listed building polygon.
        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified in place
        :return: the same ``uprn_gdf`` with a boolean 'is_listed_building' column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_listed_buildings, 'is_listed_building'
        )

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.
        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData format)
        :return: True if the location is within a heritage building at risk polygon, False otherwise
        """
        names = self._names_of_containing_polygons(
            self.historic_england_heritage_buildings, coordinates, "EntryName"
        )
        if names is None:
            return False
        logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN in ``uprn_gdf`` that lies within a heritage building polygon.
        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified in place
        :return: the same ``uprn_gdf`` with a boolean 'is_heritage_building' column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_heritage_buildings, 'is_heritage_building'
        )

151
etl/spatial/app.py Normal file
View file

@ -0,0 +1,151 @@
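"""
This application reads in the open uprn data, flags for each UPRN whether it is in a
conservation area, a listed building or a heritage building, and writes the results to S3
as one parquet file per UPRN partition, for querying from other services
"""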
import os
from tqdm import tqdm
import pandas as pd
from etl.spatial.ConservationAreaClient import ConservationAreaClient
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
from datatypes.datatypes import OpenUprnCoordinateData
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-dev-dev"
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
"NHLE)/Listed_Building_polygons.shp"
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
"spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
logger = setup_logger()
def app(local_open_uprn_path=None):
    """
    Flag every Open UPRN record with its conservation area, listed building and
    heritage building status, and write the results to S3 one partition at a time.

    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area
    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN
    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3

    :param local_open_uprn_path: optional path to a local copy of the Open UPRN CSV, for
        local runs. When omitted the CSV is read from S3 at OPEN_UPRN_PATHNAME.
    """
    # TODO: Store the input data in S3 [x]
    #       Read the input data from S3 [x]
    #       Document the data source and where to find it [x]
    #       Incorporate listed buildings [x]
    #       Incorporate heritage buildings [x]
    #       Write the outputs to S3 [ ]
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    # Read the Open UPRN data exactly once: from S3 by default, or from a local file when
    # the caller supplies one (previously a developer-specific path was hard-coded here
    # and the data was read twice, once from each location).
    if local_open_uprn_path is None:
        open_uprn_client = OpenUprnClient(path=OPEN_UPRN_PATHNAME, bucket=BUCKET)
        open_uprn_client.read()
    else:
        open_uprn_client = OpenUprnClient(path=local_open_uprn_path, bucket=BUCKET)
        open_uprn_client.read_local()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    partitions = open_uprn_client.data.groupby("filename")
    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        # S3 keys always use "/" as the separator, whatever the local OS path separator is
        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=f"spatial/{filename}", bucket_name=OUTPUT_BUCKET
        )

View file

@ -564,6 +564,12 @@ def app():
output = pd.concat(dataset)
# Remove any records that have huge swings in their floor area
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
output = output[output["tfa_diff_prop"] < 0.5]
output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
for uvalue_col in uvalue_columns:
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
@ -571,15 +577,7 @@ def app():
save_dataframe_to_s3_parquet(
df=output,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_without_differencing.parquet",
)
output = DataProcessor.difference_data(output)
save_dataframe_to_s3_parquet(
df=output,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_with_differencing.parquet",
file_key="sap_change_model/dataset.parquet",
)

View file

@ -1,31 +0,0 @@
import pandas as pd
from utils.logger import setup_logger
logger = setup_logger()
class OpenUprnClient:
    """
    Loader for the Open UPRN CSV.

    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """
    # TODO: Document this

    def __init__(self, path, uprns=None):
        """
        :param path: location of the CSV, passed to ``pd.read_csv``
        :param uprns: optional iterable of UPRNs to keep, each converted with ``int``
        """
        self.path = path
        if uprns:
            self.uprns = [int(value) for value in uprns]
        else:
            self.uprns = None
        self.data = None

    def read(self):
        """
        Load the CSV into ``self.data``, keeping only the requested UPRNs when any were given.
        This methodology is a placeholder, while the data sits locally
        :return:
        """
        logger.info("Reading in open uprn data")
        frame = pd.read_csv(self.path)
        if not self.uprns:
            self.data = frame
            return

        wanted_rows = frame["UPRN"].isin(self.uprns)
        self.data = frame[wanted_rows]

View file

View file

@ -1,18 +0,0 @@
"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
import os
from open_uprn.OpenUprnClient import OpenUprnClient
def app():
    """
    Read the Open UPRN CSV that sits next to this module under model_data/local_data.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    csv_path = module_dir + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv"

    open_uprn_client = OpenUprnClient(path=csv_path)
    open_uprn_client.read()
    # TODO: Add a method to write to the database

View file

@ -1,13 +0,0 @@
numpy==1.25.1
pandas==2.0.3
python-dateutil==2.8.2
pytz==2023.3
six==1.16.0
tzdata==2023.3
click==8.1.6
joblib==1.3.1
nltk==3.8.1
regex==2023.6.3
textblob==0.17.1
tqdm==4.65.0