mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
vectorised conservation area, heritage building and listedbuilding check
This commit is contained in:
parent
64b6b67499
commit
c267496353
11 changed files with 410 additions and 161 deletions
|
|
@ -1,86 +0,0 @@
|
|||
"""
|
||||
This application reads in the open uprn data from a static location and loads it into
|
||||
our database for querying from other services
|
||||
"""
|
||||
|
||||
from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
BUCKET = "retrofit-data-dev"
|
||||
HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
|
||||
GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
|
||||
|
||||
|
||||
def app():
|
||||
# TODO: Store the input data in S3 [x]
|
||||
# Read the input data from S3 [ ]
|
||||
# Document the data source and where to find it [x]
|
||||
# Write the outputs to S3
|
||||
|
||||
"""
|
||||
This application uses the conservation area datasets to determine if a UPRN is
|
||||
in a conservation area or not
|
||||
|
||||
We use two sources of data for determining if homes are in conservation areas.
|
||||
The first is the Historic England dataset, which is a shapefile containing
|
||||
polygons of conservation areas. The second is the gov.uk dataset, which is a
|
||||
geojson file containing polygons of conservation areas.
|
||||
|
||||
The Historic England dataset can be found here:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The listed building dataset is also found at Historic England at:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The heritage buildings dataset is also found at Historic England at:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The Gov.uk dataset can be found here:
|
||||
https://www.planning.data.gov.uk/dataset/conservation-area
|
||||
|
||||
For the moment, these data sources are downloaded manually and uploaded to S3.
|
||||
This application then processes those files and writes the results to s3
|
||||
"""
|
||||
|
||||
conservation_area_client = ConservationAreaClient(
|
||||
historic_england_path=HISTORIC_ENGLAND_PATHNAME,
|
||||
gov_path=GOV_PATHNAME,
|
||||
bucket=BUCKET
|
||||
)
|
||||
conservation_area_client.read()
|
||||
|
||||
# We need to iterate through the open uprn data and check if the coordinates are in a conservation area
|
||||
open_uprn_data = [
|
||||
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
|
||||
'LONGITUDE': -0.0540506},
|
||||
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
|
||||
'LONGITUDE': -0.0498772},
|
||||
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
|
||||
'LONGITUDE': -0.226392},
|
||||
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
|
||||
'LONGITUDE': -0.0468833},
|
||||
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
|
||||
'LONGITUDE': -0.1362513},
|
||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
||||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
open_uprn_data = [
|
||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
||||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
result = [
|
||||
{
|
||||
"uprn": coordinates["UPRN"],
|
||||
"is_in_conservation_area": conservation_area_client.is_in_conservation_area(
|
||||
OpenUprnCoordinateData(**coordinates))
|
||||
} for coordinates in
|
||||
open_uprn_data
|
||||
]
|
||||
|
||||
# TODO: Add a method to write to the database
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
import boto3
|
||||
import os
|
||||
import tempfile
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import numpy as np
|
||||
from enum import Enum
|
||||
from shapely.geometry import Point
|
||||
from utils.logger import setup_logger
|
||||
|
|
@ -61,9 +61,9 @@ class ConservationAreaClient:
|
|||
"""
|
||||
|
||||
SOURCES = ["historic_england"]
|
||||
IN_CONSERVATION_AREA = "in_conservation_area"
|
||||
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
|
||||
UNKNOWN = "unknown"
|
||||
IN_CONSERVATION_AREA = True
|
||||
NOT_IN_CONSERVATION_AREA = False
|
||||
UNKNOWN = None
|
||||
|
||||
def __init__(self, historic_england_path, gov_path, bucket):
|
||||
self.historic_england_path = historic_england_path
|
||||
|
|
@ -91,6 +91,8 @@ class ConservationAreaClient:
|
|||
)
|
||||
)
|
||||
self.gov_data = self.gov_data.drop(columns=["dataset"])
|
||||
# Convert the gov data to british national grid co-ordinates
|
||||
self.gov_data = self.gov_data.to_crs("EPSG:27700")
|
||||
|
||||
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
|
||||
|
||||
|
|
@ -123,6 +125,43 @@ class ConservationAreaClient:
|
|||
else:
|
||||
return ConservationAreaClient.UNKNOWN
|
||||
|
||||
def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Classify every UPRN point in a GeoDataFrame against the conservation area polygons.

    The Historic England polygons are checked first. Points that Historic England does not
    confirm as being inside a conservation area are then checked against the gov.uk polygons.

    A ``conservation_status`` column is written onto ``uprn_gdf`` itself (the input is modified
    and also returned), holding ``IN_CONSERVATION_AREA``, ``NOT_IN_CONSERVATION_AREA`` or
    ``UNKNOWN``.

    :param uprn_gdf: GeoDataFrame of points with a "UPRN" column
        (NOTE(review): presumably in the same CRS as the polygon data - confirm against caller)
    :return: ``uprn_gdf`` with the ``conservation_status`` column populated
    """
    # Pass 1: Historic England. A left join keeps unmatched points, with index_right missing.
    he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
    he_unmatched = he_join.index_right.isna()

    # A match only counts as definitive when the polygon is not the "no data" placeholder
    he_confirmed = ~he_unmatched & (
        he_join["NAME"] != "No data available for publication by HE"
    )
    he_in_uprns = he_join[he_confirmed]["UPRN"].unique()

    # Points that matched no Historic England polygon at all are not in a conservation area
    he_out_uprns = he_join.loc[
        he_unmatched & ~he_join["UPRN"].isin(he_in_uprns),
        "UPRN"
    ].unique()

    # Pass 2: gov.uk, for every point Historic England did not confirm
    remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(he_in_uprns)]
    gov_join = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
    gov_in_uprns = gov_join.loc[~gov_join.index_right.isna(), "UPRN"].unique()

    # Later assignments win, so a gov.uk match overrides a Historic England "not in" result
    uprn_gdf['conservation_status'] = self.UNKNOWN
    status_updates = (
        (he_in_uprns, self.IN_CONSERVATION_AREA),
        (he_out_uprns, self.NOT_IN_CONSERVATION_AREA),
        (gov_in_uprns, self.IN_CONSERVATION_AREA),
    )
    for uprns, status in status_updates:
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), 'conservation_status'] = status

    return uprn_gdf
|
||||
|
||||
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
|
||||
"""
|
||||
Check if a property is in a conservation area
|
||||
95
etl/spatial/OpenUprnClient.py
Normal file
95
etl/spatial/OpenUprnClient.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_io_from_s3
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class OpenUprnClient:
    """
    Client for the Open UPRN dataset, which contains a lookup of UPRNs to coordinates.

    The data can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key of the CSV (used by ``read``) or local file path (used by ``read_local``)
        :param bucket: name of the S3 bucket that ``read`` reads from
        :param uprns: optional collection of UPRNs; when given, only these rows are kept on read
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None
        self.data = None

        # Mapping of partition number -> filename, populated by create_file_partitions.
        # It is used to determine which file a UPRN's data is contained in.
        self.filenames = None

    def _filter_to_requested_uprns(self, df):
        """Return ``df`` restricted to ``self.uprns``, or unchanged when no filter was supplied."""
        if self.uprns:
            return df[df["UPRN"].isin(self.uprns)]
        return df

    def read(self):
        """
        Read the Open UPRN CSV from S3 into ``self.data``, applying the UPRN filter if one was supplied.

        :return: None
        """
        logger.info("Reading in open uprn data")

        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self.data = self._filter_to_requested_uprns(df)

    def read_local(self):
        """
        Read the Open UPRN CSV from the local filesystem into ``self.data``. For local testing.

        :return: None
        """
        logger.info("Reading in open uprn data")

        df = pd.read_csv(self.path)
        self.data = self._filter_to_requested_uprns(df)

    def create_file_partitions(self, partition_size=50000):
        """
        Sort ``self.data`` by UPRN and split it into consecutive partitions of ``partition_size`` rows.

        Adds a "partition" column and a "filename" column ("<min_uprn>_<max_uprn>.csv") to ``self.data``
        and stores the partition -> filename mapping in ``self.filenames``.

        :param partition_size: maximum number of rows per partition
        :raises ValueError: if no data has been read yet or ``partition_size`` is not positive
        """
        if self.data is None:
            raise ValueError("No data loaded - call read() or read_local() before partitioning")
        if partition_size <= 0:
            raise ValueError("partition_size must be a positive integer")

        logger.info("Sorting data by UPRN ascending")
        self.data = self.data.sort_values("UPRN", ascending=True)

        logger.info("Creating partitions")
        # Partition on the row position in the sorted frame rather than on the index labels.
        # sort_values keeps the original labels, so labels only follow UPRN order when the input
        # was already sorted and unfiltered; otherwise partitions would get overlapping UPRN ranges.
        row_positions = pd.RangeIndex(len(self.data))
        self.data['partition'] = (row_positions // partition_size).to_numpy()

        uprn_bounds = self.data.groupby('partition')['UPRN'].agg(['min', 'max'])
        self.filenames = {
            partition: f"{min_uprn}_{max_uprn}.csv"
            for partition, min_uprn, max_uprn in uprn_bounds.itertuples(name=None)
        }

        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains ``uprn``.

        :param uprn: UPRN to look up, as an int or a numeric string
        :param filenames: iterable of "<min_uprn>_<max_uprn>.csv" names, or the partition -> filename
            dictionary built by ``create_file_partitions``
        :return: the matching filename, or None when no range contains the UPRN
        """
        if isinstance(uprn, str):
            uprn = int(uprn)

        # Iterating a dict yields its keys (partition numbers), so look at the values instead
        if isinstance(filenames, dict):
            filenames = filenames.values()

        for filename in filenames:
            min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Build a GeoDataFrame of points from the X_COORDINATE and Y_COORDINATE columns of ``df``.

        :param df: DataFrame with X_COORDINATE and Y_COORDINATE columns
        :return: GeoDataFrame with one point geometry per row, tagged as EPSG:27700
        """
        return gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )
|
||||
114
etl/spatial/SpecialBuildingsClient.py
Normal file
114
etl/spatial/SpecialBuildingsClient.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import geopandas as gpd
|
||||
from shapely.geometry import Point
|
||||
from utils.logger import setup_logger
|
||||
from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class SpecialBuildingsClient:
    """
    This class reads in data from Historic England, which can be used to determine if specific buildings are
    listed or heritage buildings
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        """
        :param historic_england_listed_buildings_path: S3 key of the listed buildings shapefile
        :param historic_england_heritage_buildings_path: S3 key of the heritage buildings shapefile
        :param bucket: name of the S3 bucket holding both shapefiles
        """
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
        self.bucket = bucket

        # Populated by read()
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Read both shapefiles from S3 and put them into British National Grid coordinates
        """
        logger.info("Reading in historic england listed buildings shapefile")
        self.historic_england_listed_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
        )

        logger.info("Reading in historic england heritage buildings shapefile")
        self.historic_england_heritage_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
        )

        # Convert the heritage buildings data to british national grid co-ordinates
        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")

        # The listed buildings are compared against the same british national grid points, so they need the
        # same CRS. Only reproject when a CRS is declared, since to_crs raises on data without one.
        if self.historic_england_listed_buildings.crs is not None:
            self.historic_england_listed_buildings = self.historic_england_listed_buildings.to_crs("EPSG:27700")

    @staticmethod
    def _names_of_polygons_containing(polygons, coordinates, name_column):
        """Return the ``name_column`` values of the polygons containing the point, or None if there are none."""
        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        containing = polygons.contains(point)
        if not containing.any():
            return None
        return polygons.loc[containing, name_column].values

    @staticmethod
    def _flag_uprns_within(uprn_gdf, polygons, column):
        """Add a boolean ``column`` to ``uprn_gdf`` marking the UPRNs whose point lies within any polygon."""
        joined_gdf = gpd.sjoin(uprn_gdf, polygons, how="left", predicate="within")

        # A left join leaves index_right missing for points that matched no polygon
        matched_uprns = joined_gdf[~joined_gdf.index_right.isna()]["UPRN"].unique()

        uprn_gdf[column] = uprn_gdf["UPRN"].isin(matched_uprns)
        return uprn_gdf

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE attributes
        :return: True if the location is within a listed building polygon, False otherwise
        """
        names = self._names_of_polygons_containing(
            self.historic_england_listed_buildings, coordinates, "Name"
        )
        if names is None:
            return False

        logger.info(f"The location is within the following listed buildings: {names}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN point that lies within a listed building polygon.

        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified by adding the result column
        :return: ``uprn_gdf`` with a boolean ``is_listed_building`` column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_listed_buildings, 'is_listed_building'
        )

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE attributes
        :return: True if the location is within a heritage building at risk polygon, False otherwise
        """
        names = self._names_of_polygons_containing(
            self.historic_england_heritage_buildings, coordinates, "EntryName"
        )
        if names is None:
            return False

        logger.info(f"The location is within the following heritage buildings at risk: {names}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN point that lies within a heritage building at risk polygon.

        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified by adding the result column
        :return: ``uprn_gdf`` with a boolean ``is_heritage_building`` column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_heritage_buildings, 'is_heritage_building'
        )
|
||||
151
etl/spatial/app.py
Normal file
151
etl/spatial/app.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
"""
|
||||
This application reads in the open uprn data from a static location and loads it into
|
||||
our database for querying from other services
|
||||
"""
|
||||
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
||||
from etl.spatial.OpenUprnClient import OpenUprnClient
|
||||
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
|
||||
# Input datasets are read from BUCKET; the per-partition results are written to OUTPUT_BUCKET
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-dev-dev"

# Keys of the input datasets
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = (
    "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
)
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = (
    "spatial/National_Heritage_List_for_England_("
    "NHLE)/Listed_Building_polygons.shp"
)
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = (
    "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
)
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def app(local_open_uprn_path=None):
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not, and the Historic England building datasets to determine
    if it is a listed building or a heritage building at risk.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3

    :param local_open_uprn_path: optional path to a local copy of the Open UPRN csv. When given, the
        Open UPRN data is read from disk instead of from S3 (for local testing)
    """
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    # Read the Open UPRN data exactly once: from S3 by default, or from disk when a local path is given
    if local_open_uprn_path is None:
        open_uprn_client = OpenUprnClient(path=OPEN_UPRN_PATHNAME, bucket=BUCKET)
        open_uprn_client.read()
    else:
        open_uprn_client = OpenUprnClient(path=local_open_uprn_path, bucket=BUCKET)
        open_uprn_client.read_local()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    partitions = open_uprn_client.data.groupby("filename")

    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        # S3 keys always use forward slashes, so build the key directly rather than with os.path.join.
        # NOTE(review): the partition filenames end in ".csv" but the payload is written as parquet -
        # confirm what downstream readers expect before renaming the keys.
        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=f"spatial/{filename}", bucket_name=OUTPUT_BUCKET
        )
|
||||
|
|
@ -564,6 +564,12 @@ def app():
|
|||
|
||||
output = pd.concat(dataset)
|
||||
|
||||
# Remove any records that have huge swings in their floor area
|
||||
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
|
||||
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
|
||||
output = output[output["tfa_diff_prop"] < 0.5]
|
||||
output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
|
||||
|
||||
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
|
||||
for uvalue_col in uvalue_columns:
|
||||
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
|
||||
|
|
@ -571,15 +577,7 @@ def app():
|
|||
save_dataframe_to_s3_parquet(
|
||||
df=output,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/dataset_without_differencing.parquet",
|
||||
)
|
||||
|
||||
output = DataProcessor.difference_data(output)
|
||||
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=output,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/dataset_with_differencing.parquet",
|
||||
file_key="sap_change_model/dataset.parquet",
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,31 +0,0 @@
|
|||
import pandas as pd
|
||||
from utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class OpenUprnClient:
|
||||
"""
|
||||
Specs for this dataset can be found here:
|
||||
https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
|
||||
"""
|
||||
|
||||
# TODO: Document this
|
||||
|
||||
def __init__(self, path, uprns=None):
|
||||
self.path = path
|
||||
self.uprns = [int(x) for x in uprns] if uprns else None
|
||||
self.data = None
|
||||
|
||||
def read(self):
|
||||
"""
|
||||
This methodology is a placeholder while the data sits locally
|
||||
:return:
|
||||
"""
|
||||
logger.info("Reading in open uprn data")
|
||||
|
||||
df = pd.read_csv(self.path)
|
||||
if self.uprns:
|
||||
df = df[df["UPRN"].isin(self.uprns)]
|
||||
|
||||
self.data = df
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
"""
|
||||
This application reads in the open uprn data from a static location and loads it into
|
||||
our database for querying from other services
|
||||
"""
|
||||
|
||||
import os
|
||||
from open_uprn.OpenUprnClient import OpenUprnClient
|
||||
|
||||
|
||||
def app():
|
||||
open_uprn_client = OpenUprnClient(
|
||||
path=os.path.abspath(
|
||||
os.path.dirname(__file__)
|
||||
) + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv",
|
||||
)
|
||||
open_uprn_client.read()
|
||||
|
||||
# TODO: Add a method to write to the database
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
numpy==1.25.1
|
||||
pandas==2.0.3
|
||||
python-dateutil==2.8.2
|
||||
pytz==2023.3
|
||||
six==1.16.0
|
||||
tzdata==2023.3
|
||||
click==8.1.6
|
||||
joblib==1.3.1
|
||||
nltk==3.8.1
|
||||
regex==2023.6.3
|
||||
textblob==0.17.1
|
||||
tqdm==4.65.0
|
||||
|
||||
Loading…
Add table
Reference in a new issue