From c2674963534cebb67200c10ccb80cb4ef1f15aa0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 2 Oct 2023 22:54:15 +0100 Subject: [PATCH] vectorised conservation area, heritage building and listedbuilding check --- etl/conservation_areas/app.py | 86 ---------- .../ConservationAreaClient.py | 47 +++++- etl/spatial/OpenUprnClient.py | 95 +++++++++++ etl/spatial/SpecialBuildingsClient.py | 114 +++++++++++++ etl/spatial/app.py | 151 ++++++++++++++++++ .../requirements.txt | 0 .../generate_rdsap_change.py | 16 +- open_uprn/OpenUprnClient.py | 31 ---- open_uprn/__init__.py | 0 open_uprn/app.py | 18 --- open_uprn/requirements.txt | 13 -- 11 files changed, 410 insertions(+), 161 deletions(-) delete mode 100644 etl/conservation_areas/app.py rename etl/{conservation_areas => spatial}/ConservationAreaClient.py (77%) create mode 100644 etl/spatial/OpenUprnClient.py create mode 100644 etl/spatial/SpecialBuildingsClient.py create mode 100644 etl/spatial/app.py rename etl/{conservation_areas => spatial}/requirements.txt (100%) delete mode 100644 open_uprn/OpenUprnClient.py delete mode 100644 open_uprn/__init__.py delete mode 100644 open_uprn/app.py delete mode 100644 open_uprn/requirements.txt diff --git a/etl/conservation_areas/app.py b/etl/conservation_areas/app.py deleted file mode 100644 index dddaede6..00000000 --- a/etl/conservation_areas/app.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -This application reads in the open uprn data from a static location and loads it into -our database for querying from other services -""" - -from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient -from datatypes.datatypes import OpenUprnCoordinateData - -BUCKET = "retrofit-data-dev" -HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp" -GOV_PATHNAME = "spatial/gov-conservation-area.geojson" - - -def app(): - # TODO: Store the input data in S3 [x] - # Read the input data from S3 [ ] - # Document the data source 
and where to find it [x] - # Write the outputs to S3 - - """ - This application uses the conservation area datasets to determine if a UPRN is - in a conservation area or now - - We use two sources of data for determining if homes are in conservation areas. - The first is the Historic England dataset, which is a shapefile containing - polygons of conservation areas. The second is the gov.uk dataset, which is a - geojson file containing polygons of conservation areas. - - The Historic England dataset can be found here: - https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e - - The listed building dataset is also found at Historic England at: - https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e - - The hertitige buildings dataset is also found at Historic England at: - https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e - - The Gov.uk dataset can be found here: - https://www.planning.data.gov.uk/dataset/conservation-area - - For the moment, these data sources are downloaded manually and uploaded to S3. 
- This application then processes those files and writes the results to s3 - """ - - conservation_area_client = ConservationAreaClient( - historic_england_path=HISTORIC_ENGLAND_PATHNAME, - gov_path=GOV_PATHNAME, - bucket=BUCKET - ) - conservation_area_client.read() - - # We need to iterate through the open uprn data and check if the coordinates are in a conservation area - open_uprn_data = [ - {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407, - 'LONGITUDE': -0.0540506}, - {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492, - 'LONGITUDE': -0.0498772}, - {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579, - 'LONGITUDE': -0.226392}, - {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, - 'LONGITUDE': -0.0792445}, - {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, - 'LONGITUDE': -0.0792445}, - {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385, - 'LONGITUDE': -0.0468833}, - {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908, - 'LONGITUDE': -0.1362513}, - {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309, - 'LONGITUDE': -0.0823165} - ] - - open_uprn_data = [ - {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309, - 'LONGITUDE': -0.0823165} - ] - - result = [ - { - "uprn": coordinates["UPRN"], - "is_in_conservation_area": conservation_area_client.is_in_conservation_area( - OpenUprnCoordinateData(**coordinates)) - } for coordinates in - open_uprn_data - ] - - # TODO: Add a method to write to the database diff --git a/etl/conservation_areas/ConservationAreaClient.py b/etl/spatial/ConservationAreaClient.py similarity index 77% rename from 
etl/conservation_areas/ConservationAreaClient.py rename to etl/spatial/ConservationAreaClient.py index fbf72704..57e94df8 100644 --- a/etl/conservation_areas/ConservationAreaClient.py +++ b/etl/spatial/ConservationAreaClient.py @@ -1,8 +1,8 @@ import boto3 import os import tempfile -import pandas as pd import geopandas as gpd +import numpy as np from enum import Enum from shapely.geometry import Point from utils.logger import setup_logger @@ -61,9 +61,9 @@ class ConservationAreaClient: """ SOURCES = ["historic_england"] - IN_CONSERVATION_AREA = "in_conservation_area" - NOT_IN_CONSERVATION_AREA = "not_in_conservation_area" - UNKNOWN = "unknown" + IN_CONSERVATION_AREA = True + NOT_IN_CONSERVATION_AREA = False + UNKNOWN = None def __init__(self, historic_england_path, gov_path, bucket): self.historic_england_path = historic_england_path @@ -91,6 +91,8 @@ class ConservationAreaClient: ) ) self.gov_data = self.gov_data.drop(columns=["dataset"]) + # Convert the gov data to british national grid co-ordinates + self.gov_data = self.gov_data.to_crs("EPSG:27700") def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData): @@ -123,6 +125,43 @@ class ConservationAreaClient: else: return ConservationAreaClient.UNKNOWN + def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + + joined_gdf_he = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within") + + # Identify where we have definitive information (not "unknown") + in_conservation_he = ~joined_gdf_he.index_right.isna() & ( + joined_gdf_he["NAME"] != "No data available for publication by HE" + ) + + uprn_in_conservation_he = joined_gdf_he[in_conservation_he]["UPRN"].unique() + # The right index will be missing when we don't have a match so the uprn is not in a conservation + # area + uprn_not_in_conservation_he = joined_gdf_he.loc[ + ~joined_gdf_he["UPRN"].isin(uprn_in_conservation_he) & joined_gdf_he.index_right.isna(), + "UPRN" + ].unique() + + # For 
unknowns, check against government data + unknown_uprns = uprn_gdf.loc[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]["UPRN"] + unknown_gdf = uprn_gdf[uprn_gdf["UPRN"].isin(unknown_uprns)] + + joined_gdf_gov = gpd.sjoin(unknown_gdf, self.gov_data, how="left", predicate="within") + uprn_in_conservation_gov = joined_gdf_gov.loc[~joined_gdf_gov.index_right.isna(), "UPRN"].unique() + + uprn_gdf['conservation_status'] = self.UNKNOWN + uprn_gdf.loc[ + uprn_gdf["UPRN"].isin(uprn_in_conservation_he), 'conservation_status' + ] = self.IN_CONSERVATION_AREA + uprn_gdf.loc[ + uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), 'conservation_status' + ] = self.NOT_IN_CONSERVATION_AREA + uprn_gdf.loc[ + uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), 'conservation_status' + ] = self.IN_CONSERVATION_AREA + + return uprn_gdf + def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str: """ Check if a property is in a conservation area diff --git a/etl/spatial/OpenUprnClient.py b/etl/spatial/OpenUprnClient.py new file mode 100644 index 00000000..90d78e8c --- /dev/null +++ b/etl/spatial/OpenUprnClient.py @@ -0,0 +1,95 @@ +from tqdm import tqdm +import pandas as pd +import geopandas as gpd +from utils.logger import setup_logger +from utils.s3 import read_io_from_s3 + +logger = setup_logger() + + +class OpenUprnClient: + """ + + This client reads in the Open UPRN data from s3 which can be downloaded from here: + https://osdatahub.os.uk/downloads/open/OpenUPRN + + This dataset contains a lookup of UPRNs to coordinates. 
+ + Specs for this dataset can be found here: + https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf + """ + + def __init__(self, path, bucket, uprns=None): + self.path = path + self.bucket = bucket + self.uprns = [int(x) for x in uprns] if uprns else None + self.data = None + + # This will be stored in S3 and will be the complete list of filenames + # We'll then use this to determine which file the UPRN's data is contained in + self.filenames = None + + def read(self): + """ + This methodology is placeholder, while data sits locally + :return: + """ + logger.info("Reading in open uprn data") + + df = pd.read_csv( + read_io_from_s3( + bucket_name=self.bucket, + file_key=self.path + ) + ) + if self.uprns: + df = df[df["UPRN"].isin(self.uprns)] + + self.data = df + + def read_local(self): + """ + For local testing + :return: + """ + logger.info("Reading in open uprn data") + + df = pd.read_csv(self.path) + if self.uprns: + df = df[df["UPRN"].isin(self.uprns)] + + self.data = df + + def create_file_partitions(self, partition_size=50000): + logger.info("Sorting data by UPRN ascending") + self.data = self.data.sort_values("UPRN", ascending=True) + + logger.info("Creating partitions") + self.data['partition'] = self.data.index // partition_size + + self.filenames = {} + for partition, group in tqdm(self.data.groupby('partition')): + min_uprn = group['UPRN'].min() + max_uprn = group['UPRN'].max() + self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv" + + self.data['filename'] = self.data['partition'].map(self.filenames) + + @staticmethod + def find_filename_for_uprn(uprn, filenames): + for filename in filenames: + min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_")) + if min_uprn <= uprn <= max_uprn: + return filename + return None + + @staticmethod + def convert_bng_data_to_gpd(df): + + gpd_data = gpd.GeoDataFrame( + df, + geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE), + crs="EPSG:27700" #
British National Grid + ) + + return gpd_data diff --git a/etl/spatial/SpecialBuildingsClient.py b/etl/spatial/SpecialBuildingsClient.py new file mode 100644 index 00000000..16a9d2d4 --- /dev/null +++ b/etl/spatial/SpecialBuildingsClient.py @@ -0,0 +1,114 @@ +import geopandas as gpd +from shapely.geometry import Point +from utils.logger import setup_logger +from etl.spatial.ConservationAreaClient import read_shapefile_from_s3 +from datatypes.datatypes import OpenUprnCoordinateData + +logger = setup_logger() + + +class SpecialBuildingsClient: + """ + This class reads in data from Historic England, which can be used to determine if specific buildings are + listed or heritage buildings + """ + + def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket): + self.historic_england_listed_buildings_path = historic_england_listed_buildings_path + self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path + self.bucket = bucket + + self.historic_england_listed_buildings = None + self.historic_england_heritage_buildings = None + + def read(self): + """ + Read the data + """ + logger.info("Reading in historic england listed buildings shapefile") + self.historic_england_listed_buildings = read_shapefile_from_s3( + bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path + ) + + logger.info("Reading in historic england heritage buildings shapefile") + self.historic_england_heritage_buildings = read_shapefile_from_s3( + bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path + ) + + # Convert the heritage buildings data to British National Grid co-ordinates + self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700") + + def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool: + """ + Check if a location specified by British National Grid coordinates is a listed building.
+ + :param coordinates: dictionary, which should have the OpenUprnCoordinateData format + :return: True if the location is within a listed building polygon, False otherwise + """ + # Convert the coordinates to a Shapely Point object + point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE) + + # Check if the point is within any of the listed building polygons + within_listed_buildings = self.historic_england_listed_buildings.contains(point) + + if within_listed_buildings.any(): + # If the point is within any listed building polygon, log the names of the buildings and return + # True + names = self.historic_england_listed_buildings.loc[within_listed_buildings, "Name"] + logger.info(f"The location is within the following listed buildings: {names.values}") + return True + + # If the point is not within any listed building polygon, return False + return False + + def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + # Check against historic England listed buildings data + joined_gdf_listed = gpd.sjoin(uprn_gdf, self.historic_england_listed_buildings, how="left", predicate="within") + + # Identify where we have matches + uprn_is_listed = joined_gdf_listed[~joined_gdf_listed.index_right.isna()]["UPRN"].unique() + + # Populate the results in the input GeoDataFrame + uprn_gdf['is_listed_building'] = False + uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_listed), 'is_listed_building'] = True + + return uprn_gdf + + def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool: + """ + Check if a location specified by British National Grid coordinates is a heritage building at risk.
+ + :param coordinates: dictionary, which should have the OpenUprnCoordinateData format + :return: True if the location is within a heritage building at risk polygon, + False otherwise + """ + # Convert the coordinates to a Shapely Point object + point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE) + + # Check if the point is within any of the heritage building at risk polygons + within_heritage_buildings_at_risk = self.historic_england_heritage_buildings.contains(point) + + if within_heritage_buildings_at_risk.any(): + # If the point is within any heritage building at risk polygon, log the names of the buildings and return + # True + names = self.historic_england_heritage_buildings.loc[within_heritage_buildings_at_risk, "EntryName"] + logger.info(f"The location is within the following heritage buildings at risk: {names.values}") + return True + + # If the point is not within any heritage building at risk polygon, return False + return False + + def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + # Check against historic England heritage buildings data + joined_gdf_heritage = gpd.sjoin( + uprn_gdf, self.historic_england_heritage_buildings, how="left", predicate="within" + ) + + # Identify where we have matches + uprn_is_heritage = joined_gdf_heritage[~joined_gdf_heritage.index_right.isna()]["UPRN"].unique() + + # Populate the results in the input GeoDataFrame + uprn_gdf['is_heritage_building'] = False + uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_heritage), 'is_heritage_building'] = True + + return uprn_gdf diff --git a/etl/spatial/app.py b/etl/spatial/app.py new file mode 100644 index 00000000..39fe1434 --- /dev/null +++ b/etl/spatial/app.py @@ -0,0 +1,151 @@ +""" +This application reads in the open uprn data from a static location and loads it into +our database for querying from other services +""" +
+import os +from tqdm import tqdm +import pandas as pd +from etl.spatial.ConservationAreaClient import ConservationAreaClient +from etl.spatial.OpenUprnClient import OpenUprnClient +from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient +from datatypes.datatypes import OpenUprnCoordinateData +from utils.logger import setup_logger +from utils.s3 import save_dataframe_to_s3_parquet + +BUCKET = "retrofit-datalake-dev" +OUTPUT_BUCKET = "retrofit-dev-dev" +HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp" +GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson" +OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv" +HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \ + "NHLE)/Listed_Building_polygons.shp" +HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \ + "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp" + +logger = setup_logger() + + +def app(): + # TODO: Store the input data in S3 [x] + # Read the input data from S3 [x] + # Document the data source and where to find it [x] + # Incorporate listed buildings [x] + # Incorporate heritage buildings [x] + # Write the outputs to S3 [ ] + + """ + This application uses the conservation area datasets to determine if a UPRN is + in a conservation area or not + + We use two sources of data for determining if homes are in conservation areas. + The first is the Historic England dataset, which is a shapefile containing + polygons of conservation areas. The second is the gov.uk dataset, which is a + geojson file containing polygons of conservation areas.
+ + The Historic England dataset can be found here: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The listed building dataset is also found at Historic England at: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The heritage buildings dataset is also found at Historic England at: + https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e + + The Gov.uk dataset can be found here: + https://www.planning.data.gov.uk/dataset/conservation-area + + The open UPRN data can be found here: + https://osdatahub.os.uk/downloads/open/OpenUPRN + + The Office for National Statistics Postcode Lookup can be found here: + https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about + + For the moment, these data sources are downloaded manually and uploaded to S3. + This application then processes those files and writes the results to s3 + """ + + conservation_area_client = ConservationAreaClient( + historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME, + gov_path=GOV_CONSERVARION_AREAS_PATHNAME, + bucket=BUCKET + ) + conservation_area_client.read() + + special_buildings_client = SpecialBuildingsClient( + historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME, + historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME, + bucket=BUCKET + ) + special_buildings_client.read() + + # Local version + OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \ + "/osopenuprn_202306_csv/osopenuprn_202305.csv" + open_uprn_client = OpenUprnClient( + path=OPEN_UPRN_PATHNAME, + bucket=BUCKET + ) + open_uprn_client.read() + open_uprn_client.read_local() + + # We want to sort the data and split it into filenames on UPRN.
+ # We'll split the data into chunks of 50,000 + open_uprn_client.create_file_partitions() + + # special_buildings_client = SpecialBuildingsClient( + # historic_england_listed_buildings_path=None, + # historic_england_heritage_buildings_path=None, + # bucket=None + # ) + # special_buildings_client.historic_england_listed_buildings = \ + # special_buildings_client2.historic_england_listed_buildings + # special_buildings_client.historic_england_heritage_buildings = \ + # special_buildings_client2.historic_england_heritage_buildings + + logger.info("Extracting spatial data for uprn partitions") + to_loop_over = open_uprn_client.data.groupby("filename") + + for filename, uprn_df in tqdm(open_uprn_client.data.groupby("filename"), total=len(to_loop_over)): + uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df) + + uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf) + uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf) + uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf) + + # Convert back to a regular dataframe + uprn_gdf = uprn_gdf.drop(columns=["geometry"]) + uprn_gdf = pd.DataFrame(uprn_gdf) + + save_dataframe_to_s3_parquet( + df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET + ) + + # We need to iterate through the open uprn data and check if the coordinates are in a conservation area + open_uprn_data = [ + {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407, + 'LONGITUDE': -0.0540506}, + {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492, + 'LONGITUDE': -0.0498772}, + {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579, + 'LONGITUDE': -0.226392}, + {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, + 'LONGITUDE': -0.0792445}, + {'UPRN': 
10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629, + 'LONGITUDE': -0.0792445}, + {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385, + 'LONGITUDE': -0.0468833}, + {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908, + 'LONGITUDE': -0.1362513}, + {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309, + 'LONGITUDE': -0.0823165} + ] + + result = [ + { + "uprn": coordinates["UPRN"], + "is_in_conservation_area": conservation_area_client.is_in_conservation_area( + OpenUprnCoordinateData(**coordinates)) + } for coordinates in + open_uprn_data + ] diff --git a/etl/conservation_areas/requirements.txt b/etl/spatial/requirements.txt similarity index 100% rename from etl/conservation_areas/requirements.txt rename to etl/spatial/requirements.txt diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index b317e52c..e3d3bd0e 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -564,6 +564,12 @@ def app(): output = pd.concat(dataset) + # Remove any records that have huge swings in their floor area + output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"]) + output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"] + output = output[output["tfa_diff_prop"] < 0.5] + output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"]) + uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col] for uvalue_col in uvalue_columns: output[uvalue_col] = pd.to_numeric(output[uvalue_col]) @@ -571,15 +577,7 @@ def app(): save_dataframe_to_s3_parquet( df=output, bucket_name="retrofit-data-dev", - file_key="sap_change_model/dataset_without_differencing.parquet", - ) - - output = 
DataProcessor.difference_data(output) - - save_dataframe_to_s3_parquet( - df=output, - bucket_name="retrofit-data-dev", - file_key="sap_change_model/dataset_with_differencing.parquet", + file_key="sap_change_model/dataset.parquet", ) diff --git a/open_uprn/OpenUprnClient.py b/open_uprn/OpenUprnClient.py deleted file mode 100644 index 502ed25a..00000000 --- a/open_uprn/OpenUprnClient.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -from utils.logger import setup_logger - -logger = setup_logger() - - -class OpenUprnClient: - """ - Specs for this dataset can be found here: - https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf - """ - - # TODO: Document this - - def __init__(self, path, uprns=None): - self.path = path - self.uprns = [int(x) for x in uprns] if uprns else None - self.data = None - - def read(self): - """ - This methodology is placeholder, while data sits localls - :return: - """ - logger.info("Reading in open uprn data") - - df = pd.read_csv(self.path) - if self.uprns: - df = df[df["UPRN"].isin(self.uprns)] - - self.data = df diff --git a/open_uprn/__init__.py b/open_uprn/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/open_uprn/app.py b/open_uprn/app.py deleted file mode 100644 index 6ed62c44..00000000 --- a/open_uprn/app.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -This application reads in the open uprn data from a static location and loads it into -our database for querying from other services -""" - -import os -from open_uprn.OpenUprnClient import OpenUprnClient - - -def app(): - open_uprn_client = OpenUprnClient( - path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv", - ) - open_uprn_client.read() - - # TODO: Add a method to write to the database diff --git a/open_uprn/requirements.txt b/open_uprn/requirements.txt deleted file mode 100644 index 11baa087..00000000 --- a/open_uprn/requirements.txt +++ 
/dev/null @@ -1,13 +0,0 @@ -numpy==1.25.1 -pandas==2.0.3 -python-dateutil==2.8.2 -pytz==2023.3 -six==1.16.0 -tzdata==2023.3 -click==8.1.6 -joblib==1.3.1 -nltk==3.8.1 -regex==2023.6.3 -textblob==0.17.1 -tqdm==4.65.0 -