vectorised conservation area, heritage building and listed building check

2026-07-27 23:35:01 +00:00 · 2023-10-02 22:54:15 +01:00 · 2023-10-02 22:54:15 +01:00 · c267496353
commit c267496353
parent 64b6b67499
11 changed files with 410 additions and 161 deletions
--- a/etl/conservation_areas/app.py
+++ b/etl/conservation_areas/app.py
@ -1,86 +0,0 @@
-"""
-This application reads in the open uprn data from a static location and loads it into
-our database for querying from other services
-"""
-
-from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
-from datatypes.datatypes import OpenUprnCoordinateData
-
-BUCKET = "retrofit-data-dev"
-HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
-GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
-
-
-def app():
-    # TODO: Store the input data in S3 [x]
-    #       Read the input data from S3 [ ]
-    #       Document the data source and where to find it [x]
-    #       Write the outputs to S3
-
-    """
-    This application uses the conservation area datasets to determine if a UPRN is
-    in a conservation area or now
-
-    We use two sources of data for determining if homes are in conservation areas.
-    The first is the Historic England dataset, which is a shapefile containing
-    polygons of conservation areas. The second is the gov.uk dataset, which is a
-    geojson file containing polygons of conservation areas.
-
-    The Historic England dataset can be found here:
-    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
-
-    The listed building dataset is also found at Historic England at:
-    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
-
-    The hertitige buildings dataset is also found at Historic England at:
-    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
-
-    The Gov.uk dataset can be found here:
-    https://www.planning.data.gov.uk/dataset/conservation-area
-
-    For the moment, these data sources are downloaded manually and uploaded to S3.
-    This application then processes those files and writes the results to s3
-    """
-
-    conservation_area_client = ConservationAreaClient(
-        historic_england_path=HISTORIC_ENGLAND_PATHNAME,
-        gov_path=GOV_PATHNAME,
-        bucket=BUCKET
-    )
-    conservation_area_client.read()
-
-    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
-    open_uprn_data = [
-        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
-         'LONGITUDE': -0.0540506},
-        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
-         'LONGITUDE': -0.0498772},
-        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
-         'LONGITUDE': -0.226392},
-        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
-         'LONGITUDE': -0.0792445},
-        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
-         'LONGITUDE': -0.0468833},
-        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
-         'LONGITUDE': -0.1362513},
-        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-         'LONGITUDE': -0.0823165}
-    ]
-
-    open_uprn_data = [
-        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
-         'LONGITUDE': -0.0823165}
-    ]
-
-    result = [
-        {
-            "uprn": coordinates["UPRN"],
-            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
-                OpenUprnCoordinateData(**coordinates))
-        } for coordinates in
-        open_uprn_data
-    ]
-
-    # TODO: Add a method to write to the database
--- a/etl/conservation_areas/ConservationAreaClient.py
+++ b/etl/conservation_areas/ConservationAreaClient.py
@ -1,8 +1,8 @@
 import boto3
 import os
 import tempfile
-import pandas as pd
 import geopandas as gpd
+import numpy as np
 from enum import Enum
 from shapely.geometry import Point
 from utils.logger import setup_logger
@ -61,9 +61,9 @@ class ConservationAreaClient:
    """

    SOURCES = ["historic_england"]
-    IN_CONSERVATION_AREA = "in_conservation_area"
-    NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
-    UNKNOWN = "unknown"
+    IN_CONSERVATION_AREA = True
+    NOT_IN_CONSERVATION_AREA = False
+    UNKNOWN = None

    def __init__(self, historic_england_path, gov_path, bucket):
        self.historic_england_path = historic_england_path
@ -91,6 +91,8 @@ class ConservationAreaClient:
            )
        )
        self.gov_data = self.gov_data.drop(columns=["dataset"])
+        # Convert the gov data to british national grid co-ordinates
+        self.gov_data = self.gov_data.to_crs("EPSG:27700")

    def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):

@ -123,6 +125,43 @@ class ConservationAreaClient:
            else:
                return ConservationAreaClient.UNKNOWN

+    def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+
+        joined_gdf_he = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
+
+        # Identify where we have definitive information (not "unknown")
+        in_conservation_he = ~joined_gdf_he.index_right.isna() & (
+            joined_gdf_he["NAME"] != "No data available for publication by HE"
+        )
+
+        uprn_in_conservation_he = joined_gdf_he[in_conservation_he]["UPRN"].unique()
+        # The right index will be missing when we don't have a match so the uprn is not in a conservation
+        # area
+        uprn_not_in_conservation_he = joined_gdf_he.loc[
+            ~joined_gdf_he["UPRN"].isin(uprn_in_conservation_he) & joined_gdf_he.index_right.isna(),
+            "UPRN"
+        ].unique()
+
+        # For every UPRN not confirmed by Historic England (no match or unknown), check against government data
+        unknown_uprns = uprn_gdf.loc[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]["UPRN"]
+        unknown_gdf = uprn_gdf[uprn_gdf["UPRN"].isin(unknown_uprns)]
+
+        joined_gdf_gov = gpd.sjoin(unknown_gdf, self.gov_data, how="left", predicate="within")
+        uprn_in_conservation_gov = joined_gdf_gov.loc[~joined_gdf_gov.index_right.isna(), "UPRN"].unique()
+
+        uprn_gdf['conservation_status'] = self.UNKNOWN
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_in_conservation_he), 'conservation_status'
+        ] = self.IN_CONSERVATION_AREA
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_not_in_conservation_he), 'conservation_status'
+        ] = self.NOT_IN_CONSERVATION_AREA
+        uprn_gdf.loc[
+            uprn_gdf["UPRN"].isin(uprn_in_conservation_gov), 'conservation_status'
+        ] = self.IN_CONSERVATION_AREA
+
+        return uprn_gdf
+
    def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
        """
        Check if a property is in a conservation area
--- a/etl/spatial/OpenUprnClient.py
+++ b/etl/spatial/OpenUprnClient.py
@ -0,0 +1,95 @@
+from tqdm import tqdm
+import pandas as pd
+import geopandas as gpd
+from utils.logger import setup_logger
+from utils.s3 import read_io_from_s3
+
+logger = setup_logger()
+
+
+class OpenUprnClient:
+    """
+
+    This client reads in the Open UPRN data from s3 which can be downloaded from here:
+    https://osdatahub.os.uk/downloads/open/OpenUPRN
+
+    This dataset contains a lookup of UPRNs to coordinates.
+
+    Specs for this dataset can be found here:
+    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
+    """
+
+    def __init__(self, path, bucket, uprns=None):
+        self.path = path
+        self.bucket = bucket
+        self.uprns = [int(x) for x in uprns] if uprns else None
+        self.data = None
+
+        # This will be stored in S3 and will be the complete list of filenames
+        # We'll then use this to determine which file the UPRN's data is contained in
+        self.filenames = None
+
+    def read(self):
+        """
+        Read the Open UPRN csv from S3, keeping only the requested UPRNs when any were given
+        :return:
+        """
+        logger.info("Reading in open uprn data")
+
+        df = pd.read_csv(
+            read_io_from_s3(
+                bucket_name=self.bucket,
+                file_key=self.path
+            )
+        )
+        if self.uprns:
+            df = df[df["UPRN"].isin(self.uprns)]
+
+        self.data = df
+
+    def read_local(self):
+        """
+        For local testing
+        :return:
+        """
+        logger.info("Reading in open uprn data")
+
+        df = pd.read_csv(self.path)
+        if self.uprns:
+            df = df[df["UPRN"].isin(self.uprns)]
+
+        self.data = df
+
+    def create_file_partitions(self, partition_size=50000):
+        logger.info("Sorting data by UPRN ascending")
+        self.data = self.data.sort_values("UPRN", ascending=True)
+
+        logger.info("Creating partitions")
+        self.data['partition'] = self.data.index // partition_size
+
+        self.filenames = {}
+        for partition, group in tqdm(self.data.groupby('partition')):
+            min_uprn = group['UPRN'].min()
+            max_uprn = group['UPRN'].max()
+            self.filenames[partition] = f"{min_uprn}_{max_uprn}.csv"
+
+        self.data['filename'] = self.data['partition'].map(self.filenames)
+
+    @staticmethod
+    def find_filename_for_uprn(uprn, filenames):
+        for filename in filenames:
+            min_uprn, max_uprn = map(int, filename.replace(".csv", "").split("_"))
+            if min_uprn <= uprn <= max_uprn:
+                return filename
+        return None
+
+    @staticmethod
+    def convert_bng_data_to_gpd(df):
+        
+        gpd_data = gpd.GeoDataFrame(
+            df,
+            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
+            crs="EPSG:27700"  # British National Grid
+        )
+
+        return gpd_data
--- a/etl/spatial/SpecialBuildingsClient.py
+++ b/etl/spatial/SpecialBuildingsClient.py
@ -0,0 +1,114 @@
+import geopandas as gpd
+from shapely.geometry import Point
+from utils.logger import setup_logger
+from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
+from datatypes.datatypes import OpenUprnCoordinateData
+
+logger = setup_logger()
+
+
+class SpecialBuildingsClient:
+    """
+    This class reads in data from Historic England, which can be used to determine if specific buildings are
+    listed or heritage buildings
+    """
+
+    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
+        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
+        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
+        self.bucket = bucket
+
+        self.historic_england_listed_buildings = None
+        self.historic_england_heritage_buildings = None
+
+    def read(self):
+        """
+        Read the data
+        """
+        logger.info("Reading in historic england listed buildings shapefile")
+        self.historic_england_listed_buildings = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
+        )
+
+        logger.info("Reading in historic england heritage buildings shapefile")
+        self.historic_england_heritage_buildings = read_shapefile_from_s3(
+            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
+        )
+
+        # Convert the heritage buildings data to British National Grid co-ordinates
+        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")
+
+    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
+        """
+        Check if a location specified by British National Grid coordinates is a listed building.
+
+        :param coordinates: OpenUprnCoordinateData whose X_COORDINATE and Y_COORDINATE attributes are read
+        :return: True if the location is within a listed building polygon, False otherwise
+        """
+        # Convert the coordinates to a Shapely Point object
+        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
+
+        # Check if the point is within any of the listed building polygons
+        within_listed_buildings = self.historic_england_listed_buildings.contains(point)
+
+        if within_listed_buildings.any():
+            # If the point is within any listed building polygon, log the names of the buildings and return
+            # True
+            names = self.historic_england_listed_buildings.loc[within_listed_buildings, "Name"]
+            logger.info(f"The location is within the following listed buildings: {names.values}")
+            return True
+
+        # If the point is not within any listed building polygon, return False
+        return False
+
+    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+        # Check against historic England listed buildings data
+        joined_gdf_listed = gpd.sjoin(uprn_gdf, self.historic_england_listed_buildings, how="left", predicate="within")
+
+        # Identify where we have matches
+        uprn_is_listed = joined_gdf_listed[~joined_gdf_listed.index_right.isna()]["UPRN"].unique()
+
+        # Populate the results in the input GeoDataFrame
+        uprn_gdf['is_listed_building'] = False
+        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_listed), 'is_listed_building'] = True
+
+        return uprn_gdf
+
+    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
+        """
+        Check if a location specified by British National Grid coordinates is a heritage building at risk.
+
+        :param coordinates: OpenUprnCoordinateData whose X_COORDINATE and Y_COORDINATE attributes are read
+        :return: True if the location is within a heritage building at risk polygon,
+                 False otherwise
+        """
+        # Convert the coordinates to a Shapely Point object
+        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
+
+        # Check if the point is within any of the heritage building at risk polygons
+        within_heritage_buildings_at_risk = self.historic_england_heritage_buildings.contains(point)
+
+        if within_heritage_buildings_at_risk.any():
+            # If the point is within any heritage building at risk polygon, log the names of the buildings and return
+            # True
+            names = self.historic_england_heritage_buildings.loc[within_heritage_buildings_at_risk, "EntryName"]
+            logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
+            return True
+
+        # If the point is not within any heritage building at risk polygon, return False
+        return False
+
+    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
+        # Check against historic England heritage buildings data
+        joined_gdf_heritage = gpd.sjoin(
+            uprn_gdf, self.historic_england_heritage_buildings, how="left", predicate="within"
+        )
+
+        # Identify where we have matches
+        uprn_is_heritage = joined_gdf_heritage[~joined_gdf_heritage.index_right.isna()]["UPRN"].unique()
+
+        # Populate the results in the input GeoDataFrame
+        uprn_gdf['is_heritage_building'] = False
+        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprn_is_heritage), 'is_heritage_building'] = True
+
+        return uprn_gdf
--- a/etl/spatial/app.py
+++ b/etl/spatial/app.py
@ -0,0 +1,151 @@
+"""
+This application reads in the open uprn data from a static location and loads it into
+our database for querying from other services
+"""
+
+import os
+from tqdm import tqdm
+import pandas as pd
+from etl.spatial.ConservationAreaClient import ConservationAreaClient
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
+from datatypes.datatypes import OpenUprnCoordinateData
+from utils.logger import setup_logger
+from utils.s3 import save_dataframe_to_s3_parquet
+
+BUCKET = "retrofit-datalake-dev"
+OUTPUT_BUCKET = "retrofit-dev-dev"
+HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
+GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
+OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
+HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
+                                             "NHLE)/Listed_Building_polygons.shp"
+HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
+    "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
+
+logger = setup_logger()
+
+
+def app():
+    # TODO: Store the input data in S3 [x]
+    #       Read the input data from S3 [x]
+    #       Document the data source and where to find it [x]
+    #       Incorporate listed buildings [x]
+    #       Incorporate heritage buildings [x]
+    #       Write the outputs to S3 [ ]
+
+    """
+    This application uses the conservation area datasets to determine if a UPRN is
+    in a conservation area or not
+
+    We use two sources of data for determining if homes are in conservation areas.
+    The first is the Historic England dataset, which is a shapefile containing
+    polygons of conservation areas. The second is the gov.uk dataset, which is a
+    geojson file containing polygons of conservation areas.
+
+    The Historic England dataset can be found here:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The listed building dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The heritage buildings dataset is also found at Historic England at:
+    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
+
+    The Gov.uk dataset can be found here:
+    https://www.planning.data.gov.uk/dataset/conservation-area
+
+    The open UPRN data can be found here:
+    https://osdatahub.os.uk/downloads/open/OpenUPRN
+
+    The Office for National Statistics Postcode Lookup can be found here:
+    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about
+
+    For the moment, these data sources are downloaded manually and uploaded to S3.
+    This application then processes those files and writes the results to s3
+    """
+
+    conservation_area_client = ConservationAreaClient(
+        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
+        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
+        bucket=BUCKET
+    )
+    conservation_area_client.read()
+
+    special_buildings_client = SpecialBuildingsClient(
+        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
+        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
+        bucket=BUCKET
+    )
+    special_buildings_client.read()
+
+    # Local version: this rebinds OPEN_UPRN_PATHNAME inside app(), so read() below is given a local path as its S3 key
+    OPEN_UPRN_PATHNAME = "/Users/khalimconn-kowlessar/Documents/hestia/Model/model_data/local_data" \
+                         "/osopenuprn_202306_csv/osopenuprn_202305.csv"
+    open_uprn_client = OpenUprnClient(
+        path=OPEN_UPRN_PATHNAME,
+        bucket=BUCKET
+    )
+    open_uprn_client.read()
+    open_uprn_client.read_local()
+
+    # We want to sort the data and split it into filenames on UPRN.
+    # We'll split the data into chunks of 50,000
+    open_uprn_client.create_file_partitions()
+
+    # special_buildings_client = SpecialBuildingsClient(
+    #     historic_england_listed_buildings_path=None,
+    #     historic_england_heritage_buildings_path=None,
+    #     bucket=None
+    # )
+    # special_buildings_client.historic_england_listed_buildings = \
+    # special_buildings_client2.historic_england_listed_buildings
+    # special_buildings_client.historic_england_heritage_buildings = \
+    #     special_buildings_client2.historic_england_heritage_buildings
+
+    logger.info("Extracting spatial data for uprn partitions")
+    to_loop_over = open_uprn_client.data.groupby("filename")
+
+    for filename, uprn_df in tqdm(open_uprn_client.data.groupby("filename"), total=len(to_loop_over)):
+        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)
+
+        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
+        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
+        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)
+
+        # Convert back to a regular dataframe
+        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
+        uprn_gdf = pd.DataFrame(uprn_gdf)
+
+        save_dataframe_to_s3_parquet(
+            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
+        )
+
+    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
+    open_uprn_data = [
+        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
+         'LONGITUDE': -0.0540506},
+        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
+         'LONGITUDE': -0.0498772},
+        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
+         'LONGITUDE': -0.226392},
+        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
+         'LONGITUDE': -0.0792445},
+        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
+         'LONGITUDE': -0.0792445},
+        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
+         'LONGITUDE': -0.0468833},
+        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
+         'LONGITUDE': -0.1362513},
+        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
+         'LONGITUDE': -0.0823165}
+    ]
+
+    result = [
+        {
+            "uprn": coordinates["UPRN"],
+            "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
+                OpenUprnCoordinateData(**coordinates))
+        } for coordinates in
+        open_uprn_data
+    ]
--- a/etl/conservation_areas/requirements.txt
+++ b/etl/conservation_areas/requirements.txt
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -564,6 +564,12 @@ def app():

    output = pd.concat(dataset)

+    # Remove any records that have huge swings in their floor area
+    output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
+    output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
+    output = output[output["tfa_diff_prop"] < 0.5]
+    output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
+
    uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
    for uvalue_col in uvalue_columns:
        output[uvalue_col] = pd.to_numeric(output[uvalue_col])
@ -571,15 +577,7 @@ def app():
    save_dataframe_to_s3_parquet(
        df=output,
        bucket_name="retrofit-data-dev",
-        file_key="sap_change_model/dataset_without_differencing.parquet",
-    )
-
-    output = DataProcessor.difference_data(output)
-
-    save_dataframe_to_s3_parquet(
-        df=output,
-        bucket_name="retrofit-data-dev",
-        file_key="sap_change_model/dataset_with_differencing.parquet",
+        file_key="sap_change_model/dataset.parquet",
    )


--- a/open_uprn/OpenUprnClient.py
+++ b/open_uprn/OpenUprnClient.py
@ -1,31 +0,0 @@
-import pandas as pd
-from utils.logger import setup_logger
-
-logger = setup_logger()
-
-
-class OpenUprnClient:
-    """
-    Specs for this dataset can be found here:
-    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
-    """
-
-    # TODO: Document this
-
-    def __init__(self, path, uprns=None):
-        self.path = path
-        self.uprns = [int(x) for x in uprns] if uprns else None
-        self.data = None
-
-    def read(self):
-        """
-        This methodology is placeholder, while data sits localls
-        :return:
-        """
-        logger.info("Reading in open uprn data")
-
-        df = pd.read_csv(self.path)
-        if self.uprns:
-            df = df[df["UPRN"].isin(self.uprns)]
-
-        self.data = df
--- a/open_uprn/init.py
+++ b/open_uprn/init.py
--- a/open_uprn/app.py
+++ b/open_uprn/app.py
@ -1,18 +0,0 @@
-"""
-This application reads in the open uprn data from a static location and loads it into
-our database for querying from other services
-"""
-
-import os
-from open_uprn.OpenUprnClient import OpenUprnClient
-
-
-def app():
-    open_uprn_client = OpenUprnClient(
-        path=os.path.abspath(
-            os.path.dirname(__file__)
-        ) + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv",
-    )
-    open_uprn_client.read()
-
-    # TODO: Add a method to write to the database
--- a/open_uprn/requirements.txt
+++ b/open_uprn/requirements.txt
@ -1,13 +0,0 @@
-numpy==1.25.1
-pandas==2.0.3
-python-dateutil==2.8.2
-pytz==2023.3
-six==1.16.0
-tzdata==2023.3
-click==8.1.6
-joblib==1.3.1
-nltk==3.8.1
-regex==2023.6.3
-textblob==0.17.1
-tqdm==4.65.0
-