mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added s3 download for conservation area data
This commit is contained in:
parent
642a224a7b
commit
64b6b67499
6 changed files with 142 additions and 28 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -261,3 +261,6 @@ model_data/simulation_system/predictions/
|
|||
|
||||
.idea/Model.iml
|
||||
.idea/misc.iml
|
||||
|
||||
adhoc
|
||||
adhoc/*
|
||||
|
|
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
|
|||
if sap_points <= 0 or sap_points > 100:
|
||||
raise ValueError("SAP points should be between 1 and 100.")
|
||||
|
||||
if sap_points > 91:
|
||||
if sap_points >= 92:
|
||||
return "A"
|
||||
elif sap_points > 80:
|
||||
elif sap_points >= 81:
|
||||
return "B"
|
||||
elif sap_points > 69:
|
||||
elif sap_points >= 69:
|
||||
return "C"
|
||||
elif sap_points > 55:
|
||||
elif sap_points >= 55:
|
||||
return "D"
|
||||
elif sap_points > 39:
|
||||
elif sap_points >= 39:
|
||||
return "E"
|
||||
elif sap_points > 21:
|
||||
elif sap_points >= 21:
|
||||
return "F"
|
||||
else:
|
||||
return "G"
|
||||
|
|
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
|
|||
elif epc == "B":
|
||||
return 81
|
||||
elif epc == "C":
|
||||
return 70
|
||||
return 69
|
||||
elif epc == "D":
|
||||
return 56
|
||||
return 55
|
||||
elif epc == "E":
|
||||
return 40
|
||||
return 39
|
||||
elif epc == "F":
|
||||
return 22
|
||||
return 21
|
||||
elif epc == "G":
|
||||
return 1
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -1,12 +1,55 @@
|
|||
from enum import Enum
|
||||
import boto3
|
||||
import os
|
||||
import tempfile
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from enum import Enum
|
||||
from shapely.geometry import Point
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_io_from_s3
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    Every object stored directly in the same S3 "folder" as the requested file is
    downloaded into a temporary directory, and the requested file is then opened
    from that directory with geopandas. Downloading the siblings is presumably
    needed because a shapefile relies on companion files next to the .shp file.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The S3 key of the shape file (e.g. "folder/areas.shp")
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: If no objects are found in the folder of s3_file_key
    """
    s3_folder_key, _, shape_file_name = s3_file_key.rpartition("/")
    # A trailing slash stops the prefix "a/b" from also matching "a/b_old/..."
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    s3_client = boto3.client('s3')

    # TemporaryDirectory creates the directory itself and removes it on exit
    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Creating temporary directory at %s" % tmpdirname)

        # Paginate so folders with more than one page of keys are fully listed.
        # Delimiter="/" limits the listing to direct children of the folder, so
        # nested objects with the same base name cannot overwrite each other.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/")

        downloaded = 0
        for page in pages:
            # 'Contents' is absent from the response when nothing matches
            for s3_object in page.get('Contents', []):
                file_key = s3_object['Key']
                file_name = os.path.basename(file_key)
                if not file_name:
                    # Folder placeholder object (key ends with "/"): nothing to download
                    continue

                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, 'wb') as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
                downloaded += 1

        if downloaded == 0:
            raise FileNotFoundError(
                "No objects found in s3://%s/%s" % (bucket_name, prefix)
            )

        # Read the shapefile while the temporary directory still exists
        shapefile_path = os.path.join(tmpdirname, shape_file_name)
        gdf = gpd.read_file(shapefile_path)

    return gdf
|
||||
|
||||
|
||||
class ConservationAreaClient:
|
||||
"""
|
||||
Class to interact with and manipulate conservation area data. The historic england data
|
||||
|
|
@ -22,9 +65,10 @@ class ConservationAreaClient:
|
|||
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
def __init__(self, historic_england_path, gov_path):
|
||||
def __init__(self, historic_england_path, gov_path, bucket):
|
||||
self.historic_england_path = historic_england_path
|
||||
self.gov_path = gov_path
|
||||
self.bucket = bucket
|
||||
|
||||
self.historic_england_data = None
|
||||
self.gov_data = None
|
||||
|
|
@ -34,10 +78,18 @@ class ConservationAreaClient:
|
|||
Read the data
|
||||
"""
|
||||
logger.info("Reading in historic england conservation area shapefile")
|
||||
self.historic_england_data = gpd.read_file(self.historic_england_path)
|
||||
self.historic_england_data = read_shapefile_from_s3(
|
||||
bucket_name=self.bucket, s3_file_key=self.historic_england_path
|
||||
)
|
||||
|
||||
logger.info("Reading in Govenment conservation area geojson")
|
||||
self.gov_data = gpd.read_file(self.gov_path)
|
||||
|
||||
self.gov_data = gpd.read_file(
|
||||
read_io_from_s3(
|
||||
bucket_name=self.bucket,
|
||||
file_key=self.gov_path
|
||||
)
|
||||
)
|
||||
self.gov_data = self.gov_data.drop(columns=["dataset"])
|
||||
|
||||
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
|
||||
|
|
@ -3,19 +3,49 @@ This application reads in the open uprn data from a static location and loads it
|
|||
our database for querying from other services
|
||||
"""
|
||||
|
||||
import os
|
||||
from conservation_areas.ConservationAreaClient import ConservationAreaClient
|
||||
from etl.conservation_areas.ConservationAreaClient import ConservationAreaClient
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
BUCKET = "retrofit-data-dev"
|
||||
HISTORIC_ENGLAND_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
|
||||
GOV_PATHNAME = "spatial/gov-conservation-area.geojson"
|
||||
|
||||
|
||||
def app():
|
||||
# TODO: Store the input data in S3 [x]
|
||||
# Read the input data from S3 [ ]
|
||||
# Document the data source and where to find it [x]
|
||||
# Write the outputs to S3
|
||||
|
||||
"""
|
||||
This application uses the conservation area datasets to determine if a UPRN is
|
||||
in a conservation area or not
|
||||
|
||||
We use two sources of data for determining if homes are in conservation areas.
|
||||
The first is the Historic England dataset, which is a shapefile containing
|
||||
polygons of conservation areas. The second is the gov.uk dataset, which is a
|
||||
geojson file containing polygons of conservation areas.
|
||||
|
||||
The Historic England dataset can be found here:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The listed building dataset is also found at Historic England at:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The heritage buildings dataset is also found at Historic England at:
|
||||
https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e
|
||||
|
||||
The Gov.uk dataset can be found here:
|
||||
https://www.planning.data.gov.uk/dataset/conservation-area
|
||||
|
||||
For the moment, these data sources are downloaded manually and uploaded to S3.
|
||||
This application then processes those files and writes the results to s3
|
||||
"""
|
||||
|
||||
conservation_area_client = ConservationAreaClient(
|
||||
historic_england_path=os.path.abspath(
|
||||
os.path.dirname(__file__)
|
||||
) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
|
||||
gov_path=os.path.abspath(
|
||||
os.path.dirname(__file__)
|
||||
) + "/model_data/local_data/gov-conservation-area.geojson"
|
||||
historic_england_path=HISTORIC_ENGLAND_PATHNAME,
|
||||
gov_path=GOV_PATHNAME,
|
||||
bucket=BUCKET
|
||||
)
|
||||
conservation_area_client.read()
|
||||
|
||||
|
|
@ -39,6 +69,11 @@ def app():
|
|||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
open_uprn_data = [
|
||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
||||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
result = [
|
||||
{
|
||||
"uprn": coordinates["UPRN"],
|
||||
36
utils/s3.py
36
utils/s3.py
|
|
@ -2,6 +2,9 @@ import boto3
|
|||
from io import BytesIO
|
||||
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
|
||||
import pandas as pd
|
||||
from utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def read_from_s3(bucket_name, s3_file_name):
|
||||
|
|
@ -46,6 +49,27 @@ def save_data_to_s3(data, bucket_name, s3_file_name):
|
|||
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
|
||||
|
||||
|
||||
def read_io_from_s3(bucket_name, file_key):
    """
    Fetch an object from S3 and return its contents as an in-memory BytesIO buffer.

    The buffer is returned unparsed so that the caller can hand it to whichever
    reader suits the file format.

    :param bucket_name: The name of the S3 bucket
    :param file_key: The key of the object in S3
    :return: BytesIO buffer holding the full contents of the object
    """
    # Fetch the object and load its whole body into memory in one go
    s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_key)
    return BytesIO(s3_object['Body'].read())
|
||||
|
||||
|
||||
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
|
||||
"""
|
||||
Save a pandas DataFrame to S3 as a Parquet file.
|
||||
|
|
@ -75,14 +99,14 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key):
|
|||
:return: A pandas DataFrame.
|
||||
"""
|
||||
|
||||
# Create the boto3 client
|
||||
client = boto3.client('s3')
|
||||
if not file_key.endswith(".parquet"):
|
||||
raise logger.warning("This file doesn't look like a parquet file")
|
||||
|
||||
# Get the Parquet file from S3
|
||||
response = client.get_object(Bucket=bucket_name, Key=file_key)
|
||||
parquet_buffer = read_io_from_s3(
|
||||
bucket_name=bucket_name,
|
||||
file_key=file_key
|
||||
)
|
||||
|
||||
# Read the file into a pandas DataFrame
|
||||
parquet_buffer = BytesIO(response['Body'].read())
|
||||
df = pd.read_parquet(parquet_buffer)
|
||||
|
||||
return df
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue