Model/model_data/simulation_system/area_data.py
2023-08-31 13:45:06 +01:00

84 lines
2.3 KiB
Python

"""
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes
"""
import boto3
import PyPDF2
import tempfile
# S3 bucket that handler() lists and downloads the SAP calculation PDFs from.
bucket = "retrofit-datalake-dev"
def list_files_in_s3_folder(bucket_name, folder_name):
    """
    List the object keys stored under a folder (key prefix) of an S3 bucket.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - folder_name: Name of the folder (prefix) within the bucket. A trailing
      '/' is appended when missing so that only keys inside that folder
      match. An empty string lists every object in the bucket.

    Returns:
    - A list with the 'Key' of every object found under the prefix, across
      all result pages. The list is empty when nothing matches.
    """
    # Only normalise a non-empty folder name: turning '' into '/' would
    # restrict the listing to keys that literally start with a slash instead
    # of listing the whole bucket.
    prefix = folder_name
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    s3_client = boto3.client('s3')

    # A paginator is used so that every result page is visited, not just the
    # first one.
    paginator = s3_client.get_paginator('list_objects_v2')

    files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page without a 'Contents' entry contributes nothing.
        files.extend(item['Key'] for item in page.get('Contents', []))
    return files
def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved. An existing file
      at this path is overwritten.

    Returns:
    - local_path, so that callers can use the result of the call directly as
      the location of the downloaded file.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)
    body = response['Body']
    try:
        with open(local_path, 'wb') as f:
            # Copy in chunks so the whole object never has to sit in memory.
            for chunk in body.iter_chunks(chunk_size=1024 * 1024):
                f.write(chunk)
    finally:
        # Close the response stream even if writing the file fails.
        body.close()
    return local_path
# Example usage of fetch_pdf_from_s3. The values below are placeholders, so the
# example is kept as a comment: executing it at import time would attempt an S3
# download from a bucket named 'YOUR_BUCKET_NAME' every time this module is loaded.
#
#     fetch_pdf_from_s3(
#         'YOUR_BUCKET_NAME',
#         'path/to/your/pdf_file.pdf',
#         'local_file_name.pdf',
#     )
def handler():
files = list_files_in_s3_folder(bucket, "full_sap_calculations")
# get pdfs
sap_calulation_pdfs = [file for file in files if file.endswith(".pdf")]
# For each pdf, we pull out the net & gross wall areas
data = []
for sap_calculation_file in sap_calulation_pdfs:
# Create a temp file to store the PDF
temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name
pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)