mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
84 lines
2.3 KiB
Python
84 lines
2.3 KiB
Python
"""
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes.
"""
|
|
import os
import tempfile

import boto3
import PyPDF2
|
|
|
|
# Name of the S3 bucket that handler() lists and downloads PDFs from
bucket = "retrofit-datalake-dev"
|
|
|
|
|
|
def list_files_in_s3_folder(bucket_name, folder_name):
    """
    Return the keys of every object stored under a folder of an S3 bucket.

    Parameters:
    - bucket_name: Name of the S3 bucket to list.
    - folder_name: Folder (key prefix) to look under; a trailing '/' is
      appended when it is missing.

    Returns:
    - A list with the full key of each object found under the prefix.
    """
    # Normalise the prefix so it always ends with a '/'
    prefix = folder_name if folder_name.endswith('/') else folder_name + '/'

    client = boto3.client('s3')

    # The listing is paginated, so walk every page of results
    pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    # 'Contents' may be absent from a page, hence the empty-list default
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])
    ]
|
|
|
|
|
|
def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved.

    Returns:
    - local_path, so the caller can use the result of the call directly.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)

    # Copy the object to disk chunk by chunk rather than reading the whole
    # PDF into memory, and close the response stream once done
    body = response['Body']
    try:
        with open(local_path, 'wb') as f:
            for chunk in body.iter_chunks(chunk_size=1024 * 1024):
                f.write(chunk)
    finally:
        body.close()

    return local_path
|
|
|
|
|
|
# Example usage (kept as a comment: running it at import time would attempt an
# S3 download with these placeholder values before handler() is defined):
#
#     fetch_pdf_from_s3(
#         bucket_name='YOUR_BUCKET_NAME',
#         pdf_key='path/to/your/pdf_file.pdf',
#         local_path='local_file_name.pdf',
#     )
|
|
|
|
|
|
def handler():
    """
    Download every SAP calculation PDF from the data lake bucket.

    Lists the objects under the "full_sap_calculations" folder of `bucket`,
    keeps the keys ending in ".pdf" and fetches each one into a temporary
    directory that is removed when the function finishes.
    """
    files = list_files_in_s3_folder(bucket, "full_sap_calculations")

    # get pdfs
    sap_calculation_pdfs = [file for file in files if file.endswith(".pdf")]

    # For each pdf, we pull out the net & gross wall areas
    # TODO: the extraction itself is not implemented yet, so `data` stays empty
    data = []

    # A TemporaryDirectory owns the download location and cleans it up on exit.
    # Taking only the .name of a NamedTemporaryFile lets the file object be
    # finalised (and the file deleted) immediately, leaving an unreserved path
    # whose re-created file is never removed.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Each PDF overwrites the previous one, so disk usage stays bounded
        temp_filename = os.path.join(temp_dir, "sap_calculation.pdf")

        for sap_calculation_file in sap_calculation_pdfs:
            fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)
|