"""
This script produces the dataset used to model the wall area of properties,
which is used to estimate the cost of insulation measures within homes
"""
import tempfile

import boto3
import PyPDF2

bucket = "retrofit-datalake-dev"


def list_files_in_s3_folder(bucket_name, folder_name):
    """
    List files in a specific S3 bucket and folder.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - folder_name: Name of the folder (prefix) within the bucket. A trailing
      '/' is appended when missing.

    Returns:
    - A list with the 'Key' of every object listed under the prefix.
    """
    # Ensure folder name ends with a '/'
    if not folder_name.endswith('/'):
        folder_name += '/'

    s3_client = boto3.client('s3')

    # The listing is consumed through a paginator, one page at a time
    paginator = s3_client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)

    files = []
    for page in page_iterator:
        # Tolerate pages that carry no 'Contents' entry
        files.extend(item['Key'] for item in page.get('Contents', []))

    return files


def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved.

    Returns:
    - None. The object's bytes are written to local_path.

    Example:
        fetch_pdf_from_s3(
            'YOUR_BUCKET_NAME', 'path/to/your/pdf_file.pdf', 'local_file_name.pdf'
        )
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)

    # Read the PDF bytes and save locally
    with open(local_path, 'wb') as f:
        f.write(response['Body'].read())


# NOTE: the placeholder "Usage" snippet that used to run here at module level
# (calling fetch_pdf_from_s3 with 'YOUR_BUCKET_NAME') has been moved into the
# docstring above, so that loading this script no longer issues an S3 request
# against a placeholder bucket.


def handler():
    files = list_files_in_s3_folder(bucket, "full_sap_calculations")
    # get pdfs
    sap_calulation_pdfs = [file for file in files if file.endswith(".pdf")]

    # For each pdf, we pull out the net & gross wall areas
    data = []
    for sap_calculation_file in sap_calulation_pdfs:
        # Create a temp file to store the PDF
        # NOTE(review): only the name is kept, not the NamedTemporaryFile
        # object - confirm the downloaded file is cleaned up afterwards.
        temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name
        # NOTE(review): fetch_pdf_from_s3 has no return statement, so pdf_file
        # is None here; the downloaded PDF lives at temp_filename.
        pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)