diff --git a/.idea/Model.iml b/.idea/Model.iml
index b03b31b1..44faa37d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index ca0e1cd9..f0144d5b 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py
index 603b71c9..ee74012b 100644
--- a/model_data/simulation_system/area_data.py
+++ b/model_data/simulation_system/area_data.py
@@ -2,3 +2,98 @@
 This script produces the dataset used to model the wall area of properties, which is used to estimate the cost of
 insulation measures within homes
 """
+import os
+import tempfile
+
+import boto3
+import PyPDF2
+
+bucket = "retrofit-datalake-dev"
+
+
+def list_files_in_s3_folder(bucket_name, folder_name, s3_client=None):
+    """
+    List the object keys stored under a folder (prefix) of an S3 bucket.
+
+    Parameters:
+    - bucket_name: Name of the S3 bucket.
+    - folder_name: Name of the folder (prefix) within the bucket. A trailing '/' is added if missing.
+    - s3_client: Optional boto3 S3 client to reuse; a new one is created when omitted.
+
+    Returns:
+    - A list of the full object keys found under the specified folder.
+    """
+
+    # Ensure folder name ends with a '/' so that only keys inside the folder match
+    if not folder_name.endswith('/'):
+        folder_name += '/'
+
+    if s3_client is None:
+        s3_client = boto3.client('s3')
+
+    # The listing is returned in pages, so paginate to collect every key
+    paginator = s3_client.get_paginator('list_objects_v2')
+    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_name)
+
+    files = []
+    for page in page_iterator:
+        # A page may carry no 'Contents' entry, in which case it contributes nothing
+        files.extend(item['Key'] for item in page.get('Contents', []))
+
+    return files
+
+
+def fetch_pdf_from_s3(bucket_name, pdf_key, local_path, s3_client=None):
+    """
+    Fetch a PDF from an S3 bucket and save it locally.
+
+    Parameters:
+    - bucket_name: Name of the S3 bucket.
+    - pdf_key: Path (key) of the PDF file within the bucket.
+    - local_path: Local path where the PDF should be saved.
+    - s3_client: Optional boto3 S3 client to reuse; a new one is created when omitted.
+
+    Returns:
+    - local_path, so that callers can open the downloaded file directly.
+    """
+
+    if s3_client is None:
+        s3_client = boto3.client('s3')
+
+    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)
+
+    # Read the PDF bytes and save locally
+    with open(local_path, 'wb') as f:
+        f.write(response['Body'].read())
+
+    return local_path
+
+
+def handler():
+    """
+    Download every SAP calculation PDF from the datalake, ready for the wall areas to be extracted.
+
+    Returns:
+    - The list of extracted records. Extraction is not implemented yet, so this is currently empty.
+    """
+
+    s3_client = boto3.client('s3')
+    files = list_files_in_s3_folder(bucket, "full_sap_calculations", s3_client=s3_client)
+
+    # get pdfs
+    sap_calculation_pdfs = [file for file in files if file.endswith(".pdf")]
+
+    # For each pdf, we pull out the net & gross wall areas
+    data = []
+
+    # Download into a temporary directory that is removed, with its contents, when the block exits
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_filename = os.path.join(temp_dir, "sap_calculation.pdf")
+
+        for sap_calculation_file in sap_calculation_pdfs:
+            # The same temporary path is overwritten for each PDF
+            pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename, s3_client=s3_client)
+
+            # TODO: read pdf_file with PyPDF2 and append the net & gross wall areas to data
+
+    return data
diff --git a/model_data/simulation_system/requirements/area_data.txt b/model_data/simulation_system/requirements/area_data.txt
new file mode 100644
index 00000000..f6bff53c
--- /dev/null
+++ b/model_data/simulation_system/requirements/area_data.txt
@@ -0,0 +1,2 @@
+boto3==1.28.38
+PyPDF2==3.0.1
\ No newline at end of file