diff --git a/.idea/Model.iml b/.idea/Model.iml
index b03b31b1..44faa37d 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index ca0e1cd9..f0144d5b 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py
index 603b71c9..ee74012b 100644
--- a/model_data/simulation_system/area_data.py
+++ b/model_data/simulation_system/area_data.py
@@ -2,3 +2,83 @@
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes
"""
+import boto3
+import PyPDF2
+import tempfile
+
+bucket = "retrofit-datalake-dev"  # S3 bucket that handler() lists and downloads from
+
+
+def list_files_in_s3_folder(bucket_name, folder_name):
+    """
+    Collect the key of every object stored under a folder of an S3 bucket.
+
+    The listing goes through the 'list_objects_v2' paginator, so the keys of
+    every page of results are gathered rather than only those of the first.
+
+    Parameters:
+    - bucket_name: Name of the S3 bucket to list.
+    - folder_name: Folder (key prefix) to list; a trailing '/' is appended
+      when it is missing.
+
+    Returns:
+    - A list with the full object key of each entry found under the prefix.
+    """
+
+    # Normalise the prefix so that it always ends with '/'
+    prefix = folder_name if folder_name.endswith('/') else folder_name + '/'
+
+    client = boto3.client('s3')
+    pages = client.get_paginator('list_objects_v2').paginate(
+        Bucket=bucket_name,
+        Prefix=prefix,
+    )
+
+    # A page without a 'Contents' entry contributes no keys
+    keys = []
+    for page in pages:
+        for entry in page.get('Contents', []):
+            keys.append(entry['Key'])
+
+    return keys
+
+
+def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
+    """
+    Stream a PDF from an S3 bucket into a local file, chunk by chunk.
+
+    Parameters:
+    - bucket_name: Name of the S3 bucket.
+    - pdf_key: Path (key) of the PDF file within the bucket.
+    - local_path: Local path where the PDF should be saved.
+    """
+    body = boto3.client('s3').get_object(Bucket=bucket_name, Key=pdf_key)['Body']
+    try:
+        with open(local_path, 'wb') as f:
+            for chunk in body.iter_chunks(chunk_size=64 * 1024):
+                f.write(chunk)
+    finally:
+        body.close()  # release the HTTP connection even if writing fails
+
+
+# Usage example (kept as a comment so that importing this module never calls S3):
+#   bucket_name = 'YOUR_BUCKET_NAME'
+#   pdf_key = 'path/to/your/pdf_file.pdf'
+#   local_path = 'local_file_name.pdf'
+#   fetch_pdf_from_s3(bucket_name, pdf_key, local_path)
+
+
+def handler():
+    """Download every SAP calculation PDF from S3, ready for wall-area extraction.
+
+    Each PDF under the "full_sap_calculations" prefix is fetched into its own
+    temporary directory, which is removed again once the PDF has been handled.
+    """
+    files = list_files_in_s3_folder(bucket, "full_sap_calculations")
+    sap_calculation_pdfs = [file for file in files if file.endswith(".pdf")]
+
+    for sap_calculation_file in sap_calculation_pdfs:
+        # A private directory avoids the name-reuse race of NamedTemporaryFile(...).name
+        with tempfile.TemporaryDirectory() as temp_dir:
+            fetch_pdf_from_s3(bucket, sap_calculation_file, f"{temp_dir}/sap_calculation.pdf")
+            # TODO: pull the net & gross wall areas out of the downloaded PDF here
diff --git a/model_data/simulation_system/requirements/area_data.txt b/model_data/simulation_system/requirements/area_data.txt
new file mode 100644
index 00000000..f6bff53c
--- /dev/null
+++ b/model_data/simulation_system/requirements/area_data.txt
@@ -0,0 +1,2 @@
+boto3==1.28.38
+PyPDF2==3.0.1
\ No newline at end of file