Model/model_data/simulation_system/area_data.py

"""
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes.
"""
import boto3
import PyPDF2
import tempfile

# S3 bucket that handler() lists and downloads the SAP calculation PDFs from.
bucket = "retrofit-datalake-dev"


def list_files_in_s3_folder(bucket_name, folder_name):
    """
    Collect the keys of every object stored under a folder of an S3 bucket.

    Parameters:
    - bucket_name: Name of the S3 bucket to query.
    - folder_name: Folder (key prefix) inside the bucket; a trailing '/' is
      appended when it is missing.

    Returns:
    - A list with the key of each object found under the prefix.
    """
    # Normalise the prefix so it always ends with a '/'
    prefix = folder_name if folder_name.endswith('/') else folder_name + '/'

    # The listing may span several pages, so walk it through a paginator
    pages = boto3.client('s3').get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    keys = []
    for page in pages:
        # A page has no 'Contents' entry when nothing matched the prefix
        for entry in page.get('Contents', []):
            keys.append(entry['Key'])

    return keys


def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved.

    Returns:
    - local_path, so that callers can use the result of this call directly
      as the location of the downloaded file.
    """

    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)
    body = response['Body']

    try:
        # Stream the object to disk in chunks rather than reading the whole
        # PDF into memory at once
        with open(local_path, 'wb') as f:
            for chunk in body.iter_chunks(chunk_size=1024 * 1024):
                f.write(chunk)
    finally:
        # Always release the underlying HTTP connection, even if writing fails
        body.close()

    return local_path


# Example usage of fetch_pdf_from_s3 (kept as a comment so that importing or
# running this module does not attempt an S3 download with placeholder values):
#
#     fetch_pdf_from_s3(
#         bucket_name='YOUR_BUCKET_NAME',
#         pdf_key='path/to/your/pdf_file.pdf',
#         local_path='local_file_name.pdf',
#     )


def handler():
    files = list_files_in_s3_folder(bucket, "full_sap_calculations")

    # get pdfs
    sap_calulation_pdfs = [file for file in files if file.endswith(".pdf")]

    # For each pdf, we pull out the net & gross wall areas

    data = []
    for sap_calculation_file in sap_calulation_pdfs:
        # Create a temp file to store the PDF
        temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name

        pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)