# survey-extraction/etl/read_stuff_from_s3_example.py

import boto3
import os


def print_hello_from_etl_module():
    """Print a fixed greeting to standard output.

    Takes no arguments and returns ``None``; the only effect is the printed
    line.
    """
    greeting = "You are printing from a etl module we made in poetry"
    print(greeting)

def split_s3_url(s3_url):
    """Split an ``s3://bucket/key`` URL into its bucket name and object key.

    Only the first ``/`` after the scheme separates the bucket from the key,
    so any further slashes (and spaces or other characters) stay in the key.

    Args:
        s3_url: String of the form ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket, key)`` tuple of strings, both non-empty.

    Raises:
        ValueError: If the URL does not start with ``s3://``, if no
            non-empty key follows the bucket, or if the bucket name is empty.
    """
    scheme = "s3://"
    if not s3_url.startswith(scheme):
        raise ValueError("Invalid S3 URL. Must start with 's3://'")

    bucket, separator, key = s3_url[len(scheme):].partition("/")

    # "s3://bucket" has no separator and "s3://bucket/" has an empty key;
    # neither names an object, so both are rejected here.
    if not separator or not key:
        raise ValueError("S3 URL must include a key after the bucket name")
    # "s3:///key" would otherwise yield an empty bucket name.
    if not bucket:
        raise ValueError("S3 URL must include a bucket name")
    return bucket, key

def create_temp_file(content_bytes, relative_path, base_dir="/tmp/s3"):
    """Write bytes to a file located under a base directory.

    Missing parent directories are created. The destination path is printed
    to standard output once the file has been written.

    Args:
        content_bytes: Bytes to write to the file.
        relative_path: Path of the file relative to ``base_dir``; may contain
            sub-directories.
        base_dir: Directory the file is stored under. Defaults to
            ``/tmp/s3``.

    Returns:
        The path of the written file (``base_dir`` joined with
        ``relative_path``).

    Raises:
        ValueError: If ``relative_path`` would resolve to a location outside
            ``base_dir`` (absolute path or ``..`` segments).
    """
    full_path = os.path.join(base_dir, relative_path)

    # os.path.join drops base_dir when relative_path is absolute, and ".."
    # segments can climb out of it, so check the resolved location before
    # touching the filesystem.
    root = os.path.realpath(base_dir)
    resolved = os.path.realpath(full_path)
    if os.path.commonpath([root, resolved]) != root:
        raise ValueError(
            f"Refusing to write outside {base_dir!r}: {relative_path!r}"
        )

    # Make sure the directory exists
    os.makedirs(os.path.dirname(full_path), exist_ok=True)

    # Write content to file
    with open(full_path, 'wb') as temp_file:
        temp_file.write(content_bytes)

    print(f"Temporary file created at: {full_path}")
    return full_path

def download_data_from_s3(s3_uri):
    """Fetch an S3 object and store its contents via ``create_temp_file``.

    The URI is split into bucket and key with ``split_s3_url``; the object's
    body is read fully into memory and then handed to ``create_temp_file``
    with the key as the relative path.

    Args:
        s3_uri: Location of the object, in the form accepted by
            ``split_s3_url``.

    Returns:
        Whatever ``create_temp_file`` returns for the written file.
    """
    s3_resource = boto3.resource('s3')
    bucket, key = split_s3_url(s3_uri)

    response = s3_resource.Object(bucket, key).get()
    payload = response['Body'].read()

    # The S3 key doubles as the relative path of the local copy.
    return create_temp_file(payload, key)

# Example usage
# download_data_from_s3("s3://retrofit-energy-assessments-dev/JAFFERSONS ENERGY CONSULTANTS/VDE001/12103116/docs & plans/77 Perryn Road, W3 7LT EPR.pdf")