mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
import boto3
|
|
import os
|
|
|
|
|
|
def print_hello_from_etl_module():
    """Print a fixed greeting line to standard output.

    Takes no arguments and returns ``None``.
    """
    greeting = "You are printing from a etl module we made in poetry"
    print(greeting)
|
|
|
|
def split_s3_url(s3_url):
    """Split an ``s3://bucket/key`` URL into its bucket name and object key.

    Only the first ``/`` after the scheme separates the bucket from the key,
    so the key keeps any further slashes, spaces, or other characters exactly
    as given.

    Args:
        s3_url: String of the form ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket, key)`` tuple of strings.

    Raises:
        ValueError: If the URL does not start with ``s3://``, or if the
            bucket name or the key is missing or empty.
    """
    scheme = "s3://"
    if not s3_url.startswith(scheme):
        raise ValueError("Invalid S3 URL. Must start with 's3://'")

    # partition() always yields three parts, so a missing "/" simply leaves
    # the key empty and is caught by the same check as "s3://bucket/".
    bucket, _, key = s3_url[len(scheme):].partition("/")

    if not bucket:
        raise ValueError("S3 URL must include a bucket name after 's3://'")
    if not key:
        raise ValueError("S3 URL must include a key after the bucket name")

    return bucket, key
|
|
|
|
def create_temp_file(content_bytes, relative_path):
    """Write ``content_bytes`` to a file located under ``/tmp/s3``.

    Missing parent directories are created. The destination is always kept
    inside ``/tmp/s3``: leading slashes in ``relative_path`` are dropped and
    paths that would climb out of the base directory are rejected.

    Args:
        content_bytes: Bytes to write to the file.
        relative_path: Path of the file relative to ``/tmp/s3``; may contain
            sub-directories.

    Returns:
        The full path of the file that was written.

    Raises:
        ValueError: If ``relative_path`` resolves to a location outside
            ``/tmp/s3`` (for example through ``..`` segments).
    """
    base_dir = "/tmp/s3"

    # os.path.join discards base_dir when the second argument is absolute,
    # so strip leading slashes to keep the result anchored under base_dir.
    full_path = os.path.join(base_dir, relative_path.lstrip("/"))

    # Lexically normalise both sides and make sure ".." segments did not
    # move the destination out of the base directory.
    base_abs = os.path.abspath(base_dir)
    if os.path.commonpath([base_abs, os.path.abspath(full_path)]) != base_abs:
        raise ValueError(
            f"Refusing to write outside {base_dir}: {relative_path!r}"
        )

    # Make sure the directory exists
    os.makedirs(os.path.dirname(full_path), exist_ok=True)

    # Write content to file
    with open(full_path, 'wb') as temp_file:
        temp_file.write(content_bytes)

    print(f"Temporary file created at: {full_path}")
    return full_path
|
|
|
|
def download_data_from_s3(s3_uri):
    """Fetch the object named by ``s3_uri`` and store it as a local file.

    The URI is split with ``split_s3_url``; the whole object body is read
    into memory and handed to ``create_temp_file``, using the object key as
    the relative path.

    Args:
        s3_uri: URI of the object, in the form accepted by ``split_s3_url``.

    Returns:
        Whatever ``create_temp_file`` returns for the written file.
    """
    s3_resource = boto3.resource('s3')
    bucket, key = split_s3_url(s3_uri)

    response = s3_resource.Object(bucket, key).get()
    payload = response['Body'].read()

    # The full S3 key doubles as the relative path of the local copy.
    local_path = create_temp_file(payload, key)
    return local_path
|
|
|
|
# Example usage
|
|
# download_data_from_s3("s3://retrofit-energy-assessments-dev/JAFFERSONS ENERGY CONSULTANTS/VDE001/12103116/docs & plans/77 Perryn Road, W3 7LT EPR.pdf")
|