mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
example for khalim
This commit is contained in:
parent
bedfc1dab4
commit
982dd3c598
3 changed files with 112 additions and 1 deletions
40
etl/read_stuff_from_s3_example.py
Normal file
40
etl/read_stuff_from_s3_example.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
import boto3
|
||||
import os
|
||||
|
||||
def split_s3_url(s3_url):
    """Split an ``s3://bucket/key`` URL into its bucket name and object key.

    Args:
        s3_url: URL of the form ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket, key)`` tuple of strings. Only the first ``/`` after the
        bucket is treated as a separator, so the key keeps any further
        slashes it contains.

    Raises:
        ValueError: If the URL does not start with ``s3://``, or if the
            bucket name or the key is missing or empty.
    """
    prefix = "s3://"
    if not s3_url.startswith(prefix):
        raise ValueError("Invalid S3 URL. Must start with 's3://'")

    bucket, _, key = s3_url[len(prefix):].partition("/")

    # An empty bucket ("s3:///key") or an empty key ("s3://bucket/") can
    # never address an object, so fail here rather than at the S3 call.
    if not bucket:
        raise ValueError("S3 URL must include a bucket name")
    if not key:
        raise ValueError("S3 URL must include a key after the bucket name")
    return bucket, key
|
||||
|
||||
def create_temp_file(content_bytes, relative_path, base_dir="/tmp/s3"):
    """Write bytes to a file located under ``base_dir``.

    Args:
        content_bytes: Bytes to write to the file.
        relative_path: Path of the file relative to ``base_dir``. Leading
            slashes are ignored so the path is always taken as relative.
        base_dir: Directory under which the file is created. Defaults to
            ``/tmp/s3``.

    Returns:
        The absolute, normalized path of the file that was written.

    Raises:
        ValueError: If ``relative_path`` is empty, ends with ``/`` (names a
            directory rather than a file), or resolves to a location
            outside ``base_dir``.
    """
    base = os.path.abspath(base_dir)

    # os.path.join drops the base entirely when the second component is
    # absolute, so strip leading slashes before joining.
    full_path = os.path.abspath(os.path.join(base, relative_path.lstrip("/")))

    # Refuse anything that is not a file strictly inside the base directory
    # (for example a path using ".." segments to climb out of it).
    escapes_base = os.path.commonpath([base, full_path]) != base
    if full_path == base or relative_path.endswith("/") or escapes_base:
        raise ValueError(
            f"Path {relative_path!r} does not name a file inside {base!r}"
        )

    # Make sure the directory exists
    os.makedirs(os.path.dirname(full_path), exist_ok=True)

    # Write content to file
    with open(full_path, "wb") as temp_file:
        temp_file.write(content_bytes)

    print(f"Temporary file created at: {full_path}")
    return full_path
|
||||
|
||||
def download_data_from_s3(s3_uri):
    """Fetch the object at ``s3_uri`` and store a local copy of it.

    The whole object body is read into memory and then handed to
    ``create_temp_file``, using the object's key as the relative path.

    Args:
        s3_uri: Location of the object, as an ``s3://bucket/key`` URL.

    Returns:
        Whatever ``create_temp_file`` returns for the written file.
    """
    resource = boto3.resource('s3')
    bucket, key = split_s3_url(s3_uri)

    response = resource.Object(bucket, key).get()
    payload = response['Body'].read()

    # The full S3 key doubles as the relative path of the local copy
    local_path = create_temp_file(payload, key)
    return local_path
|
||||
|
||||
# Example usage. Guarded so that importing this module does not trigger a
# download; the call only runs when the file is executed as a script.
if __name__ == "__main__":
    download_data_from_s3("s3://retrofit-energy-assessments-dev/JAFFERSONS ENERGY CONSULTANTS/VDE001/12103116/docs & plans/77 Perryn Road, W3 7LT EPR.pdf")
|
||||
72
poetry.lock
generated
72
poetry.lock
generated
|
|
@ -84,6 +84,46 @@ charset-normalizer = ["charset-normalizer"]
|
|||
html5lib = ["html5lib"]
|
||||
lxml = ["lxml"]
|
||||
|
||||
[[package]]
|
||||
name = "boto3"
|
||||
version = "1.39.6"
|
||||
description = "The AWS SDK for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "boto3-1.39.6-py3-none-any.whl", hash = "sha256:db965dc9019df7b1d20e8d8ab7a653956f275865175a8652419ebfd03de03d83"},
|
||||
{file = "boto3-1.39.6.tar.gz", hash = "sha256:e75bfcd444e199767642f28ef8dc4f972846dc3118e48a7e09f9c458dae2021e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.39.6,<1.40.0"
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
s3transfer = ">=0.13.0,<0.14.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
|
||||
|
||||
[[package]]
|
||||
name = "botocore"
|
||||
version = "1.39.6"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "botocore-1.39.6-py3-none-any.whl", hash = "sha256:9c002724e9b97cec610dbbb3bb019b3248ff6bf58407835621f0461e740af90b"},
|
||||
{file = "botocore-1.39.6.tar.gz", hash = "sha256:d3a6c207d233ddee3289c1d56646047bef18b21a1faebb3d83a6fca149fd0f59"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
jmespath = ">=0.7.1,<2.0.0"
|
||||
python-dateutil = ">=2.1,<3.0.0"
|
||||
urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}
|
||||
|
||||
[package.extras]
|
||||
crt = ["awscrt (==0.23.8)"]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2025.4.26"
|
||||
|
|
@ -717,6 +757,18 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab
|
|||
qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"]
|
||||
testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "jmespath"
|
||||
version = "1.0.1"
|
||||
description = "JSON Matching Expressions"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"},
|
||||
{file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jupyter-client"
|
||||
version = "8.6.3"
|
||||
|
|
@ -1682,6 +1734,24 @@ urllib3 = ">=1.21.1,<3"
|
|||
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||
|
||||
[[package]]
|
||||
name = "s3transfer"
|
||||
version = "0.13.0"
|
||||
description = "An Amazon S3 Transfer Manager"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "s3transfer-0.13.0-py3-none-any.whl", hash = "sha256:0148ef34d6dd964d0d8cf4311b2b21c474693e57c2e069ec708ce043d2b527be"},
|
||||
{file = "s3transfer-0.13.0.tar.gz", hash = "sha256:f5e6db74eb7776a37208001113ea7aa97695368242b364d73e91c981ac522177"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
botocore = ">=1.37.4,<2.0a.0"
|
||||
|
||||
[package.extras]
|
||||
crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
version = "1.17.0"
|
||||
|
|
@ -1969,4 +2039,4 @@ files = [
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.12"
|
||||
content-hash = "1d5c1e0bfc12e88ca9b4c46141c848064a45e9cc4b60990fa3ec7ecb5ef71209"
|
||||
content-hash = "7f71e898554f0285d708a349589cc96fe8cfc210e7d39caf66c1dc6faae010da"
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ dependencies = [
|
|||
"beautifulsoup4 (>=4.13.4,<5.0.0)",
|
||||
"tqdm (>=4.67.1,<5.0.0)",
|
||||
"hubspot-api-client (>=12.0.0,<13.0.0)",
|
||||
"boto3 (>=1.39.6,<2.0.0)",
|
||||
]
|
||||
|
||||
[tool.poetry]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue