added code for s3 url

This commit is contained in:
Jun-te Kim 2025-11-07 17:12:38 +00:00
parent 8daddf6cb7
commit 2fe2e5053f
5 changed files with 89 additions and 3 deletions

View file

@ -0,0 +1,33 @@
"""s3 add
Revision ID: c8af22cece92
Revises: ed6aaa298de4
Create Date: 2025-11-07 15:00:32.917157
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
import sqlmodel
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration it revises (see "Revises" in the module docstring).
revision: str = 'c8af22cece92'
down_revision: Union[str, None] = 'ed6aaa298de4'
# No branch labels or dependencies on other revisions are declared here.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Apply this revision: add the S3 evidence URL column to ``hubspot_deal_data``."""
    # ### commands auto generated by Alembic - please adjust! ###
    # The new column is a nullable string column.
    evidence_url_column = sa.Column(
        'major_condition_issue_evidence_s3_url',
        sqlmodel.sql.sqltypes.AutoString(),
        nullable=True,
    )
    op.add_column('hubspot_deal_data', evidence_url_column)
    # ### end Alembic commands ###
def downgrade() -> None:
    """Revert this revision: drop the S3 evidence URL column from ``hubspot_deal_data``."""
    # ### commands auto generated by Alembic - please adjust! ###
    target_table = 'hubspot_deal_data'
    removed_column = 'major_condition_issue_evidence_s3_url'
    op.drop_column(target_table, removed_column)
    # ### end Alembic commands ###

View file

@ -6,7 +6,7 @@ terraform {
}
}
backend "s3" {
bucket = "survey-extractor-tf-state"
= "survey-extractor-tf-state"
region = "eu-west-2"
key = "env:/dev/terraform.tfstate"
}

View file

@ -3,6 +3,8 @@ from enum import Enum
from etl.utils.logger import Logger
import logging
from hubspot.crm.associations import ApiException
import os
import requests
class Companies(Enum):
ABRI = "237615001799"
@ -208,4 +210,54 @@ class HubSpotClient():
except Exception as e:
self.logger.error(f"Error retrieving deal stages: {e}")
return []
return []
def download_file_from_url(self, download_url: str, save_path: str = None, timeout: float = 30.0) -> str:
    """
    Download a file from a HubSpot file URL (public or private), keeping its original file type.

    The response body is streamed to disk in 8 KiB chunks, so large files are
    not held in memory. The connection is always released, even on failure.

    Args:
        download_url: URL of the file to fetch. URLs that do not contain
            "hubspotusercontent" are requested with this client's bearer
            token (``self.access_token``).
        save_path: Destination. ``None`` saves under the inferred filename in
            the current working directory; an existing directory saves under
            the inferred filename inside it; any other value is used as the
            destination file path.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the caller forever.

    Returns:
        The path the file was written to.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-success HTTP status (logged, then re-raised).
        OSError: If the destination file cannot be opened or written.
    """
    # `requests` and `os` are imported at module level; only the modules
    # used solely by this method are imported locally.
    import mimetypes
    from email.message import Message

    try:
        headers = {}
        # NOTE(review): the bearer token is attached to every URL lacking the
        # "hubspotusercontent" marker, including non-HubSpot hosts - confirm
        # callers only ever pass HubSpot URLs.
        if "hubspotusercontent" not in download_url:
            headers["Authorization"] = f"Bearer {self.access_token}"

        self.logger.info(f"Downloading HubSpot file: {download_url}")

        # The context manager guarantees the streamed connection is released.
        with requests.get(
            download_url,
            headers=headers,
            stream=True,
            allow_redirects=True,
            timeout=timeout,
        ) as response:
            response.raise_for_status()

            # Try to infer filename from Content-Disposition header. The
            # stdlib header parser copes with quoting, trailing parameters
            # and RFC 2231 ``filename*=`` encodings.
            filename = ""
            content_disposition = response.headers.get("content-disposition")
            if content_disposition:
                parsed_header = Message()
                parsed_header["content-disposition"] = content_disposition
                filename = parsed_header.get_filename() or ""

            # The header is server-controlled: drop any directory components
            # so the file cannot be written outside the intended folder.
            filename = os.path.basename(filename.replace("\\", "/"))
            if filename in (".", ".."):
                filename = ""

            if not filename:
                # fallback: extract from URL or content-type
                filename = os.path.basename(download_url.split("?")[0]) or "hubspot_download"
                if "." not in filename:
                    content_type = response.headers.get("content-type")
                    ext = (
                        mimetypes.guess_extension(content_type.split(";")[0].strip())
                        if content_type
                        else None
                    )
                    if ext:
                        filename += ext

            # Make sure save_path is valid
            if save_path is None:
                save_path = os.path.abspath(filename)
            elif os.path.isdir(save_path):
                save_path = os.path.join(save_path, filename)
            else:
                # if user passes a file path directly, leave it
                save_path = os.path.abspath(save_path)

            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        self.logger.info(f"File downloaded successfully → {save_path}")
        return save_path

    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download file from HubSpot: {e}")
        raise

View file

@ -83,6 +83,7 @@ class HubspotDealData(SQLModel, table=True):
major_condition_issue_description: Optional[str] = Field(default=None)
major_condition_issue_photos: Optional[str] = Field(default=None)
major_condition_issue_evidence_s3_url: Optional[str] = Field(default=None)
created_at: datetime = Field(
sa_column=Column(

View file

@ -1,4 +1,4 @@
#poetry run alembic revision --autogenerate -m "added major condition issue things"
# Left commented out: uncomment (and adjust the -m message) to autogenerate a new revision.
#poetry run alembic revision --autogenerate -m "s3 add "
# Upgrade the database to the latest ("head") revision.
poetry run alembic upgrade head