def upgrade() -> None:
    """Upgrade schema.

    Adds the ``major_condition_issue_evidence_s3_url`` column to the
    ``hubspot_deal_data`` table. The column is declared nullable, so it can
    be added without supplying a value for rows that already exist.
    """
    # Originally auto-generated by Alembic; the column is built first so the
    # add_column call stays short and readable.
    evidence_url_column = sa.Column(
        'major_condition_issue_evidence_s3_url',
        sqlmodel.sql.sqltypes.AutoString(),
        nullable=True,
    )
    op.add_column('hubspot_deal_data', evidence_url_column)
def download_file_from_url(self, download_url: str, save_path: str = None, timeout: float = 30.0) -> str:
    """
    Download a file from a HubSpot file URL (public or private), keeping its original file type.

    Args:
        download_url: URL of the file to fetch.
        save_path: Destination. ``None`` stores the file in the current
            working directory under the inferred filename; an existing
            directory stores it inside that directory; any other value is
            used as the full destination file path.
        timeout: Seconds to wait for the connection and for each read from
            the socket before ``requests`` gives up. Without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        The path the file was written to.

    Raises:
        requests.exceptions.RequestException: On any HTTP/network failure
            (including a non-2xx status); the error is logged and re-raised.
        OSError: If the destination cannot be opened or written; not logged
            here, it propagates unchanged.
    """
    # Function-scope imports keep the method self-contained.
    import mimetypes
    import os
    from email.message import Message
    from urllib.parse import urlsplit

    import requests

    try:
        headers = {}
        # NOTE(review): every URL that lacks the "hubspotusercontent" marker
        # receives the bearer token, whatever its host. Confirm callers only
        # pass trusted HubSpot URLs, or restrict this to known HubSpot hosts.
        if "hubspotusercontent" not in download_url:
            headers["Authorization"] = f"Bearer {self.access_token}"

        self.logger.info(f"Downloading HubSpot file: {download_url}")

        # The context manager releases the streamed connection even when
        # writing the file fails part-way through.
        with requests.get(
            download_url,
            headers=headers,
            stream=True,
            allow_redirects=True,
            timeout=timeout,
        ) as response:
            response.raise_for_status()

            # Try to infer filename from Content-Disposition header. The
            # stdlib header parser copes with quoting, extra parameters and
            # RFC 2231 "filename*=" values.
            filename = ""
            content_disposition = response.headers.get("content-disposition")
            if content_disposition:
                parsed_header = Message()
                parsed_header["content-disposition"] = content_disposition
                advertised_name = parsed_header.get_filename() or ""
                # Keep only the last path component so a server-supplied
                # name such as "../../x" cannot escape the target directory.
                filename = os.path.basename(advertised_name.replace("\\", "/"))

            if filename in ("", ".", ".."):
                # fallback: extract from URL or content-type
                filename = os.path.basename(urlsplit(download_url).path)
                if filename in ("", ".", ".."):
                    filename = "hubspot_download"
                if "." not in filename:
                    content_type = response.headers.get("content-type")
                    ext = (
                        mimetypes.guess_extension(content_type.split(";")[0].strip())
                        if content_type
                        else None
                    )
                    if ext:
                        filename += ext

            # Make sure save_path is valid
            if save_path is None:
                save_path = os.path.abspath(filename)
            elif os.path.isdir(save_path):
                save_path = os.path.join(save_path, filename)
            else:
                # if user passes a file path directly, leave it
                save_path = os.path.abspath(save_path)

            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        f.write(chunk)

        self.logger.info(f"File downloaded successfully → {save_path}")
        return save_path

    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download file from HubSpot: {e}")
        raise
created_at: datetime = Field( sa_column=Column( diff --git a/migration_db.sh b/migration_db.sh index 1443b18..c0046bc 100644 --- a/migration_db.sh +++ b/migration_db.sh @@ -1,4 +1,4 @@ -#poetry run alembic revision --autogenerate -m "added major condition issue things" +#poetry run alembic revision --autogenerate -m "s3 add " poetry run alembic upgrade head