From e9db66b6b423f455697193af36922a3ace130da9 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 12 Mar 2026 11:58:46 +0000 Subject: [PATCH 01/47] added hubspot dependency to backend --- .devcontainer/backend/Dockerfile | 3 ++- etl/hubspot/hubspotClient.py | 5 +++++ etl/hubspot/requirements.txt | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 etl/hubspot/hubspotClient.py create mode 100644 etl/hubspot/requirements.txt diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index 662f53b0..6a1cc120 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -35,7 +35,8 @@ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 ADD backend/engine/requirements.txt requirements1.txt ADD backend/app/requirements/requirements.txt requirements2.txt ADD .devcontainer/backend/requirements.txt requirements3.txt -RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt +ADD etl/hubspot/requirements.txt requirements4.txt +RUN cat requirements1.txt requirements2.txt requirements3.txt requirements4.txt > requirements.txt RUN pip install -r requirements.txt # 5) Workdir diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py new file mode 100644 index 00000000..39cea6a1 --- /dev/null +++ b/etl/hubspot/hubspotClient.py @@ -0,0 +1,5 @@ +import hubspot + +class HubspotClient(): + + def \ No newline at end of file diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt new file mode 100644 index 00000000..105cba07 --- /dev/null +++ b/etl/hubspot/requirements.txt @@ -0,0 +1 @@ +hubspot \ No newline at end of file From 76dbde602b1ff2d5cb29d4a946411283b951b7e2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 13:27:52 +0000 Subject: [PATCH 02/47] added tests and hubspot client --- .devcontainer/backend/requirements.txt | 2 +- backend/app/config.py | 2 + conftest.py | 1 + etl/hubspot/hubspotClient.py | 442 +++++++++++++++++- etl/hubspot/requirements.txt | 2 +- etl/hubspot/tests/__init__.py | 0 .../tests/test_hubspot_client_integration.py | 117 +++++ pyrightconfig.json | 2 +- pytest.ini | 2 +- 9 files changed, 563 insertions(+), 7 deletions(-) create mode 100644 etl/hubspot/tests/__init__.py create mode 100644 etl/hubspot/tests/test_hubspot_client_integration.py diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index 5cd40ced..f6e1f665 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -23,4 +23,4 @@ psycopg[binary] pytest-postgresql # Formatting black==26.1.0 -boto3-stubs \ No newline at end of file +boto3-stubs diff --git a/backend/app/config.py b/backend/app/config.py index 6604fec9..46301e30 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -65,6 +65,8 @@ class Settings(BaseSettings): ORDNANCE_SURVEY_API_KEY: str = "changeme" + HUBSPOT_API_KEY: Optional[str] = None + # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None AWS_SECRET_KEY_ID: Optional[str] = None diff --git a/conftest.py b/conftest.py index d93f0023..2ea20ebb 100644 --- a/conftest.py +++ b/conftest.py @@ -30,6 +30,7 @@ DEFAULT_ENV = { "HEATING_KWH_PREDICTIONS_BUCKET": "test", "HOTWATER_KWH_PREDICTIONS_BUCKET": "test", "ENERGY_ASSESSMENTS_BUCKET": "test", + "HUBSPOT_API_KEY": "changeme", } # runs immediately when pytest starts, BEFORE collection diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 39cea6a1..9c1cd31e 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -1,5 +1,441 @@ -import hubspot +import os +from enum import Enum +from typing import Optional, cast -class HubspotClient(): +from hubspot.client import Client # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations import ApiException # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects import SimplePublicObjectInput # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.models import ( # type: ignore[reportMissingTypeStubs] + CollectionResponsePipelineNoPaging as PipelinesResponse, +) +from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects.models import SimplePublicObject as HubspotObject # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4 import AssociationSpec # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTypeStubs] + CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse, + MultiAssociatedObjectWithLabel as AssociationsResult, + ForwardPaging as AssociationsPaging, + NextPage as AssociationsPagingNext, +) - def \ No newline at end of file + +from backend.app.config import get_settings +from utils.logger import setup_logger + +import mimetypes +import requests + + +class Companies(Enum): + ABRI = "237615001799" + SOUTHERN_HOUSING_GROUP = "109343619305" + LIVEWEST = "86205872354" + SURESERVE = "301745289413" + HOMEGROUP = "94946071794" + APPLE = "184769046716" + THE_GUINESS_PARTNERSHIP = "86970043613" + + +class DealStage(Enum): + SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914" + SURVEYED_NO_ACCESS_NEED_SIGN_OFF = "1617223915" + CUSTOMER_CONTACTED = "888730834" + SURVEYED_COMPLETED_SIGNED_OFF = "1617223916" + FILES_MISSING_FROM_ASSESSOR = "1887736000" + + +class Pipeline(Enum): + OPERATIONS_SOCIAL_HOUSING = "1167582403" + + +# TODO get guiness working from here + + +class HubspotClient: + + def __init__(self): + """ + Hey Tech Team, Hubspot Library doesn't do type hitting. + We have type hinted stuff but pylance never becomes happy. + However, because I added the type hinting to the best of ability + and you'll still get sensible ide suggestions. + """ + settings = get_settings() + access_token = settings.HUBSPOT_API_KEY + if access_token is None: + raise RuntimeError("Missing HUBSPOT_API_KEY in env") + self.access_token: str = access_token + self.logger = setup_logger() + self.client: Client = Client.create(access_token=self.access_token) # type: ignore[reportUnknownMemberType] + # [Developer Only] + # Add a dot in front of client and see the wonders of ide suggestions + # This wouldn't work if we didn't add ': Client' to self.client. + # Sorry - not sorry but enjoy, Past Junte 13/03/2026 + # self.client + + def get_deal_ids_from_company(self, company_id: str) -> list[str]: + associations_api: AssociationsBasicApi = ( # type: ignore[reportUnknownMemberType] + self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] + ) + + deal_ids: list[str] = [] + after: Optional[str] = None + + while True: + response: AssociationsPageResponse = associations_api.get_page( # type: ignore[reportUnknownMemberType] + object_type="companies", + object_id=company_id, + to_object_type="deals", + limit=100, + after=after, + ) + + results: list[AssociationsResult] = cast(list[AssociationsResult], response.results) # type: ignore[reportUnknownMemberType] + for assoc in results: + assoc: AssociationsResult + object_id: str = cast(str, assoc.to_object_id) # type: ignore[reportUnknownMemberType, reportUnknownVariableType] + deal_ids.append(object_id) + + paging: Optional[AssociationsPaging] = cast(Optional[AssociationsPaging], response.paging) # type: ignore[reportUnknownMemberType] + if not paging: + break + + paging_next: Optional[AssociationsPagingNext] = cast(Optional[AssociationsPagingNext], paging.next) # type: ignore[reportUnknownMemberType, reportUnknownVariableType] + if not paging_next: + break + + after = cast(str, paging_next.after) # type: ignore[reportUnknownMemberType, reportUnknownVariableType] + + return deal_ids + + def from_deal_id_get_associated_company_id(self, deal_id: str) -> Optional[str]: + """ + Get the associated company ID from a given deal ID. + Returns the associated company ID, or None if not found. + """ + try: + associations_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] + + # Fetch associations for this specific deal only + response: AssociationsPageResponse = associations_api.get_page( # type: ignore[reportUnknownMemberType] + object_type="deals", + object_id=deal_id, + to_object_type="companies", + limit=1, # Expect only one associated company + ) + + results: list[AssociationsResult] = cast(list[AssociationsResult], response.results) # type: ignore[reportUnknownMemberType] + if not results: + self.logger.info(f"No company association found for deal {deal_id}") + return None + + first: AssociationsResult = results[0] + company_id: str = cast(str, first.to_object_id) # type: ignore[reportUnknownMemberType, reportUnknownVariableType] + self.logger.info(f"Associated company ID for deal {deal_id}: {company_id}") + return company_id + + except ApiException as e: + self.logger.error( + f"Error fetching associated company for deal {deal_id}: {e}" + ) + return None + + def from_deal_id_get_associated_listing( + self, deal_id: str + ) -> Optional[dict[str, str]]: + """ + Get the associated listing information for a given deal. + Returns a dictionary of listing properties, or None if not found. + """ + associations_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] + listings_api: ObjectsBasicApi = self.client.crm.objects.basic_api # type: ignore[reportUnknownMemberType] # works for custom objects like "listing" + + # Fetch associated listing(s) + response: AssociationsPageResponse = associations_api.get_page( # type: ignore[reportUnknownMemberType] + object_type="deals", + object_id=deal_id, + to_object_type="0-420", # <-- use your exact custom object name slug here + limit=1, + ) + + results: list[AssociationsResult] = cast(list[AssociationsResult], response.results) # type: ignore[reportUnknownMemberType] + if not results: + self.logger.info(f"No listing association found for deal {deal_id}") + return None + + first: AssociationsResult = results[0] + listing_id: str = cast(str, first.to_object_id) # type: ignore[reportUnknownMemberType, reportUnknownVariableType] + self.logger.info(f"Associated listing ID for deal {deal_id}: {listing_id}") + + # Fetch listing details (the "listing information") + listing: HubspotObject = listings_api.get_by_id( # type: ignore[reportUnknownMemberType] + object_type="0-420", # again, must match your HubSpot object name + object_id=listing_id, + properties=[ + "national_uprn", + "domna_property_id", + "owner_property_id", + ], + ) + + listing_info: dict[str, str] = cast(dict[str, str], listing.properties) # type: ignore[reportUnknownMemberType] + self.logger.info(f"Listing info for deal {deal_id}: {listing_info}") + return listing_info + + def from_deal_id_get_info(self, deal_id: str) -> dict[str, str]: + deals_api: DealsBasicApi = self.client.crm.deals.basic_api # type: ignore[reportUnknownMemberType] + + deal: HubspotObject = deals_api.get_by_id( # type: ignore[reportUnknownMemberType] + deal_id, + properties=[ + "dealname", + "dealstage", + "pipeline", + "outcome", # outcome, + "outcome_notes", # outcome notes + "project_code", + "major_condition_issue_description", + "major_condition_issue_photos", + "coordination_status__stage_1_", # Coordiantion Status (Stage 1), + "retrofit_design_status", # Retrofit Design Status + ], + ) + + deal_info: dict[str, str] = cast(dict[str, str], deal.properties) # type: ignore[reportUnknownMemberType] + return deal_info + + def get_deal_info_for_db( + self, deal_id: str + ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]: + deal: dict[str, str] = self.from_deal_id_get_info(deal_id) + company: Optional[str] = self.from_deal_id_get_associated_company_id(deal_id) + listing: Optional[dict[str, str]] = self.from_deal_id_get_associated_listing( + deal_id + ) + + return deal, company, listing + + def get_company_information(self, company_id: str) -> dict[str, str]: + companies_api: CompaniesBasicApi = self.client.crm.companies.basic_api # type: ignore[reportUnknownMemberType] + + company: HubspotObject = companies_api.get_by_id( # type: ignore[reportUnknownMemberType] + company_id, + properties=[ + "name", + ], + ) + + company_info: dict[str, str] = cast(dict[str, str], company.properties) # type: ignore[reportUnknownMemberType] + return company_info + + def get_all_pipelines(self) -> list[dict[str, str]]: + """ + Retrieve all pipelines for deals, returning a list of dicts with pipeline names and IDs. + """ + try: + pipelines_api: PipelinesApi = self.client.crm.pipelines.pipelines_api # type: ignore[reportUnknownMemberType] + response: PipelinesResponse = pipelines_api.get_all(object_type="deals") # type: ignore[reportUnknownMemberType] + + results: list[HubspotPipeline] = cast(list[HubspotPipeline], response.results) # type: ignore[reportUnknownMemberType] + pipelines: list[dict[str, str]] = [] + for pipeline in results: + pipeline: HubspotPipeline + pipelines.append( + { + "name": cast(str, pipeline.label), # type: ignore[reportUnknownMemberType] + "id": cast(str, pipeline.id), # type: ignore[reportUnknownMemberType] + } + ) + + self.logger.info(f"Retrieved {len(pipelines)} pipelines.") + return pipelines + + except Exception as e: + self.logger.error(f"Error retrieving pipelines: {e}") + return [] + + def get_deal_stages_from_pipeline_id( + self, pipeline_id: Optional[str] = None + ) -> list[dict[str, str]]: + """ + Retrieve all deal stages for a given pipeline. + If no pipeline_id is provided, retrieves all stages for all pipelines. + Returns a list of dicts with pipeline name, stage name, and stage ID. + """ + try: + pipelines_api: PipelinesApi = self.client.crm.pipelines.pipelines_api # type: ignore[reportUnknownMemberType] + response: PipelinesResponse = pipelines_api.get_all(object_type="deals") # type: ignore[reportUnknownMemberType] + + all_stages: list[dict[str, str]] = [] + + for pipeline in cast(list[HubspotPipeline], response.results): # type: ignore[reportUnknownMemberType] + pipeline: HubspotPipeline + # Skip other pipelines if a specific one is requested + pipeline_id_str: str = cast(str, pipeline.id) # type: ignore[reportUnknownMemberType] + if pipeline_id and pipeline_id_str != str(pipeline_id): + continue + + for stage in cast(list[HubspotPipelineStage], pipeline.stages): # type: ignore[reportUnknownMemberType] + stage: HubspotPipelineStage + all_stages.append( + { + "pipeline_name": cast(str, pipeline.label), # type: ignore[reportUnknownMemberType] + "pipeline_id": pipeline_id_str, + "stage_name": cast(str, stage.label), # type: ignore[reportUnknownMemberType] + "stage_id": cast(str, stage.id), # type: ignore[reportUnknownMemberType] + } + ) + + if not all_stages: + self.logger.info( + f"No deal stages found for pipeline {pipeline_id if pipeline_id else 'ALL'}" + ) + else: + self.logger.info(f"Retrieved {len(all_stages)} deal stages.") + + return all_stages + + except Exception as e: + self.logger.error(f"Error retrieving deal stages: {e}") + return [] + + def download_file_from_url( + self, download_url: str, save_path: Optional[str] = None + ) -> str: + """ + Download a file from a HubSpot file URL (public or private), keeping its original file type. + """ + + try: + headers: dict[str, str] = {} + if "hubspotusercontent" not in download_url: + headers["Authorization"] = f"Bearer {self.access_token}" + + self.logger.info(f"Downloading HubSpot file: {download_url}") + response = requests.get( + download_url, headers=headers, stream=True, allow_redirects=True + ) + response.raise_for_status() + + # Try to infer filename from Content-Disposition header + content_disposition = response.headers.get("content-disposition") + if content_disposition and "filename=" in content_disposition: + filename = content_disposition.split("filename=")[1].strip('"') + else: + # fallback: extract from URL or content-type + filename = ( + os.path.basename(download_url.split("?")[0]) or "hubspot_download" + ) + if "." not in filename: + content_type = response.headers.get("content-type") + ext = ( + mimetypes.guess_extension(content_type.split(";")[0]) + if content_type + else None + ) + if ext: + filename += ext + + # Make sure save_path is valid + if save_path is None: + save_path = os.path.abspath(filename) + elif os.path.isdir(save_path): + save_path = os.path.join(save_path, filename) + else: + # if user passes a file path directly, leave it + save_path = os.path.abspath(save_path) + + with open(save_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + self.logger.info(f"File downloaded successfully → {save_path}") + return save_path + + except requests.exceptions.RequestException as e: + self.logger.error(f"Failed to download file from HubSpot: {e}") + raise + + def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str: + products_api: ProductsBasicApi = self.client.crm.products.basic_api # type: ignore[reportUnknownMemberType] + + # Fetch product mapping + product: HubspotObject = products_api.get_by_id( # type: ignore[reportUnknownMemberType] + product_id, properties=["name", "price", "hs_price"] + ) + product_properties: dict[str, str] = cast(dict[str, str], product.properties) # type: ignore[reportUnknownMemberType] + + name: Optional[str] = product_properties.get("name") + price: str = product_properties.get("price") or product_properties.get("hs_price") or "0" + + # Build line item payload + line_item_input = SimplePublicObjectInput( + properties={ + "hs_product_id": product_id, + "name": name, + "quantity": str(quantity), + "price": price, + "amount": str(float(price) * quantity), + "invoiced": "Outstanding", + } + ) + + line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] + + # Create line item + line_item: HubspotObject = line_items_api.create(line_item_input) # type: ignore[reportUnknownMemberType] + return cast(str, line_item.id) # type: ignore[reportUnknownMemberType] + + def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None: + self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}") + + association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] + + association_api.create( # type: ignore[reportUnknownMemberType] + "0-3", # to object type + deal_id, # to object id + "line_items", # from object type + line_item_id, # from object id + [ + AssociationSpec( + association_category="HUBSPOT_DEFINED", + association_type_id=19, # line_item → deal + ) + ], + ) + + def add_product_line_item_to_deal( + self, deal_id: str, product_id: str, quantity: int = 1 + ) -> str: + # Step 1: Create the line item from product mapping + line_item_id: str = self.create_line_item_from_product(product_id, quantity) + + # Step 2: Associate the created line item to the deal + self.associate_line_item_to_deal(line_item_id, deal_id) + + return line_item_id + + def delete_line_item(self, line_item_id: str) -> bool: + """ + Delete (archive) a line item in HubSpot by its ID. + """ + try: + self.logger.info(f"Deleting line item {line_item_id}...") + + line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] + line_items_api.archive(line_item_id) # type: ignore[reportUnknownMemberType] + + self.logger.info(f"Line item {line_item_id} deleted successfully.") + return True + + except ApiException as e: + self.logger.error(f"Failed to delete line item {line_item_id}: {e}") + return False diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt index 105cba07..ef8e3ebc 100644 --- a/etl/hubspot/requirements.txt +++ b/etl/hubspot/requirements.txt @@ -1 +1 @@ -hubspot \ No newline at end of file +hubspot-api-client \ No newline at end of file diff --git a/etl/hubspot/tests/__init__.py b/etl/hubspot/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py new file mode 100644 index 00000000..d7cf46fd --- /dev/null +++ b/etl/hubspot/tests/test_hubspot_client_integration.py @@ -0,0 +1,117 @@ +import os +from typing import Optional + +import pytest +from etl.hubspot.hubspotClient import HubspotClient, Companies, Pipeline, DealStage + + +class TestHubspotClientIntegration: + """Integration tests using real HubSpot API calls.""" + + @pytest.fixture + def client(self): + """Initialize HubSpot client with env variables.""" + return HubspotClient() + + def test_client_initialization(self, client: HubspotClient): + """Test that client initializes successfully with API key.""" + assert client.access_token is not None + assert client.client is not None + assert client.logger is not None + + def test_get_deal_ids_from_company(self, client: HubspotClient): + """Test getting deal IDs from Apple company includes expected deal.""" + company_id: str = Companies.APPLE.value + + deal_ids: list[str] = client.get_deal_ids_from_company(company_id) + + # https://app-eu1.hubspot.com/contacts/145275138/record/0-3/263490768079 + assert "263490768079" in deal_ids + + def test_get_company_id_from_deal_id(self, client: HubspotClient): + deal_id: str = "263490768079" + + company_id: Optional[str] = client.from_deal_id_get_associated_company_id( + deal_id + ) + # https://app-eu1.hubspot.com/contacts/145275138/record/0-3/263490768079 + assert company_id == Companies.APPLE.value + + def test_from_deal_id_get_associated_listing(self, client: HubspotClient): + deal_id: str = "263490768079" + + listing_info: Optional[dict[str, str]] = ( + client.from_deal_id_get_associated_listing(deal_id) + ) + + assert listing_info is not None + assert "hs_object_id" in listing_info + assert "national_uprn" in listing_info + assert "owner_property_id" in listing_info + assert "domna_property_id" in listing_info + + def test_from_deal_id_get_info(self, client: HubspotClient): + deal_id: str = "263490768079" + + deal_info: dict[str, str] = client.from_deal_id_get_info(deal_id) + + assert "dealname" in deal_info + assert "dealstage" in deal_info + assert "pipeline" in deal_info + assert "outcome" in deal_info # outcome + assert "outcome_notes" in deal_info # outcome notes + assert "project_code" in deal_info + assert "major_condition_issue_description" in deal_info + assert "major_condition_issue_photos" in deal_info + assert ( + "coordination_status__stage_1_" in deal_info + ) # Coordiantion Status (Stage 1) + assert "retrofit_design_status" in deal_info # Retrofit Design Status + + def test_get_deal_info_for_db(self, client: HubspotClient): + deal_id: str = "263490768079" + + deal, company, listing = client.get_deal_info_for_db(deal_id) + + assert "dealname" in deal + assert "dealstage" in deal + assert "pipeline" in deal + + assert company == Companies.APPLE.value + + assert listing is None or "hs_object_id" in listing + + def test_get_company_information(self, client: HubspotClient): + company_id: str = Companies.APPLE.value + + company_info: dict[str, str] = client.get_company_information(company_id) + + assert "name" in company_info + assert company_info["name"].lower() == "Apple".lower() + + def test_get_all_pipelines(self, client: HubspotClient): + pipelines: list[dict[str, str]] = client.get_all_pipelines() + + assert len(pipelines) > 0 + pipeline_ids: list[str] = [p["id"] for p in pipelines] + assert Pipeline.OPERATIONS_SOCIAL_HOUSING.value in pipeline_ids + + def test_get_deal_stages_from_pipeline_id(self, client: HubspotClient): + stages: list[dict[str, str]] = client.get_deal_stages_from_pipeline_id( + Pipeline.OPERATIONS_SOCIAL_HOUSING.value + ) + + assert len(stages) > 0 + stage_ids: list[str] = [s["stage_id"] for s in stages] + assert DealStage.SURVEYED_COMPLETE_NEEDS_SIGN_OFF.value in stage_ids + + def test_download_file_from_url( + self, client: HubspotClient, tmp_path: Optional[str] + ): + deal_info: dict[str, str] = client.from_deal_id_get_info("254427203793") + download_url: str = deal_info["major_condition_issue_photos"] + + save_path: str = client.download_file_from_url(download_url, str(tmp_path)) + + assert os.path.exists(save_path) + assert os.path.getsize(save_path) > 0 diff --git a/pyrightconfig.json b/pyrightconfig.json index d4e0e2a4..18f578a5 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -2,7 +2,7 @@ "typeCheckingMode": "strict", "venvPath": "/Users/khalimconn-kowlessar/opt/anaconda3/envs/", "venv": "Fastapi-backend", - "include": [ +"include": [ "." ] } \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index 608d5e0c..c9dd8ca8 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,4 +3,4 @@ pythonpath = . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests From 3970d70518f1432bc68f2af2532eec63308e5ff4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 14:36:53 +0000 Subject: [PATCH 03/47] its now perfect --- etl/hubspot/hubspotClient.py | 64 ++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 9c1cd31e..b41d71f8 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -2,24 +2,22 @@ import os from enum import Enum from typing import Optional, cast -from hubspot.client import Client # type: ignore[reportMissingTypeStubs] -from hubspot.crm.associations import ApiException # type: ignore[reportMissingTypeStubs] -from hubspot.crm.objects import SimplePublicObjectInput # type: ignore[reportMissingTypeStubs] -from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.pipelines.models import ( # type: ignore[reportMissingTypeStubs] +from hubspot.client import Client +from hubspot.crm.associations import ApiException +from hubspot.crm.objects import SimplePublicObjectInput +from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi +from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi +from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi +from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi +from hubspot.crm.pipelines.models import ( CollectionResponsePipelineNoPaging as PipelinesResponse, ) -from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline # type: ignore[reportMissingTypeStubs] -from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage # type: ignore[reportMissingTypeStubs] -from hubspot.crm.objects.models import SimplePublicObject as HubspotObject # type: ignore[reportMissingTypeStubs] -from hubspot.crm.associations.v4 import AssociationSpec # type: ignore[reportMissingTypeStubs] -from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi # type: ignore[reportMissingTypeStubs] -from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline +from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage +from hubspot.crm.objects.models import SimplePublicObject as HubspotObject +from hubspot.crm.associations.v4 import AssociationSpec +from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi +from hubspot.crm.associations.v4.models import ( CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse, MultiAssociatedObjectWithLabel as AssociationsResult, ForwardPaging as AssociationsPaging, @@ -364,17 +362,16 @@ class HubspotClient: self.logger.error(f"Failed to download file from HubSpot: {e}") raise - def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str: - products_api: ProductsBasicApi = self.client.crm.products.basic_api # type: ignore[reportUnknownMemberType] - + def create_line_item_from_product(self, product_id: str, quantity: int = 1): # Fetch product mapping - product: HubspotObject = products_api.get_by_id( # type: ignore[reportUnknownMemberType] + product = self.client.crm.products.basic_api.get_by_id( product_id, properties=["name", "price", "hs_price"] ) - product_properties: dict[str, str] = cast(dict[str, str], product.properties) # type: ignore[reportUnknownMemberType] - name: Optional[str] = product_properties.get("name") - price: str = product_properties.get("price") or product_properties.get("hs_price") or "0" + name = product.properties.get("name") + price = ( + product.properties.get("price") or product.properties.get("hs_price") or "0" + ) # Build line item payload line_item_input = SimplePublicObjectInput( @@ -388,18 +385,16 @@ class HubspotClient: } ) - line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] - # Create line item - line_item: HubspotObject = line_items_api.create(line_item_input) # type: ignore[reportUnknownMemberType] - return cast(str, line_item.id) # type: ignore[reportUnknownMemberType] + line_item = self.client.crm.line_items.basic_api.create(line_item_input) + return line_item.id - def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None: + def associate_line_item_to_deal(self, line_item_id: str, deal_id: str): self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}") - association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] + association_api = self.client.crm.associations.v4.basic_api - association_api.create( # type: ignore[reportUnknownMemberType] + association_api.create( "0-3", # to object type deal_id, # to object id "line_items", # from object type @@ -414,24 +409,23 @@ class HubspotClient: def add_product_line_item_to_deal( self, deal_id: str, product_id: str, quantity: int = 1 - ) -> str: + ): # Step 1: Create the line item from product mapping - line_item_id: str = self.create_line_item_from_product(product_id, quantity) + line_item_id = self.create_line_item_from_product(product_id, quantity) # Step 2: Associate the created line item to the deal self.associate_line_item_to_deal(line_item_id, deal_id) return line_item_id - def delete_line_item(self, line_item_id: str) -> bool: + def delete_line_item(self, line_item_id: str): """ Delete (archive) a line item in HubSpot by its ID. """ try: self.logger.info(f"Deleting line item {line_item_id}...") - line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] - line_items_api.archive(line_item_id) # type: ignore[reportUnknownMemberType] + self.client.crm.line_items.basic_api.archive(line_item_id) self.logger.info(f"Line item {line_item_id} deleted successfully.") return True From cca72928d91ebb03d4b5fc5aa92715f264221c5b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 14:38:10 +0000 Subject: [PATCH 04/47] its now perfect --- etl/hubspot/hubspotClient.py | 60 +++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index b41d71f8..1946bcdf 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -2,22 +2,22 @@ import os from enum import Enum from typing import Optional, cast -from hubspot.client import Client -from hubspot.crm.associations import ApiException -from hubspot.crm.objects import SimplePublicObjectInput -from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi -from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi -from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi -from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi -from hubspot.crm.pipelines.models import ( +from hubspot.client import Client # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations import ApiException # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects import SimplePublicObjectInput # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.models import ( # type: ignore[reportMissingTypeStubs] CollectionResponsePipelineNoPaging as PipelinesResponse, ) -from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline -from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage -from hubspot.crm.objects.models import SimplePublicObject as HubspotObject -from hubspot.crm.associations.v4 import AssociationSpec -from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi -from hubspot.crm.associations.v4.models import ( +from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline # type: ignore[reportMissingTypeStubs] +from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage # type: ignore[reportMissingTypeStubs] +from hubspot.crm.objects.models import SimplePublicObject as HubspotObject # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4 import AssociationSpec # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTypeStubs] CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse, MultiAssociatedObjectWithLabel as AssociationsResult, ForwardPaging as AssociationsPaging, @@ -362,15 +362,17 @@ class HubspotClient: self.logger.error(f"Failed to download file from HubSpot: {e}") raise - def create_line_item_from_product(self, product_id: str, quantity: int = 1): + def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str: # Fetch product mapping - product = self.client.crm.products.basic_api.get_by_id( + products_api: ProductsBasicApi = self.client.crm.products.basic_api # type: ignore[reportUnknownMemberType] + product: HubspotObject = products_api.get_by_id( # type: ignore[reportUnknownMemberType] product_id, properties=["name", "price", "hs_price"] ) + properties: dict[str, str] = cast(dict[str, str], product.properties) # type: ignore[reportUnknownMemberType] - name = product.properties.get("name") - price = ( - product.properties.get("price") or product.properties.get("hs_price") or "0" + name: str = properties.get("name") or "" + price: str = ( + properties.get("price") or properties.get("hs_price") or "0" ) # Build line item payload @@ -386,15 +388,16 @@ class HubspotClient: ) # Create line item - line_item = self.client.crm.line_items.basic_api.create(line_item_input) - return line_item.id + line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] + line_item: HubspotObject = line_items_api.create(line_item_input) # type: ignore[reportUnknownMemberType] + return cast(str, line_item.id) # type: ignore[reportUnknownMemberType] - def associate_line_item_to_deal(self, line_item_id: str, deal_id: str): + def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None: self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}") - association_api = self.client.crm.associations.v4.basic_api + association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api # type: ignore[reportUnknownMemberType] - association_api.create( + association_api.create( # type: ignore[reportUnknownMemberType] "0-3", # to object type deal_id, # to object id "line_items", # from object type @@ -409,23 +412,24 @@ class HubspotClient: def add_product_line_item_to_deal( self, deal_id: str, product_id: str, quantity: int = 1 - ): + ) -> str: # Step 1: Create the line item from product mapping - line_item_id = self.create_line_item_from_product(product_id, quantity) + line_item_id: str = self.create_line_item_from_product(product_id, quantity) # Step 2: Associate the created line item to the deal self.associate_line_item_to_deal(line_item_id, deal_id) return line_item_id - def delete_line_item(self, line_item_id: str): + def delete_line_item(self, line_item_id: str) -> bool: """ Delete (archive) a line item in HubSpot by its ID. """ try: self.logger.info(f"Deleting line item {line_item_id}...") - self.client.crm.line_items.basic_api.archive(line_item_id) + line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api # type: ignore[reportUnknownMemberType] + line_items_api.archive(line_item_id) # type: ignore[reportUnknownMemberType] self.logger.info(f"Line item {line_item_id} deleted successfully.") return True From 2349eba89e7239f7768d021171480c6e06e1cdfd Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 14:38:32 +0000 Subject: [PATCH 05/47] its now perfect --- etl/hubspot/hubspotClient.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 1946bcdf..f93a736c 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -8,6 +8,8 @@ from hubspot.crm.objects import SimplePublicObjectInput # type: ignore[reportMi from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi # type: ignore[reportMissingTypeStubs] from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi # type: ignore[reportMissingTypeStubs] from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi # type: ignore[reportMissingTypeStubs] +from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi # type: ignore[reportMissingTypeStubs] from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi # type: ignore[reportMissingTypeStubs] from hubspot.crm.pipelines.models import ( # type: ignore[reportMissingTypeStubs] CollectionResponsePipelineNoPaging as PipelinesResponse, From f8187634058d05fa7f2961b9980f2ec720911b84 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 14:43:55 +0000 Subject: [PATCH 06/47] make tests work --- test.requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test.requirements.txt b/test.requirements.txt index d8b8b777..4bd89caa 100644 --- a/test.requirements.txt +++ b/test.requirements.txt @@ -4,4 +4,5 @@ pytest-cov pytest-mock dotenv psycopg[binary] -pytest-postgresql \ No newline at end of file +pytest-postgresql +hubspot-api-client \ No newline at end of file From 6e8f29afc8dbd385c9e526f70b43cb4ec9613b04 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 14:52:41 +0000 Subject: [PATCH 07/47] added to rerun --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9268ba25..b470e12c 100644 --- a/README.md +++ b/README.md @@ -39,3 +39,4 @@ pytest --cov-config=model_data/.coveragerc --cov=model_data This will produce the test results and coverage reports + From 8294a80fdfd05be346ffcef8d38331ba5744b2a4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:12:43 +0000 Subject: [PATCH 08/47] change the way the tests are ran as i don't like makefile --- .github/workflows/unit_tests.yml | 18 ++++++++---------- Makefile | 30 ------------------------------ 2 files changed, 8 insertions(+), 40 deletions(-) delete mode 100644 Makefile diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index cc6431b8..91ca7e26 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -14,17 +14,15 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python 3.11 - uses: actions/setup-python@v4 - with: - python-version: '3.11' + - name: Build test image + run: docker build -f Dockerfile.test -t model-test . - - name: Install tox via Makefile - run: | - make setup - - - name: Run tests with tox via Makefile + - name: Run tests env: EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} + run: | - make test \ No newline at end of file + docker run --rm \ + -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ + -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ + model-test pytest diff --git a/Makefile b/Makefile deleted file mode 100644 index 00942acd..00000000 --- a/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# Project Makefile - -PYTHON = python - -.PHONY: setup test lint typecheck check clean - -# Install dev dependencies + tox -setup: - $(PYTHON) -m pip install --upgrade pip - $(PYTHON) -m pip install tox black ruff mypy - -# Run tests (pass ARGS="..." for specific tests) -test: - tox -- $(ARGS) - -# Code formatting check + linting -lint: - ruff . - black --check . - -# Static type checks -typecheck: - mypy . - -# Full quality check (all checks + tests) -check: lint typecheck test - -# Clean up tox environments -clean: - rm -rf .tox From 81d84368cfd88232239ec3c92d8e77e6fc5d8417 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:15:14 +0000 Subject: [PATCH 09/47] we are going to use docker instead --- Dockerfile.test | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 Dockerfile.test diff --git a/Dockerfile.test b/Dockerfile.test new file mode 100644 index 00000000..d566c435 --- /dev/null +++ b/Dockerfile.test @@ -0,0 +1,23 @@ +FROM python:3.11-slim + +# Install PostgreSQL binaries — required by pytest-postgresql to spawn ephemeral test databases +RUN apt-get update \ + && apt-get install -y --no-install-recommends postgresql \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy requirements first so Docker can cache the install layer +COPY backend/engine/requirements.txt backend/engine/requirements.txt +COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt +COPY test.requirements.txt test.requirements.txt + +RUN pip install --no-cache-dir \ + -r backend/engine/requirements.txt \ + -r backend/app/requirements/requirements.txt \ + -r test.requirements.txt + +# Copy source +COPY . . + +CMD ["pytest"] From 7fb8ee9202fc3c739942b96269a657e519a22d13 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:20:15 +0000 Subject: [PATCH 10/47] re run --- Dockerfile.test | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.test b/Dockerfile.test index d566c435..debbfa8b 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -6,6 +6,7 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* WORKDIR /app +ENV PYTHONPATH=/app # Copy requirements first so Docker can cache the install layer COPY backend/engine/requirements.txt backend/engine/requirements.txt From 6f6aa62efee423692dcf6eb332a636c4d7bc6bff Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:26:44 +0000 Subject: [PATCH 11/47] add more requirements --- Dockerfile.test | 2 ++ test.requirements.txt | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile.test b/Dockerfile.test index debbfa8b..6091aa50 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -11,11 +11,13 @@ ENV PYTHONPATH=/app # Copy requirements first so Docker can cache the install layer COPY backend/engine/requirements.txt backend/engine/requirements.txt COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt +COPY asset_list/requirements.txt asset_list/requirements.txt COPY test.requirements.txt test.requirements.txt RUN pip install --no-cache-dir \ -r backend/engine/requirements.txt \ -r backend/app/requirements/requirements.txt \ + -r asset_list/requirements.txt \ -r test.requirements.txt # Copy source diff --git a/test.requirements.txt b/test.requirements.txt index 4bd89caa..936e2f7d 100644 --- a/test.requirements.txt +++ b/test.requirements.txt @@ -5,4 +5,5 @@ pytest-mock dotenv psycopg[binary] pytest-postgresql -hubspot-api-client \ No newline at end of file +hubspot-api-client +fuzzywuzzy \ No newline at end of file From 27f17563d46ecf05a901e092d74b5d6654706179 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:41:43 +0000 Subject: [PATCH 12/47] pytest ini --- .github/workflows/unit_tests.yml | 1 + Dockerfile.test | 2 -- Dockerfile.test.dockerignore | 11 +++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 Dockerfile.test.dockerignore diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 91ca7e26..116bc265 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -20,6 +20,7 @@ jobs: - name: Run tests env: EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} + HUBSPOT_API_KEY: ${{ secrets.HUBSPOT_API_KEY }} run: | docker run --rm \ diff --git a/Dockerfile.test b/Dockerfile.test index 6091aa50..debbfa8b 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -11,13 +11,11 @@ ENV PYTHONPATH=/app # Copy requirements first so Docker can cache the install layer COPY backend/engine/requirements.txt backend/engine/requirements.txt COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt -COPY asset_list/requirements.txt asset_list/requirements.txt COPY test.requirements.txt test.requirements.txt RUN pip install --no-cache-dir \ -r backend/engine/requirements.txt \ -r backend/app/requirements/requirements.txt \ - -r asset_list/requirements.txt \ -r test.requirements.txt # Copy source diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore new file mode 100644 index 00000000..8a846047 --- /dev/null +++ b/Dockerfile.test.dockerignore @@ -0,0 +1,11 @@ +# We need this file otherwise it'll use .dockerignore +# Exclude large/irrelevant directories that are not needed for testing +model_data/local_data/ +backend/node_modules/ +backend/.idea/ +infrastructure/ +data_collection/ +node_modules/ +conservation_areas/ +open_uprn/ +land_registry/ From 08478b17fb838584cc3a63641700da6586d3cfa5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:44:16 +0000 Subject: [PATCH 13/47] run tests --- Dockerfile.test.dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore index 8a846047..4f79c6ee 100644 --- a/Dockerfile.test.dockerignore +++ b/Dockerfile.test.dockerignore @@ -3,6 +3,7 @@ model_data/local_data/ backend/node_modules/ backend/.idea/ +backend/.env infrastructure/ data_collection/ node_modules/ From ad189b4cacf56f2944b3a519cec4fff17b27c7fc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 13 Mar 2026 15:56:13 +0000 Subject: [PATCH 14/47] post gres can't be ran as root --- Dockerfile.test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile.test b/Dockerfile.test index debbfa8b..802eb3a4 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -21,4 +21,8 @@ RUN pip install --no-cache-dir \ # Copy source COPY . . +# pg_ctl refuses to run as root — create an unprivileged user +RUN useradd -m testuser && chown -R testuser /app +USER testuser + CMD ["pytest"] From 1b53b47048500ef30142714c13211f5f740f43a1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 12:37:50 +0000 Subject: [PATCH 15/47] add this in a sensible branch --- backend/address2UPRN/README.md | 14 ++++++++------ backend/address2UPRN/main.py | 9 ++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index 6d26f281..e34e45f6 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -5,10 +5,11 @@ Before you run: Step 1) Get the list and ensure the following columns exists +I believe lower and upper case matter: * Address 1 * Address 2 * Address 3 -* postcode +* Postcode And save it as a .csv file @@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key -task_id = a7b70a02-4df4-45b5-a50b-196e095910bb -sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e +task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 +sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f +s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev { - "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb", - "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e", - "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv" + "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", + "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv" } Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches///.csv diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index d0ba36e6..c458e40d 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -351,9 +351,9 @@ def handler(event, context, local=False): { "body": json.dumps( { - "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", + "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", + "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv", } ) } @@ -441,6 +441,9 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") + df["postcode_clean"] = ( + df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "") + ) clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = { From d3e9fd41e683001f360e042c14c08168b63bc720 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 12:40:44 +0000 Subject: [PATCH 16/47] fixed address 2 uprn now usees POSTCODE --- etl/hubspot/tests/test_hubspot_client_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py index d7cf46fd..a3d8ae54 100644 --- a/etl/hubspot/tests/test_hubspot_client_integration.py +++ b/etl/hubspot/tests/test_hubspot_client_integration.py @@ -14,7 +14,7 @@ class TestHubspotClientIntegration: return HubspotClient() def test_client_initialization(self, client: HubspotClient): - """Test that client initializes successfully with API key.""" + """Checks initialisation of HubspotClient and fails early if env variables is not set""" assert client.access_token is not None assert client.client is not None assert client.logger is not None From 547f50550bf4cba3493e1bdfa94579d9323cb3f5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 12:58:35 +0000 Subject: [PATCH 17/47] readded per khalims request --- Makefile | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..00942acd --- /dev/null +++ b/Makefile @@ -0,0 +1,30 @@ +# Project Makefile + +PYTHON = python + +.PHONY: setup test lint typecheck check clean + +# Install dev dependencies + tox +setup: + $(PYTHON) -m pip install --upgrade pip + $(PYTHON) -m pip install tox black ruff mypy + +# Run tests (pass ARGS="..." for specific tests) +test: + tox -- $(ARGS) + +# Code formatting check + linting +lint: + ruff . + black --check . + +# Static type checks +typecheck: + mypy . + +# Full quality check (all checks + tests) +check: lint typecheck test + +# Clean up tox environments +clean: + rm -rf .tox From 6bfeeeb1b180e50247adad7401222730189860c7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 13:14:16 +0000 Subject: [PATCH 18/47] go back to origional --- backend/address2UPRN/main.py | 272 +++++++++++++++++++++++++++++------ 1 file changed, 226 insertions(+), 46 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index c458e40d..af29a095 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,11 +1,13 @@ -from typing import Optional - from epc_api.client import EpcClient import os from urllib.parse import urlencode import pandas as pd +from difflib import SequenceMatcher from utils.logger import setup_logger +import re +from typing import Set import json +import requests from uuid import UUID import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -16,8 +18,6 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import AddressMatch - logger = setup_logger() @@ -29,6 +29,191 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + +def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> Set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> Set[str]: + return set(s.split()) + + def extract_building_number(s: str) -> str | None: + """ + Extract the main building number (NOT flat/unit). + Assumes formats like: + - '42 moreton road' + - 'flat 3 42 moreton road' + """ + tokens = s.split() + + # remove flat/unit context + cleaned = [] + skip_next = False + for t in tokens: + if t in ("flat", "apt", "apartment", "unit"): + skip_next = True + continue + if skip_next: + skip_next = False + continue + cleaned.append(t) + + # first remaining number is building number + for t in cleaned: + if re.fullmatch(r"\d+[a-z]?", t): + return t + + return None + + a_norm = normalise_address(a) + b_norm = normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # 🔒 HARD GUARD: building number must match + bld_a = extract_building_number(a_norm) + bld_b = extract_building_number(b_norm) + + if bld_a and bld_b and bld_a != bld_b: + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a = tokenise(a_norm) + toks_b = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + 0.35 * char_score, + 4, + ) + + +def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. + + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + # numbering noise + "no": "", + "no.": "", + } + # 1. lowercase + s = s.lower() + + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + + return " ".join(tokens) + + def score_addresses( df: pd.DataFrame, user_address: str, @@ -37,7 +222,7 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: AddressMatch.score(user_address, x)) + return df[column].apply(lambda x: levenshtein(user_address, x)) def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): @@ -129,11 +314,9 @@ def get_uprn_candidates( out = df.copy() - user_norm = AddressMatch.normalise_address(user_address) + user_norm = normalise_address(user_address) - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) + out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -297,10 +480,7 @@ def resolve_uprns_for_postcode_group( def save_results_to_s3( - results_df: pd.DataFrame, - task_id: str, - sub_task_id: str, - bucket_name: Optional[str] = None, + results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None ) -> bool: """ Save results DataFrame to S3 as CSV. @@ -351,9 +531,9 @@ def handler(event, context, local=False): { "body": json.dumps( { - "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", - "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", - "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv", + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", } ) } @@ -441,9 +621,19 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") - df["postcode_clean"] = ( - df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "") - ) + # Create user_input column by concatenating Address columns if not already present + if "user_input" not in df.columns: + df["user_input"] = ( + df["Address 1"].fillna("") + + " " + + df["Address 2"].fillna("") + + " " + + df["Address 3"].fillna("") + ).str.strip() + logger.info(f"Created user_input column from Address 1 and Address 2") + else: + logger.info(f"user_input column already present in data") + clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = { @@ -463,7 +653,7 @@ def handler(event, context, local=False): ) # Validate postcode before processing - if not AddressMatch.is_valid_postcode(postcode): + if not is_valid_postcode(postcode): logger.warning(f"Postcode {postcode} is invalid, skipping") continue @@ -482,67 +672,57 @@ def handler(event, context, local=False): # Process each address in this postcode with the same EPC data for row in postcode_rows: try: - # Concatenate Address columns directly - address2uprn_user_input = ( - str(row.get("Address 1", "")).strip() - + " " - + str(row.get("Address 2", "")).strip() - + " " - + str(row.get("Address 3", "")).strip() - ).strip() - - if not address2uprn_user_input: + user_input = row.get("user_input", "") + if not user_input: logger.warning( - f"Skipping row with missing address components for postcode {postcode}" + f"Skipping row with missing user_input for postcode {postcode}" ) continue # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=address2uprn_user_input, - epc_df=epc_df, - verbose=True, + user_inputed_address=user_input, epc_df=epc_df, verbose=True ) # Parse result tuple if successful if result: uprn, found_address, score = result logger.info( - f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})" + f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" ) results_data.append( { **row, # Include all original data - "address2uprn_uprn": uprn, - "address2uprn_address": found_address, - "address2uprn_lexiscore": score, + "uprn": uprn, + "domna_found_address": found_address, + "domna_lexiscore": score, } ) else: logger.warning( - f"No UPRN found for {address2uprn_user_input} in {postcode}" + f"No UPRN found for {user_input} in {postcode}" ) results_data.append( { **row, # Include all original data - "address2uprn_uprn": None, - "address2uprn_address": None, - "address2uprn_lexiscore": None, + "uprn": None, + "domna_found_address": None, + "domna_lexiscore": None, } ) except Exception as e: logger.error( - f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}" + f"Error processing address {row.get('user_input', 'unknown')}: {e}" ) # Still add the row with error markers results_data.append( { **row, - "address2uprn_uprn": None, - "address2uprn_address": None, - "address2uprn_lexiscore": None, + "uprn": None, + "domna_found_address": None, + "domna_lexiscore": None, "error": str(e), } ) From f69a6151404f9d30c6ff85a91921c5eff563b050 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 17:13:49 +0000 Subject: [PATCH 19/47] revert to old one --- backend/address2UPRN/main.py | 265 ++++++----------------------------- 1 file changed, 41 insertions(+), 224 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index af29a095..d0ba36e6 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,13 +1,11 @@ +from typing import Optional + from epc_api.client import EpcClient import os from urllib.parse import urlencode import pandas as pd -from difflib import SequenceMatcher from utils.logger import setup_logger -import re -from typing import Set import json -import requests from uuid import UUID import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -18,6 +16,8 @@ from utils.s3 import ( ) from datetime import datetime +from backend.utils.addressMatch import AddressMatch + logger = setup_logger() @@ -29,191 +29,6 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") -def is_valid_postcode(postcode_clean: str) -> bool: - """ - Validate postcode using postcodes.io. - - Expects a sanitised postcode (e.g. E84SQ). - Returns True if valid, False otherwise. - """ - POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" - if not postcode_clean: - return False - - try: - resp = requests.get( - POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), - timeout=5, - ) - resp.raise_for_status() - return resp.json().get("result", False) - except requests.RequestException: - # Network issues, rate limits, etc. - return False - - -def levenshtein(a: str, b: str) -> float: - """ - Address similarity score in [0, 1]. - - Strategy: - - Normalise - - Strongly penalise mismatched house/flat numbers - - Combine token overlap + character similarity - """ - - def extract_number_sequence(s: str) -> list[str]: - return re.findall(r"\d+[a-z]?", s) - - def extract_numbers(s: str) -> Set[str]: - return set(extract_number_sequence(s)) - - def tokenise(s: str) -> Set[str]: - return set(s.split()) - - def extract_building_number(s: str) -> str | None: - """ - Extract the main building number (NOT flat/unit). - Assumes formats like: - - '42 moreton road' - - 'flat 3 42 moreton road' - """ - tokens = s.split() - - # remove flat/unit context - cleaned = [] - skip_next = False - for t in tokens: - if t in ("flat", "apt", "apartment", "unit"): - skip_next = True - continue - if skip_next: - skip_next = False - continue - cleaned.append(t) - - # first remaining number is building number - for t in cleaned: - if re.fullmatch(r"\d+[a-z]?", t): - return t - - return None - - a_norm = normalise_address(a) - b_norm = normalise_address(b) - - # --- hard signal: numbers --- - nums_a = extract_numbers(a_norm) - nums_b = extract_numbers(b_norm) - - if nums_a and not nums_b: - return 0.0 - - # No shared numbers at all → impossible match - if nums_a and nums_b and nums_a.isdisjoint(nums_b): - return 0.0 - - # 🔒 HARD GUARD: building number must match - bld_a = extract_building_number(a_norm) - bld_b = extract_building_number(b_norm) - - if bld_a and bld_b and bld_a != bld_b: - return 0.0 - - # --- order-sensitive flat/building guard --- - seq_a = extract_number_sequence(a_norm) - seq_b = extract_number_sequence(b_norm) - - has_flat_token_user = any( - tok in a_norm for tok in ("flat", "apt", "apartment", "unit") - ) - has_flat_token_epc = "flat" in b_norm - - if ( - len(seq_a) == 2 - and len(seq_b) >= 2 - and has_flat_token_epc - and not has_flat_token_user - and seq_a != seq_b[:2] - ): - return 0.0 - - # --- token similarity (order-independent) --- - toks_a = tokenise(a_norm) - toks_b = tokenise(b_norm) - - if not toks_a or not toks_b: - token_score = 0.0 - else: - token_score = len(toks_a & toks_b) / len(toks_a | toks_b) - - # --- character similarity (soft signal) --- - char_score = SequenceMatcher(None, a_norm, b_norm).ratio() - - # --- weighted blend --- - return round( - 0.65 * token_score + 0.35 * char_score, - 4, - ) - - -def normalise_address(s: str) -> str: - """ - Canonical UK-focused address normalisation. - - - Lowercases - - Removes punctuation (keeps / for flats) - - Normalises whitespace - - Applies synonym compression at token level - """ - - if not s: - return "" - - ADDRESS_SYNONYMS = { - # street types - "rd": "road", - "rd.": "road", - "st": "street", - "st.": "street", - "ave": "avenue", - "ave.": "avenue", - "ln": "lane", - "ln.": "lane", - "cres": "crescent", - "ct": "court", - "dr": "drive", - # flats / units - "apt": "flat", - "apartment": "flat", - "unit": "flat", - "ste": "suite", - # numbering noise - "no": "", - "no.": "", - } - # 1. lowercase - s = s.lower() - - # 1.5 split digit-letter suffixes - s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) - - # 2. remove punctuation except / - s = re.sub(r"[^\w\s/]", " ", s) - - # 3. normalise whitespace - s = re.sub(r"\s+", " ", s).strip() - - # 4. tokenise + synonym normalisation - tokens = [] - for tok in s.split(): - replacement = ADDRESS_SYNONYMS.get(tok, tok) - if replacement: - tokens.append(replacement) - - return " ".join(tokens) - - def score_addresses( df: pd.DataFrame, user_address: str, @@ -222,7 +37,7 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: levenshtein(user_address, x)) + return df[column].apply(lambda x: AddressMatch.score(user_address, x)) def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): @@ -314,9 +129,11 @@ def get_uprn_candidates( out = df.copy() - user_norm = normalise_address(user_address) + user_norm = AddressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -480,7 +297,10 @@ def resolve_uprns_for_postcode_group( def save_results_to_s3( - results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None + results_df: pd.DataFrame, + task_id: str, + sub_task_id: str, + bucket_name: Optional[str] = None, ) -> bool: """ Save results DataFrame to S3 as CSV. @@ -533,7 +353,7 @@ def handler(event, context, local=False): { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", } ) } @@ -621,19 +441,6 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") - # Create user_input column by concatenating Address columns if not already present - if "user_input" not in df.columns: - df["user_input"] = ( - df["Address 1"].fillna("") - + " " - + df["Address 2"].fillna("") - + " " - + df["Address 3"].fillna("") - ).str.strip() - logger.info(f"Created user_input column from Address 1 and Address 2") - else: - logger.info(f"user_input column already present in data") - clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = { @@ -653,7 +460,7 @@ def handler(event, context, local=False): ) # Validate postcode before processing - if not is_valid_postcode(postcode): + if not AddressMatch.is_valid_postcode(postcode): logger.warning(f"Postcode {postcode} is invalid, skipping") continue @@ -672,57 +479,67 @@ def handler(event, context, local=False): # Process each address in this postcode with the same EPC data for row in postcode_rows: try: - user_input = row.get("user_input", "") - if not user_input: + # Concatenate Address columns directly + address2uprn_user_input = ( + str(row.get("Address 1", "")).strip() + + " " + + str(row.get("Address 2", "")).strip() + + " " + + str(row.get("Address 3", "")).strip() + ).strip() + + if not address2uprn_user_input: logger.warning( - f"Skipping row with missing user_input for postcode {postcode}" + f"Skipping row with missing address components for postcode {postcode}" ) continue # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=user_input, epc_df=epc_df, verbose=True + user_inputed_address=address2uprn_user_input, + epc_df=epc_df, + verbose=True, ) # Parse result tuple if successful if result: uprn, found_address, score = result logger.info( - f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" + f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})" ) results_data.append( { **row, # Include all original data - "uprn": uprn, - "domna_found_address": found_address, - "domna_lexiscore": score, + "address2uprn_uprn": uprn, + "address2uprn_address": found_address, + "address2uprn_lexiscore": score, } ) else: logger.warning( - f"No UPRN found for {user_input} in {postcode}" + f"No UPRN found for {address2uprn_user_input} in {postcode}" ) results_data.append( { **row, # Include all original data - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, } ) except Exception as e: logger.error( - f"Error processing address {row.get('user_input', 'unknown')}: {e}" + f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}" ) # Still add the row with error markers results_data.append( { **row, - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, "error": str(e), } ) From fc425b8b66d38305967e22d4a040a316848ddf35 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 17 Mar 2026 17:18:09 +0000 Subject: [PATCH 20/47] better comments --- etl/hubspot/hubspotClient.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index f93a736c..ed456478 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -162,7 +162,7 @@ class HubspotClient: response: AssociationsPageResponse = associations_api.get_page( # type: ignore[reportUnknownMemberType] object_type="deals", object_id=deal_id, - to_object_type="0-420", # <-- use your exact custom object name slug here + to_object_type="0-420", # <-- to get an listing object limit=1, ) @@ -373,9 +373,7 @@ class HubspotClient: properties: dict[str, str] = cast(dict[str, str], product.properties) # type: ignore[reportUnknownMemberType] name: str = properties.get("name") or "" - price: str = ( - properties.get("price") or properties.get("hs_price") or "0" - ) + price: str = properties.get("price") or properties.get("hs_price") or "0" # Build line item payload line_item_input = SimplePublicObjectInput( From e01b7225bbd62d501faeb85982ad873c6b26eedd Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 24 Mar 2026 13:01:56 +0000 Subject: [PATCH 21/47] save --- asset_list/app.py | 19 +++++++++---------- backend/address2UPRN/README.md | 15 ++++++++------- .../scripts/combine_address2uprn_outputs.py | 12 ++++++------ sfr/principal_pitch/2_export_data.py | 6 +++--- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/asset_list/app.py b/asset_list/app.py index 7858146d..02c94f10 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -74,24 +74,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - # data_filename = "For Modelling - Final - reviewed.xlsx" - data_filename = "assests.xlsx" - sheet_name = "Sheet1" - postcode_column = "POSTCODE" - address1_column = "ADDRESS" + data_filename = "Calico ARA Upload Review.xlsx" + sheet_name = "Upload to Ara - Needs Sign Off" + postcode_column = "Postcode" + address1_column = "Address 1" address1_method = None - fulladdress_column = "ADDRESS" + fulladdress_column = "Address 1" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None + landlord_os_uprn = "ara_found_uprn" + landlord_property_type = "Property Type" + landlord_built_form = "Property Type" landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "UPRN" + landlord_property_id = "Asset Reference" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md index e34e45f6..646fec01 100644 --- a/backend/address2UPRN/README.md +++ b/backend/address2UPRN/README.md @@ -9,7 +9,7 @@ I believe lower and upper case matter: * Address 1 * Address 2 * Address 3 -* Postcode +* postcode And save it as a .csv file @@ -24,18 +24,19 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key -task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 -sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f -s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv +task_id = ea615ac3-ac28-46c4-8bff-2431c5b9c13d +sub_task_id = 85a23b67-8f18-4299-9bf0-69bfb87adbc7 +s3 => s3://retrofit-data-dev/ara_raw_inputs/eon/North Tyneside Council.csv Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev { - "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09", - "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f", - "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv" + "task_id": "ea615ac3-ac28-46c4-8bff-2431c5b9c13d", + "sub_task_id": "85a23b67-8f18-4299-9bf0-69bfb87adbc7", + "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/eon/eon(Sheet1).csv" } + Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches///.csv outputs of address2uprn ( which is automatically triggered on postcodesplitter) will be saved on retrofit-data-dev/ara_raw_outputs///.csv diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py index be17f610..f065c676 100644 --- a/backend/scripts/combine_address2uprn_outputs.py +++ b/backend/scripts/combine_address2uprn_outputs.py @@ -55,11 +55,11 @@ def main(task_id, output): print(f"Total rows: {len(combined)}") -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("task_id", help="Task ID folder in S3") - parser.add_argument("--output", default="combined.csv") +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument("task_id", help="Task ID folder in S3") +# parser.add_argument("--output", default="combined.csv") - args = parser.parse_args() +# args = parser.parse_args() - main(args.task_id, args.output) +# main(args.task_id, args.output) diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 519636be..df54749e 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,10 +28,10 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 581 -SCENARIOS = [1124] +PORTFOLIO_ID = 633 +SCENARIOS = [1146] scenario_names = { - 1124: "EPC C - Solar Focused", + 1146: "Most Economic", } project_name = "WCHG EPC D rated properties" From a362e1dd99f83352414e5663679ebbc125740716 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 24 Mar 2026 15:34:33 +0000 Subject: [PATCH 22/47] added company information --- backend/app/db/models/organisation.py | 13 +++++ etl/hubspot/hubspotDataTodB.py | 50 +++++++++++++++++++ .../scripts/onboarding/new_organisation.py | 19 +++++++ 3 files changed, 82 insertions(+) create mode 100644 backend/app/db/models/organisation.py create mode 100644 etl/hubspot/hubspotDataTodB.py create mode 100644 etl/hubspot/scripts/onboarding/new_organisation.py diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py new file mode 100644 index 00000000..774a05af --- /dev/null +++ b/backend/app/db/models/organisation.py @@ -0,0 +1,13 @@ +from sqlmodel import SQLModel, Field +from datetime import datetime, timezone +from typing import Optional +import uuid + + +class Organisation(SQLModel, table=True): + __tablename__ = "organisation" + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + hubspot_company_id: Optional[str] = None + name: Optional[str] = None diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py new file mode 100644 index 00000000..24df240e --- /dev/null +++ b/etl/hubspot/hubspotDataTodB.py @@ -0,0 +1,50 @@ +from backend.app.db.connection import db_session +from backend.app.db.models.organisation import Organisation +from sqlmodel import select +from datetime import datetime, timezone +from typing import TypedDict + + +class CompanyData(TypedDict): + hs_object_id: str + name: str + + +class HubspotDataToDb: + def __init__(self): + pass + + def read_org_table(self, limit: int = 10): + with db_session() as session: + records = session.exec(select(Organisation).limit(limit)).all() + return records + + def upsert_company(self, company_data: CompanyData) -> Organisation: + """Upserts a company record. Updates if hubspot_company_id exists, otherwise creates new.""" + with db_session() as session: + hubspot_id = company_data.get("hs_object_id") + company_name = company_data.get("name") + + # Check if company already exists + existing = session.exec( + select(Organisation).where( + Organisation.hubspot_company_id == hubspot_id + ) + ).first() + + if existing: + # Update existing record + existing.name = company_name + existing.updated_at = datetime.now(timezone.utc) + session.add(existing) + record = existing + else: + # Create new record + record = Organisation( + hubspot_company_id=hubspot_id, + name=company_name, + ) + session.add(record) + + session.commit() + return record diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py new file mode 100644 index 00000000..f5faead3 --- /dev/null +++ b/etl/hubspot/scripts/onboarding/new_organisation.py @@ -0,0 +1,19 @@ +from etl.hubspot.hubspotClient import HubspotClient, Companies + +from etl.hubspot.hubspotDataTodB import HubspotDataToDb + +hubspot = HubspotClient() + +companies_to_add_or_ensure_it_exists = [ + Companies.THE_GUINESS_PARTNERSHIP, + Companies.SOUTHERN_HOUSING_GROUP, +] + +for company in companies_to_add_or_ensure_it_exists: + company_info = hubspot.get_company_information(company.value) + company_info + break + +dbRead = HubspotDataToDb() + +dbRead.read_org_table() From 29ab9ecfd778940cfac77161dbb8859c9fba9394 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 24 Mar 2026 15:43:10 +0000 Subject: [PATCH 23/47] added hubspot company data --- etl/hubspot/hubspotClient.py | 5 +++-- etl/hubspot/hubspotDataTodB.py | 11 ++++++++--- etl/hubspot/scripts/onboarding/new_organisation.py | 14 +++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index ed456478..c87ea872 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -25,6 +25,7 @@ from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTy ForwardPaging as AssociationsPaging, NextPage as AssociationsPagingNext, ) +from etl.hubspot.hubspotDataTodB import CompanyData from backend.app.config import get_settings @@ -223,7 +224,7 @@ class HubspotClient: return deal, company, listing - def get_company_information(self, company_id: str) -> dict[str, str]: + def get_company_information(self, company_id: str) -> CompanyData: companies_api: CompaniesBasicApi = self.client.crm.companies.basic_api # type: ignore[reportUnknownMemberType] company: HubspotObject = companies_api.get_by_id( # type: ignore[reportUnknownMemberType] @@ -233,7 +234,7 @@ class HubspotClient: ], ) - company_info: dict[str, str] = cast(dict[str, str], company.properties) # type: ignore[reportUnknownMemberType] + company_info: CompanyData = company.properties # type: ignore[reportUnknownMemberType] return company_info def get_all_pipelines(self) -> list[dict[str, str]]: diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 24df240e..8fe61a3e 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -1,4 +1,4 @@ -from backend.app.db.connection import db_session +from backend.app.db.connection import db_read_session from backend.app.db.models.organisation import Organisation from sqlmodel import select from datetime import datetime, timezone @@ -15,13 +15,18 @@ class HubspotDataToDb: pass def read_org_table(self, limit: int = 10): - with db_session() as session: + with db_read_session() as session: records = session.exec(select(Organisation).limit(limit)).all() return records + def get_org_names(self, limit: int = 10) -> list[str]: + """Returns a list of organisation names.""" + records = self.read_org_table(limit) + return [org.name for org in records if org.name] + def upsert_company(self, company_data: CompanyData) -> Organisation: """Upserts a company record. Updates if hubspot_company_id exists, otherwise creates new.""" - with db_session() as session: + with db_read_session() as session: hubspot_id = company_data.get("hs_object_id") company_name = company_data.get("name") diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py index f5faead3..5a11266f 100644 --- a/etl/hubspot/scripts/onboarding/new_organisation.py +++ b/etl/hubspot/scripts/onboarding/new_organisation.py @@ -1,19 +1,19 @@ from etl.hubspot.hubspotClient import HubspotClient, Companies -from etl.hubspot.hubspotDataTodB import HubspotDataToDb +from etl.hubspot.hubspotDataTodB import HubspotDataToDb, CompanyData hubspot = HubspotClient() - +dbRead = HubspotDataToDb() companies_to_add_or_ensure_it_exists = [ Companies.THE_GUINESS_PARTNERSHIP, Companies.SOUTHERN_HOUSING_GROUP, ] for company in companies_to_add_or_ensure_it_exists: - company_info = hubspot.get_company_information(company.value) - company_info - break + company_info: CompanyData = hubspot.get_company_information(company.value) + dbRead.upsert_company(company_info) + dbRead = HubspotDataToDb() - -dbRead.read_org_table() +names = dbRead.get_org_names() +print(f"Organisations in database: {names}") From da039b91b2f3340ba27048af9b0f27004e1378b7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 24 Mar 2026 15:55:17 +0000 Subject: [PATCH 24/47] hubspot etl for organisation complete --- etl/hubspot/hubspotClient.py | 1 + etl/hubspot/scripts/onboarding/new_organisation.py | 1 + 2 files changed, 2 insertions(+) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index c87ea872..6fd11bed 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -43,6 +43,7 @@ class Companies(Enum): HOMEGROUP = "94946071794" APPLE = "184769046716" THE_GUINESS_PARTNERSHIP = "86970043613" + CALICO_HOMES = "86975437046" class DealStage(Enum): diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py index 5a11266f..f2ff8bda 100644 --- a/etl/hubspot/scripts/onboarding/new_organisation.py +++ b/etl/hubspot/scripts/onboarding/new_organisation.py @@ -7,6 +7,7 @@ dbRead = HubspotDataToDb() companies_to_add_or_ensure_it_exists = [ Companies.THE_GUINESS_PARTNERSHIP, Companies.SOUTHERN_HOUSING_GROUP, + Companies.CALICO_HOMES, ] for company in companies_to_add_or_ensure_it_exists: From 934e666357d7339131b70548651440ae30e9ccf2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 14:41:38 +0000 Subject: [PATCH 25/47] added hubspot to add one deal with deal id --- asset_list/app.py | 16 +- backend/app/db/models/organisation.py | 47 ++- etl/hubspot/hubspotDataTodB.py | 290 +++++++++++++++++- etl/hubspot/requirements.txt | 2 +- etl/hubspot/s3_uploader.py | 116 +++++++ etl/hubspot/scripts/scraper/README.md | 15 + etl/hubspot/scripts/scraper/__init__.py | 0 .../scripts/scraper/handler/Dockerfile | 38 +++ .../scripts/scraper/handler/requirements.txt | 12 + .../scraper/local_handler/docker-compose.yml | 11 + .../local_handler/invoke_local_lambda.py | 28 ++ .../scraper/local_handler/run_local.sh | 2 + etl/hubspot/scripts/scraper/main.py | 45 +++ 13 files changed, 610 insertions(+), 12 deletions(-) create mode 100644 etl/hubspot/s3_uploader.py create mode 100644 etl/hubspot/scripts/scraper/README.md create mode 100644 etl/hubspot/scripts/scraper/__init__.py create mode 100644 etl/hubspot/scripts/scraper/handler/Dockerfile create mode 100644 etl/hubspot/scripts/scraper/handler/requirements.txt create mode 100644 etl/hubspot/scripts/scraper/local_handler/docker-compose.yml create mode 100644 etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py create mode 100644 etl/hubspot/scripts/scraper/local_handler/run_local.sh create mode 100644 etl/hubspot/scripts/scraper/main.py diff --git a/asset_list/app.py b/asset_list/app.py index 02c94f10..5794eaf3 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -75,22 +75,22 @@ def app(): data_folder = "/workspaces/model/asset_list" data_filename = "Calico ARA Upload Review.xlsx" - sheet_name = "Upload to Ara - Needs Sign Off" + sheet_name = "Sheet1" postcode_column = "Postcode" - address1_column = "Address 1" + address1_column = "Units" address1_method = None - fulladdress_column = "Address 1" - address_cols_to_concat = [] + fulladdress_column = "Units" + address_cols_to_concat = ["Units"] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "ara_found_uprn" - landlord_property_type = "Property Type" - landlord_built_form = "Property Type" + landlord_os_uprn = None + landlord_property_type = None # Good to include if landlord gave + landlord_built_form = None # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Asset Reference" + landlord_property_id = "llid" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py index 774a05af..a3c79e3c 100644 --- a/backend/app/db/models/organisation.py +++ b/backend/app/db/models/organisation.py @@ -1,6 +1,8 @@ -from sqlmodel import SQLModel, Field +from sqlmodel import SQLModel, Field, Column, text from datetime import datetime, timezone from typing import Optional +from sqlalchemy import DateTime +from sqlalchemy.sql import func import uuid @@ -11,3 +13,46 @@ class Organisation(SQLModel, table=True): updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) hubspot_company_id: Optional[str] = None name: Optional[str] = None + + +class HubspotDealData(SQLModel, table=True): + __tablename__ = "hubspot_deal_data" + + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + + # HubSpot Deal identifiers + deal_id: str = Field(index=True, nullable=False) + dealname: Optional[str] = Field(default=None) + dealstage: Optional[str] = Field(default=None) + company_id: Optional[str] = Field(default=None) + project_code: Optional[str] = Field(default=None) + + # HubSpot custom properties + landlord_property_id: Optional[str] = Field(default=None) + uprn: Optional[str] = Field(default=None) + outcome: Optional[str] = Field(default=None) + outcome_notes: Optional[str] = Field(default=None) + + major_condition_issue_description: Optional[str] = Field(default=None) + major_condition_issue_photos: Optional[str] = Field(default=None) + major_condition_issue_evidence_s3_url: Optional[str] = Field(default=None) + + coordination_status: Optional[str] = Field(default=None) + design_status: Optional[str] = Field(default=None) + + created_at: datetime = Field( + sa_column=Column( + DateTime(timezone=True), + server_default=text("NOW() AT TIME ZONE 'utc'"), + nullable=False, + ) + ) + + updated_at: datetime = Field( + sa_column=Column( + DateTime(timezone=True), + server_default=text("NOW() AT TIME ZONE 'utc'"), + onupdate=func.now(), + nullable=False, + ) + ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 8fe61a3e..4ed579e9 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -1,8 +1,10 @@ from backend.app.db.connection import db_read_session -from backend.app.db.models.organisation import Organisation +from backend.app.db.models.organisation import Organisation, HubspotDealData from sqlmodel import select from datetime import datetime, timezone from typing import TypedDict +from etl.hubspot.s3_uploader import S3Uploader +import hashlib class CompanyData(TypedDict): @@ -12,7 +14,7 @@ class CompanyData(TypedDict): class HubspotDataToDb: def __init__(self): - pass + self.s3 = S3Uploader() def read_org_table(self, limit: int = 10): with db_read_session() as session: @@ -53,3 +55,287 @@ class HubspotDataToDb: session.commit() return record + + ### + # Check from here + ### + + def new_record_to_hubspot_data(self, deal_data, company, listing, hubspot_client): + print("⚠️ Deprecated — use the new interface instead.") + return self.upsert_hubspot_deal(deal_data, company, listing, hubspot_client) + + def find_all_deals_with_company_id(self, company_id): + """Returns a list of deals for a given company_id.""" + with db_read_session() as session: + return ( + session.query(HubspotDealData) + .filter(HubspotDealData.company_id == company_id) + .all() + ) + + def find_deal_with_deal_id(self, deal_id): + with db_read_session() as session: + return ( + session.query(HubspotDealData) + .filter(HubspotDealData.deal_id == deal_id) + .one_or_none() + ) + + def _sha256(self, file_path: str) -> str: + """Compute SHA-256 checksum of a file.""" + sha256 = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + sha256.update(chunk) + return sha256.hexdigest() + + def update_deal(self, deal_in_db, hubspot_client): + """ + Checks if a deal needs updating and syncs it with HubSpot. + Also handles major_condition_issue_photos file upload to S3 with integrity check. + """ + + def soft_assert(condition, message="Assertion Failed"): + if not condition: + print(f"⚠️ Soft Assert Failed: {message}") + return False + return True + + print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") + + hs_deal, hs_company_id, hs_listing = hubspot_client.get_deal_info_for_db( + deal_in_db.deal_id + ) + + # Soft compare key fields + checks = [ + soft_assert( + deal_in_db.deal_id == hs_deal.get("hs_object_id"), "deal_id mismatch" + ), + soft_assert(deal_in_db.company_id == hs_company_id, "company_id mismatch"), + soft_assert( + deal_in_db.landlord_property_id == hs_listing.get("owner_property_id"), + "landlord_property_id mismatch", + ), + soft_assert( + deal_in_db.outcome == hs_deal.get("outcome"), "outcome mismatch" + ), + soft_assert( + deal_in_db.dealstage == hs_deal.get("dealstage"), "dealstage mismatch" + ), + soft_assert( + deal_in_db.dealname == hs_deal.get("dealname"), "dealname mismatch" + ), + soft_assert( + deal_in_db.project_code == hs_deal.get("project_code"), + "project_code mismatch", + ), + soft_assert( + deal_in_db.uprn == hs_listing.get("national_uprn"), "uprn mismatch" + ), + soft_assert( + deal_in_db.outcome_notes == hs_deal.get("outcome_notes"), + "outcome_notes mismatch", + ), + soft_assert( + deal_in_db.major_condition_issue_description + == hs_deal.get("major_condition_issue_description"), + "major condition description mismatch", + ), + soft_assert( + deal_in_db.major_condition_issue_photos + == hs_deal.get("major_condition_issue_photos"), + "major condition issue photos mismatch", + ), + soft_assert( + deal_in_db.coordination_status + == hs_deal.get("coordination_status__stage_1_"), + "coordination stage 1 status mismatch", + ), + soft_assert( + deal_in_db.design_status == hs_deal.get("retrofit_design_status"), + "retrofit design mismatch", + ), + ] + + # If discrepancies found, update from HubSpot + if not all(checks): + print( + f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot." + ) + self.upsert_hubspot_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) + return False + + # Handle photo upload if it exists but S3 URL is missing + if ( + deal_in_db.major_condition_issue_photos + and not deal_in_db.major_condition_issue_evidence_s3_url + ): + print( + f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..." + ) + + photo_url = hs_deal.get("major_condition_issue_photos") + if photo_url: + try: + # Download from HubSpot using fresh URL from hs_deal (not stale DB URL) + local_file = hubspot_client.download_file_from_url(photo_url) + + # Upload to S3 + bucket = "retrofit-data-dev" + s3_url = self.s3.upload_file( + local_file, bucket, prefix="hubspot/awaabs_law_evidence/" + ) + + # Download again to verify integrity + downloaded = self.s3.download_from_url(s3_url) + if self._sha256(local_file) == self._sha256(downloaded): + print("✅ SHA256 match verified — upload successful.") + else: + print("❌ SHA256 mismatch — integrity check failed.") + raise ValueError("File integrity check failed after S3 upload.") + + # Update DB record with S3 URL + with db_read_session() as session: + db_record = session.get(HubspotDealData, deal_in_db.id) + db_record.major_condition_issue_evidence_s3_url = s3_url + session.add(db_record) + session.commit() + print( + f"✅ Updated DB with S3 URL for deal_id={deal_in_db.deal_id}" + ) + return False + except Exception as e: + print( + f"⚠️ Failed to download/upload photo for deal_id {deal_in_db.deal_id}: {e}" + ) + # Continue without the file — don't crash the entire update + else: + print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") + + else: + print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") + + return True + + def upsert_hubspot_deal(self, deal_data, company, listing, hubspot_client): + """ + Inserts or updates a deal record. + Also uploads photos if present and adds S3 URL. + """ + with db_read_session() as session: + deal_id = deal_data.get("hs_object_id") + + statement = select(HubspotDealData).where( + HubspotDealData.deal_id == deal_id + ) + existing = session.exec(statement).first() + + if existing: + print(f"🔄 Updating existing deal (deal_id={deal_id})") + + for attr, value in { + "dealname": deal_data.get("dealname"), + "dealstage": deal_data.get("dealstage"), + "landlord_property_id": listing.get("owner_property_id"), + "uprn": listing.get("national_uprn"), + "outcome": deal_data.get("outcome"), + "outcome_notes": deal_data.get("outcome_notes"), + "project_code": deal_data.get("project_code"), + "company_id": company, + "major_condition_issue_description": deal_data.get( + "major_condition_issue_description" + ), + "major_condition_issue_photos": deal_data.get( + "major_condition_issue_photos" + ), + "major_condition_issue_description": deal_data.get( + "major_condition_issue_description" + ), + "major_condition_issue_photos": deal_data.get( + "major_condition_issue_photos" + ), + "coordination_status": deal_data.get( + "coordination_status__stage_1_" + ), + "design_status": deal_data.get("retrofit_design_status"), + }.items(): + setattr(existing, attr, value or getattr(existing, attr)) + + # Upload if photo exists but S3 link missing + if ( + existing.major_condition_issue_photos + and not existing.major_condition_issue_evidence_s3_url + ): + # Fetch fresh URL from HubSpot instead of using potentially expired stored URL + fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id) + photo_url = fresh_deal.get("major_condition_issue_photos") + + if photo_url: + try: + local_file = hubspot_client.download_file_from_url( + photo_url + ) + s3_url = self.s3.upload_file( + local_file, + "retrofit-data-dev", + prefix="hubspot/awaabs_law_evidence/", + ) + existing.major_condition_issue_evidence_s3_url = s3_url + except Exception as e: + print( + f"⚠️ Failed to download photo for deal_id {existing.deal_id}: {e}" + ) + # Continue without the file — don't crash the update + else: + print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}") + + session.add(existing) + session.commit() + session.refresh(existing) + return existing + + else: + print(f"🆕 Inserting new deal (deal_id={deal_id})") + new_record = HubspotDealData( + deal_id=deal_id, + dealname=deal_data.get("dealname"), + dealstage=deal_data.get("dealstage"), + landlord_property_id=listing.get("owner_property_id"), + uprn=listing.get("national_uprn"), + outcome=deal_data.get("outcome"), + outcome_notes=deal_data.get("outcome_notes"), + project_code=deal_data.get("project_code"), + company_id=company, + major_condition_issue_description=deal_data.get( + "major_condition_issue_description" + ), + major_condition_issue_photos=deal_data.get( + "major_condition_issue_photos" + ), + coordination_status=deal_data.get("coordination_status__stage_1_"), + design_status=deal_data.get("retrofit_design_status"), + ) + + # Handle upload at insert time + if new_record.major_condition_issue_photos: + try: + local_file = hubspot_client.download_file_from_url( + new_record.major_condition_issue_photos + ) + s3_url = self.s3.upload_file( + local_file, + "retrofit-data-dev", + prefix="hubspot/awaabs_law_evidence/", + ) + new_record.major_condition_issue_evidence_s3_url = s3_url + except Exception as e: + print( + f"⚠️ Failed to download photo for deal_id {new_record.deal_id}: {e}" + ) + # Continue without the file — don't crash the insert + + session.add(new_record) + session.commit() + session.refresh(new_record) + return new_record diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt index ef8e3ebc..44a58f77 100644 --- a/etl/hubspot/requirements.txt +++ b/etl/hubspot/requirements.txt @@ -1 +1 @@ -hubspot-api-client \ No newline at end of file +hubspot-api-client diff --git a/etl/hubspot/s3_uploader.py b/etl/hubspot/s3_uploader.py new file mode 100644 index 00000000..0d217bd2 --- /dev/null +++ b/etl/hubspot/s3_uploader.py @@ -0,0 +1,116 @@ +import os +import boto3 +from botocore.exceptions import ClientError +from urllib.parse import urlparse +from datetime import datetime +import requests + + +class S3Uploader: + """ + Simple helper to upload local files to S3 and return their S3 HTTPS URI. + """ + + def __init__( + self, + aws_access_key: str = "AKIAU5A36PPNK7RXX52V", + aws_secret_key: str = "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", + region: str = "eu-west-2", + ): + self.aws_access_key = aws_access_key + self.aws_secret_key = aws_secret_key + self.region = region + + self.s3 = boto3.client( + "s3", + aws_access_key_id=self.aws_access_key, + aws_secret_access_key=self.aws_secret_key, + region_name=self.region, + ) + + def upload_file(self, file_path: str, bucket: str, prefix: str = "uploads/") -> str: + """ + Upload a local file to an S3 bucket and return its HTTPS URI. + + Args: + file_path (str): Path to the local file. + bucket (str): S3 bucket name. + prefix (str): Folder/prefix in the bucket. + + Returns: + str: HTTPS-style S3 URI (not signed). + """ + try: + filename = os.path.basename(file_path) + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + s3_key = os.path.join(prefix, f"{timestamp}_{filename}") + + self.s3.upload_file(file_path, bucket, s3_key) + + s3_uri = f"https://{bucket}.s3.{self.region}.amazonaws.com/{s3_key}" + return s3_uri + + except ClientError as e: + raise RuntimeError(f"❌ S3 upload failed: {e}") + + def print_bucket(self): + print(self.s3.head_bucket(Bucket="retrofit-data-dev")) + + def generate_presigned_url( + self, bucket: str, key: str, expires_in: int = 3600 + ) -> str: + """ + Generate a temporary presigned URL for an S3 object. + """ + try: + return self.s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket, "Key": key}, + ExpiresIn=expires_in, + ) + except ClientError as e: + raise RuntimeError(f"❌ Failed to generate signed URL: {e}") + + def download_from_url( + self, s3_url: str, local_dir: str = ".", expires_in: int = 3600 + ) -> str: + """ + Download a file from a public or private S3 URL. + If private, generates a presigned URL first. + + Args: + s3_url (str): Full S3 HTTPS URL (e.g., https://bucket.s3.region.amazonaws.com/path/file.txt) + local_dir (str): Folder to save the file in. + expires_in (int): Presigned URL lifetime (seconds). + + Returns: + str: Local file path of the downloaded file. + """ + parsed = urlparse(s3_url) + host_parts = parsed.netloc.split(".") + if len(host_parts) < 3 or host_parts[1] != "s3": + raise ValueError("❌ Not a valid S3 HTTPS URL") + + bucket = host_parts[0] + key = parsed.path.lstrip("/") + + # Generate presigned URL (whether public or private) + presigned_url = self.generate_presigned_url(bucket, key, expires_in) + + filename = os.path.basename(key) + local_path = os.path.join(local_dir, filename) + + try: + response = requests.get(presigned_url, stream=True) + response.raise_for_status() + + os.makedirs(local_dir, exist_ok=True) + with open(local_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + print(f"✅ Downloaded: {local_path}") + return local_path + + except requests.exceptions.RequestException as e: + raise RuntimeError(f"❌ Failed to download file: {e}") diff --git a/etl/hubspot/scripts/scraper/README.md b/etl/hubspot/scripts/scraper/README.md new file mode 100644 index 00000000..2d7fe975 --- /dev/null +++ b/etl/hubspot/scripts/scraper/README.md @@ -0,0 +1,15 @@ +Input: + + + + +Function: + + + + +Used in: + +when changes are made in hubspot, this will trigger a workflow in make. + +This in turn will trigger this sqs which I'm building from this directory \ No newline at end of file diff --git a/etl/hubspot/scripts/scraper/__init__.py b/etl/hubspot/scripts/scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/hubspot/scripts/scraper/handler/Dockerfile b/etl/hubspot/scripts/scraper/handler/Dockerfile new file mode 100644 index 00000000..bbcc3e22 --- /dev/null +++ b/etl/hubspot/scripts/scraper/handler/Dockerfile @@ -0,0 +1,38 @@ +FROM public.ecr.aws/lambda/python:3.10 +# FROM python:3.11.10-bullseye + + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + + +# Set working directory (Lambda task root) +WORKDIR /var/task + +# ----------------------------- +# Copy requirements FIRST (for Docker layer caching) +# ----------------------------- +COPY etl/hubspot/scripts/scraper/handler/requirements.txt . + +# Install dependencies into Lambda runtime +RUN pip install --no-cache-dir -r requirements.txt + + +# Copy necessary files for database and utility imports +COPY backend/ backend/ +COPY utils/ utils/ +COPY datatypes/ datatypes/ +COPY etl/hubspot etl/hubspot + +# Copy the handler +COPY etl/hubspot/scripts/scraper/main.py . + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["main.handler"] \ No newline at end of file diff --git a/etl/hubspot/scripts/scraper/handler/requirements.txt b/etl/hubspot/scripts/scraper/handler/requirements.txt new file mode 100644 index 00000000..230b460e --- /dev/null +++ b/etl/hubspot/scripts/scraper/handler/requirements.txt @@ -0,0 +1,12 @@ +pandas==2.2.2 +numpy<2.0 +requests +tqdm +openpyxl +epc-api-python==1.0.2 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 +hubspot-api-client \ No newline at end of file diff --git a/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml b/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml new file mode 100644 index 00000000..77679650 --- /dev/null +++ b/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml @@ -0,0 +1,11 @@ +version: "3.9" + +services: + hubspot-scraper: + build: + context: ../../../../../ + dockerfile: etl/hubspot/scripts/scraper/handler/Dockerfile + ports: + - "9000:8080" + env_file: + - ../../../../../.env \ No newline at end of file diff --git a/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py b/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..69580a93 --- /dev/null +++ b/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "hubspot_deal_id": "254427203793", + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/etl/hubspot/scripts/scraper/local_handler/run_local.sh b/etl/hubspot/scripts/scraper/local_handler/run_local.sh new file mode 100644 index 00000000..17474bdb --- /dev/null +++ b/etl/hubspot/scripts/scraper/local_handler/run_local.sh @@ -0,0 +1,2 @@ +docker compose build --no-cache +docker compose up --force-recreate diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py new file mode 100644 index 00000000..a51cd4a4 --- /dev/null +++ b/etl/hubspot/scripts/scraper/main.py @@ -0,0 +1,45 @@ +""" +TODO: + +1) [completed]Get hubspot deal properties from one deal +2) Put it in some class +3) [completed] Load the db and check if upsert it into the table +4) Getting working on a AWS lambda +5) [completed] subtask and tasks history +6) The new sexy deal properties, move it over +""" + +from backend.utils.subtasks import subtask_handler +from etl.hubspot.hubspotClient import HubspotClient +from etl.hubspot.hubspotDataTodB import HubspotDataToDb +from typing import Any + + +@subtask_handler() +def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: + if local is True: + body = { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "hubspot_deal_id": "254427203793", + } + + hubspot_deal_id = body.get("hubspot_deal_id", "") + + if hubspot_deal_id == "": + raise RuntimeError( + "Missing Hubspot Deal ID in SQS body request, 'hubspot_deal_id'" + ) + + hubspot = HubspotClient() + dbloader = HubspotDataToDb() + + deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) + + if deal: + dbloader.update_deal(deal, hubspot) + else: + deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id) + dbloader.upsert_hubspot_deal(deal, company, listing, hubspot) + + print("Finsihed running") From 1abc53f3e3156a5da53b00adafe6a6fd67072b2d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 14:42:11 +0000 Subject: [PATCH 26/47] removed hashlib as its from the standard library --- etl/hubspot/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt index 44a58f77..ef8e3ebc 100644 --- a/etl/hubspot/requirements.txt +++ b/etl/hubspot/requirements.txt @@ -1 +1 @@ -hubspot-api-client +hubspot-api-client \ No newline at end of file From d6f9b4879709a3867caf3a8eb466fc41d0e8f05c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 15:44:52 +0000 Subject: [PATCH 27/47] depploy hubspot etl registry --- infrastructure/terraform/shared/main.tf | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 486f79ca..5a396b3a 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -574,3 +574,34 @@ output "cdn_certificate_state_bucket" { value = module.cdn_certificate_state_bucket.bucket_name } + +################################################ +# Hubspot ETL Lambda +################################################ +module "hubspot_etl_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "hubspot-etl-bucket-terraform-state" + +} + +module "hubspot_etl_registry" { + source = "../modules/container_registry" + name = "hubspot_etl" + stage = var.stage + +} + +# S3 policy for postcode splitter to read from retrofit data bucket +module "hubspot_etl_s3_read_and_write" { + source = "../modules/s3_iam_policy" + + policy_name = "HubspotETLReadandWriteS3" + policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] + resource_paths = ["/*"] +} + +output "ordnance_s3_read_and_write_arn" { + value = module.hubspot_etl_s3_read_and_write.policy_arn +} From 764ee81dad1447be2008ecdad77283412cd0026c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 15:52:28 +0000 Subject: [PATCH 28/47] hubspot etl --- .github/workflows/deploy_terraform.yml | 42 ++++++++++++++++++++++++- infrastructure/terraform/shared/main.tf | 2 +- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index e41534e6..500b2435 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -433,4 +433,44 @@ jobs: - name: Terraform Apply if: env.TERRAFORM_APPLY == 'true' working-directory: infrastructure/terraform/cdn - run: terraform apply -auto-approve tfplan \ No newline at end of file + run: terraform apply -auto-approve tfplan + + # ============================================================ + # Build Hubspot ETL image + # ============================================================ + hubspot_etl_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile + build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # Deploy OrdanceSurvey Lambda + # ============================================================ + ordnanceSurvey_lambda: + needs: [hubspot_etl_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: hubspotETLtoAraDb + lambda_path: infrastructure/terraform/lambda/hubspot_deal_etl + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 5a396b3a..25c40b7a 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -586,7 +586,7 @@ module "hubspot_etl_bucket" { module "hubspot_etl_registry" { source = "../modules/container_registry" - name = "hubspot_etl" + name = "hubspot-etl" stage = var.stage } From 3ebe04423f10c5930d6a7cfd696613c3c3bf9eac Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 15:53:30 +0000 Subject: [PATCH 29/47] deployed --- etl/hubspot/scripts/scraper/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index a51cd4a4..f862948b 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,12 +1,10 @@ """ -TODO: - 1) [completed]Get hubspot deal properties from one deal 2) Put it in some class 3) [completed] Load the db and check if upsert it into the table -4) Getting working on a AWS lambda +4) [completed]Getting working on a AWS lambda 5) [completed] subtask and tasks history -6) The new sexy deal properties, move it over +6) [TODO]The new sexy deal properties, move it over """ from backend.utils.subtasks import subtask_handler From a249ba13748c293212362d0b6def15e6ca9e3ac0 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 17:15:31 +0000 Subject: [PATCH 30/47] got rid of tox --- .github/workflows/unit_tests.yml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f09988b0..0b0b68ea 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,7 +7,8 @@ on: jobs: - test: + test-docker: + name: Tests (Docker) runs-on: ubuntu-latest steps: @@ -18,14 +19,8 @@ jobs: run: docker build -f Dockerfile.test -t model-test . - name: Run tests - env: - EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }} - HUBSPOT_API_KEY: ${{ secrets.HUBSPOT_API_KEY }} - run: | - # docker run --rm \ - # -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ - # -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ - # model-test pytest -m 'not integration' - - make test ARGS="-m 'not integration'" + docker run --rm \ + -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ + -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ + model-test pytest -m 'not integration' From 56fe3a1be00e3fdc4d435bf9acf038d056818c2f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 30 Mar 2026 17:22:54 +0000 Subject: [PATCH 31/47] get rid of parser as it doesn't work --- backend/scripts/combine_address2uprn_outputs.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py index f065c676..105b8639 100644 --- a/backend/scripts/combine_address2uprn_outputs.py +++ b/backend/scripts/combine_address2uprn_outputs.py @@ -53,13 +53,3 @@ def main(task_id, output): print(f"Combined CSV saved to {output}") print(f"Total rows: {len(combined)}") - - -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument("task_id", help="Task ID folder in S3") -# parser.add_argument("--output", default="combined.csv") - -# args = parser.parse_args() - -# main(args.task_id, args.output) From be09749c0a23ba6abd1680170cc9547d3232b2a2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 09:02:18 +0000 Subject: [PATCH 32/47] got company updates to work --- etl/hubspot/hubspotClient.py | 9 ++++++++- etl/hubspot/hubspotDataTodB.py | 2 +- etl/hubspot/scripts/onboarding/new_organisation.py | 10 ++++++++++ etl/hubspot/scripts/scraper/main.py | 4 ++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 6fd11bed..8bbe8a63 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -25,7 +25,7 @@ from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTy ForwardPaging as AssociationsPaging, NextPage as AssociationsPagingNext, ) -from etl.hubspot.hubspotDataTodB import CompanyData +from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from backend.app.config import get_settings @@ -217,8 +217,15 @@ class HubspotClient: def get_deal_info_for_db( self, deal_id: str ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]: + deal: dict[str, str] = self.from_deal_id_get_info(deal_id) company: Optional[str] = self.from_deal_id_get_associated_company_id(deal_id) + + if company: + company_data: CompanyData = self.get_company_information(company) + dbloader: HubspotDataToDb = HubspotDataToDb() + dbloader.upsert_company(company_data) + listing: Optional[dict[str, str]] = self.from_deal_id_get_associated_listing( deal_id ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index fb88422c..58da4036 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -90,7 +90,7 @@ class HubspotDataToDb: sha256.update(chunk) return sha256.hexdigest() - def update_deal(self, deal_in_db, hubspot_client): + def update_deal(self, deal_in_db, hubspot_client) -> bool: """ Checks if a deal needs updating and syncs it with HubSpot. Also handles major_condition_issue_photos file upload to S3 with integrity check. diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py index f2ff8bda..f8c6ba7a 100644 --- a/etl/hubspot/scripts/onboarding/new_organisation.py +++ b/etl/hubspot/scripts/onboarding/new_organisation.py @@ -1,3 +1,13 @@ +""" +README.md + +This is a simple script to showcase how a new organisation can be +added to AraDb. + +This has been made reduntant due to doing this process when ever +hubspot has a webhook +""" + from etl.hubspot.hubspotClient import HubspotClient, Companies from etl.hubspot.hubspotDataTodB import HubspotDataToDb, CompanyData diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index f862948b..aa9a9502 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -29,8 +29,8 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: "Missing Hubspot Deal ID in SQS body request, 'hubspot_deal_id'" ) - hubspot = HubspotClient() - dbloader = HubspotDataToDb() + hubspot: HubspotClient = HubspotClient() + dbloader: HubspotDataToDb = HubspotDataToDb() deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) From b928689c79643fe86dda1a0870c455c93321b190 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:35:43 +0000 Subject: [PATCH 33/47] add db properly --- .../scripts/scraper/handler/Dockerfile | 10 ----- .../terraform/lambda/hubspot_deal_etl/main.tf | 44 +++++++++++++++++++ .../lambda/hubspot_deal_etl/provider.tf | 16 +++++++ .../lambda/hubspot_deal_etl/variables.tf | 37 ++++++++++++++++ infrastructure/terraform/shared/main.tf | 2 +- 5 files changed, 98 insertions(+), 11 deletions(-) create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/main.tf create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf diff --git a/etl/hubspot/scripts/scraper/handler/Dockerfile b/etl/hubspot/scripts/scraper/handler/Dockerfile index bbcc3e22..012da376 100644 --- a/etl/hubspot/scripts/scraper/handler/Dockerfile +++ b/etl/hubspot/scripts/scraper/handler/Dockerfile @@ -1,16 +1,6 @@ FROM public.ecr.aws/lambda/python:3.10 # FROM python:3.11.10-bullseye - -ARG DEV_DB_HOST -ARG DEV_DB_PORT -ARG DEV_DB_NAME - -ENV DB_HOST=${DEV_DB_HOST} -ENV DB_PORT=${DEV_DB_PORT} -ENV DB_NAME=${DEV_DB_NAME} - - # Set working directory (Lambda task root) WORKDIR /var/task diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf new file mode 100644 index 00000000..ec2b18e3 --- /dev/null +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -0,0 +1,44 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = REPLACE ME #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000) + maximum_concurrency = var.maximum_concurrency + + batch_size = var.batch_size + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + } +} + +resource "aws_iam_role_policy_attachment" "lambda_s3_policy" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.hubspot_etl_s3_read_and_write_arn +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf new file mode 100644 index 00000000..3d66f392 --- /dev/null +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = REPLACE_ME + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf new file mode 100644 index 00000000..e7646811 --- /dev/null +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf @@ -0,0 +1,37 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = null + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." +} + +variable "batch_size" { + type = number + default = 1 +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 8d645522..bc16dc70 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -670,6 +670,6 @@ module "hubspot_etl_s3_read_and_write" { resource_paths = ["/*"] } -output "ordnance_s3_read_and_write_arn" { +output "hubspot_etl_s3_read_and_write_arn" { value = module.hubspot_etl_s3_read_and_write.policy_arn } \ No newline at end of file From ecba9264485ed19ba2983973322d18cbecd59a41 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:36:19 +0000 Subject: [PATCH 34/47] added db host and name --- infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index ec2b18e3..effcada6 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -17,7 +17,7 @@ locals { } -module "lambda" { +module "hubspot_deal_etl" { source = "../../modules/lambda_with_sqs" name = REPLACE ME #"address2uprn" for example From 0f9d031944874cf9ca75005a213f5e01ea4541ec Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:38:46 +0000 Subject: [PATCH 35/47] removed subtask handler as need to do that differently --- etl/hubspot/scripts/scraper/main.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index aa9a9502..94342497 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -7,18 +7,15 @@ 6) [TODO]The new sexy deal properties, move it over """ -from backend.utils.subtasks import subtask_handler from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.hubspotDataTodB import HubspotDataToDb from typing import Any -@subtask_handler() +# @subtask_handler() TODO: Do this without subtask_handler but task_handler() that creates task_id and subtask_id def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: if local is True: body = { - "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", "hubspot_deal_id": "254427203793", } From 5d6f4b3aead6f46a1c3ea21ab41fe0c9f4509e01 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:41:03 +0000 Subject: [PATCH 36/47] added checks --- etl/hubspot/hubspotDataTodB.py | 8 ++++---- etl/hubspot/scripts/scraper/main.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 58da4036..f7f79e46 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -63,7 +63,7 @@ class HubspotDataToDb: def new_record_to_hubspot_data(self, deal_data, company, listing, hubspot_client): print("⚠️ Deprecated — use the new interface instead.") - return self.upsert_hubspot_deal(deal_data, company, listing, hubspot_client) + return self.upsert_deal(deal_data, company, listing, hubspot_client) def find_all_deals_with_company_id(self, company_id): """Returns a list of deals for a given company_id.""" @@ -90,7 +90,7 @@ class HubspotDataToDb: sha256.update(chunk) return sha256.hexdigest() - def update_deal(self, deal_in_db, hubspot_client) -> bool: + def update_deal_with_checks(self, deal_in_db, hubspot_client) -> bool: """ Checks if a deal needs updating and syncs it with HubSpot. Also handles major_condition_issue_photos file upload to S3 with integrity check. @@ -164,7 +164,7 @@ class HubspotDataToDb: print( f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot." ) - self.upsert_hubspot_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) + self.upsert_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) return False # Handle photo upload if it exists but S3 URL is missing @@ -219,7 +219,7 @@ class HubspotDataToDb: return True - def upsert_hubspot_deal(self, deal_data, company, listing, hubspot_client): + def upsert_deal(self, deal_data, company, listing, hubspot_client): """ Inserts or updates a deal record. Also uploads photos if present and adds S3 URL. diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 94342497..48864b22 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -32,9 +32,9 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) if deal: - dbloader.update_deal(deal, hubspot) + dbloader.update_deal_with_checks(deal, hubspot) else: deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id) - dbloader.upsert_hubspot_deal(deal, company, listing, hubspot) + dbloader.upsert_deal(deal, company, listing, hubspot) print("Finsihed running") From 3ae78816a599871772ca26cb94309f9532e58dd7 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:42:02 +0000 Subject: [PATCH 37/47] revmoed keys --- etl/hubspot/s3_uploader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/hubspot/s3_uploader.py b/etl/hubspot/s3_uploader.py index 0d217bd2..f5cc0ec9 100644 --- a/etl/hubspot/s3_uploader.py +++ b/etl/hubspot/s3_uploader.py @@ -13,8 +13,8 @@ class S3Uploader: def __init__( self, - aws_access_key: str = "AKIAU5A36PPNK7RXX52V", - aws_secret_key: str = "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", + aws_access_key: str, + aws_secret_key: str, region: str = "eu-west-2", ): self.aws_access_key = aws_access_key From 21fa5aad45438b3f1bcd2228308c3e1810f69e87 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:42:42 +0000 Subject: [PATCH 38/47] update policy description --- infrastructure/terraform/shared/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index bc16dc70..9d272eb6 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -664,7 +664,7 @@ module "hubspot_etl_s3_read_and_write" { source = "../modules/s3_iam_policy" policy_name = "HubspotETLReadandWriteS3" - policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket" + policy_description = "Allow hubspot_etl_lambda Lambda to read and write from retrofit-data bucket" bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] resource_paths = ["/*"] From 2ccb6ddbcf9d23e29df66b1b46c5b7d530d075de Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:45:32 +0000 Subject: [PATCH 39/47] revert back to main --- pyrightconfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrightconfig.json b/pyrightconfig.json index 18f578a5..d4e0e2a4 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -2,7 +2,7 @@ "typeCheckingMode": "strict", "venvPath": "/Users/khalimconn-kowlessar/opt/anaconda3/envs/", "venv": "Fastapi-backend", -"include": [ + "include": [ "." ] } \ No newline at end of file From 72bf64cd8e3635f4fd98424665d48ca193802982 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 10:56:00 +0000 Subject: [PATCH 40/47] verbose --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 0b0b68ea..9f7ed83b 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -23,4 +23,4 @@ jobs: docker run --rm \ -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ - model-test pytest -m 'not integration' + model-test pytest -vv -m 'not integration' From 1f66e1b17f86e20a68f8bcf73114ce49d0e6ef5e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:00:19 +0000 Subject: [PATCH 41/47] db details --- .github/workflows/unit_tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 9f7ed83b..a6673c34 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -23,4 +23,7 @@ jobs: docker run --rm \ -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ + -e DB_HOST=${{ secrets.DEV_DB_HOST }} \ + -e DB_NAME=${{ secrets.DEV_DB_NAME }} \ + -e DB_PORT=${{ secrets.DEV_DB_PORT }} \ model-test pytest -vv -m 'not integration' From ba331d44dc44d70c30cf6028c74a23e99b61f568 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:06:57 +0000 Subject: [PATCH 42/47] db details --- .github/workflows/deploy_terraform.yml | 7 ------- infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +- .../terraform/lambda/hubspot_deal_etl/provider.tf | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 1208ee7b..fe95e3d6 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -496,17 +496,10 @@ jobs: ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }} dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile build_context: . - build_args: | - DEV_DB_HOST=$DEV_DB_HOST - DEV_DB_PORT=$DEV_DB_PORT - DEV_DB_NAME=$DEV_DB_NAME secrets: AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} - DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} - DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} - DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} # ============================================================ # Deploy Hubspot ETL Lambda diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index effcada6..051c7154 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -20,7 +20,7 @@ locals { module "hubspot_deal_etl" { source = "../../modules/lambda_with_sqs" - name = REPLACE ME #"address2uprn" for example + name = "hubspot_deal_etl" stage = var.stage image_uri = local.image_uri diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf index 3d66f392..c8a3972c 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf @@ -7,7 +7,7 @@ terraform { } backend "s3" { - bucket = REPLACE_ME + bucket = "hubspot-etl-bucket-terraform-state" key = "terraform.tfstate" region = "eu-west-2" } From b991ab73f7aad1fd05d7302f8f77905a1c56e707 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:19:16 +0000 Subject: [PATCH 43/47] add postgres --- .github/workflows/deploy_terraform.yml | 3 +++ .github/workflows/unit_tests.yml | 24 ++++++++++++++++--- .../terraform/lambda/_template/variables.tf | 1 + .../lambda/hubspot_deal_etl/variables.tf | 13 ++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index fe95e3d6..cbcd88c4 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -515,6 +515,9 @@ jobs: image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }} terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} secrets: + TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} + TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} + TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a6673c34..740f88f7 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,6 +11,21 @@ jobs: name: Tests (Docker) runs-on: ubuntu-latest + services: + postgres: + image: postgres:15 + env: + POSTGRES_USER: test + POSTGRES_PASSWORD: test + POSTGRES_DB: test + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: - name: Checkout code uses: actions/checkout@v4 @@ -21,9 +36,12 @@ jobs: - name: Run tests run: | docker run --rm \ + --network host \ -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ - -e DB_HOST=${{ secrets.DEV_DB_HOST }} \ - -e DB_NAME=${{ secrets.DEV_DB_NAME }} \ - -e DB_PORT=${{ secrets.DEV_DB_PORT }} \ + -e DB_HOST=localhost \ + -e DB_NAME=test \ + -e DB_USERNAME=test \ + -e DB_PASSWORD=test \ + -e DB_PORT=5432 \ model-test pytest -vv -m 'not integration' diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/infrastructure/terraform/lambda/_template/variables.tf index e7646811..ae588840 100644 --- a/infrastructure/terraform/lambda/_template/variables.tf +++ b/infrastructure/terraform/lambda/_template/variables.tf @@ -35,3 +35,4 @@ locals { output "resolved_image_uri" { value = local.image_uri } + diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf index e7646811..2e7da609 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf @@ -35,3 +35,16 @@ locals { output "resolved_image_uri" { value = local.image_uri } + + +variable "db_host" { + type = string +} + +variable "db_name" { + type = string +} + +variable "db_port" { + type = string +} \ No newline at end of file From a946eb295921bb30bbbb31722d7c34620b6ec068 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:27:23 +0000 Subject: [PATCH 44/47] added sql model to db --- backend/export/tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/export/tests/conftest.py b/backend/export/tests/conftest.py index 10bfa971..80344c5e 100644 --- a/backend/export/tests/conftest.py +++ b/backend/export/tests/conftest.py @@ -2,6 +2,8 @@ import pytest from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from backend.app.db.base import Base +from sqlmodel import SQLModel +import backend.app.db.models.organisation # noqa: F401 — registers Organisation with SQLModel.metadata @pytest.fixture(scope="function") @@ -25,12 +27,14 @@ def engine(postgresql): # Create tables once per test session Base.metadata.create_all(engine) + SQLModel.metadata.create_all(engine) # Yeild will split this function into two phase. 1) setup and 2) teardown, the latter of which will run after all # tests have completed yield engine # Clean-up after entire test session + SQLModel.metadata.drop_all(engine) Base.metadata.drop_all(engine) engine.dispose() From f8736d3574707bb38b7c234a91faa41b35472ef8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:42:20 +0000 Subject: [PATCH 45/47] added sql model to db --- .github/workflows/unit_tests.yml | 11 +++++++++++ sfr/principal_pitch/2_export_data.py | 10 ++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 740f88f7..436428f9 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -33,6 +33,17 @@ jobs: - name: Build test image run: docker build -f Dockerfile.test -t model-test . + - name: Initialise database schema + run: | + docker run --rm \ + --network host \ + -e DB_HOST=localhost \ + -e DB_NAME=test \ + -e DB_USERNAME=test \ + -e DB_PASSWORD=test \ + -e DB_PORT=5432 \ + model-test python scripts/init_db.py + - name: Run tests run: | docker run --rm \ diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index df54749e..c89560cb 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,15 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -# PORTFOLIO_ID = 206 -# SCENARIOS = [389] -PORTFOLIO_ID = 633 -SCENARIOS = [1146] +PORTFOLIO_ID = 639 +SCENARIOS = [1157] scenario_names = { - 1146: "Most Economic", + 1157: "EPC C - no EWI solid floor", } -project_name = "WCHG EPC D rated properties" +project_name = "Instagroup Sample" def get_data(portfolio_id, scenario_ids): From c498dc19511a6289eb25cb216d0afd9342888cb8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:45:59 +0000 Subject: [PATCH 46/47] init db --- scripts/init_db.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 scripts/init_db.py diff --git a/scripts/init_db.py b/scripts/init_db.py new file mode 100644 index 00000000..69edf777 --- /dev/null +++ b/scripts/init_db.py @@ -0,0 +1,5 @@ +from sqlmodel import SQLModel +import backend.app.db.models.organisation # noqa: F401 +from backend.app.db.connection import db_engine + +SQLModel.metadata.create_all(db_engine) From bba88bc077de2746e5b854aaa3a773d268c0e2fc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 11:51:54 +0000 Subject: [PATCH 47/47] init db --- backend/app/db/models/organisation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py index a3c79e3c..e8649cdd 100644 --- a/backend/app/db/models/organisation.py +++ b/backend/app/db/models/organisation.py @@ -43,7 +43,7 @@ class HubspotDealData(SQLModel, table=True): created_at: datetime = Field( sa_column=Column( DateTime(timezone=True), - server_default=text("NOW() AT TIME ZONE 'utc'"), + server_default=text("(NOW() AT TIME ZONE 'utc')"), nullable=False, ) ) @@ -51,7 +51,7 @@ class HubspotDealData(SQLModel, table=True): updated_at: datetime = Field( sa_column=Column( DateTime(timezone=True), - server_default=text("NOW() AT TIME ZONE 'utc'"), + server_default=text("(NOW() AT TIME ZONE 'utc')"), onupdate=func.now(), nullable=False, )