From e9db66b6b423f455697193af36922a3ace130da9 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Thu, 12 Mar 2026 11:58:46 +0000
Subject: [PATCH 01/47] added hubspot dependency to backend

---
 .devcontainer/backend/Dockerfile | 3 ++-
 etl/hubspot/hubspotClient.py     | 5 +++++
 etl/hubspot/requirements.txt     | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 etl/hubspot/hubspotClient.py
 create mode 100644 etl/hubspot/requirements.txt

diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile
index 662f53b0..6a1cc120 100644
--- a/.devcontainer/backend/Dockerfile
+++ b/.devcontainer/backend/Dockerfile
@@ -35,7 +35,8 @@ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
 ADD backend/engine/requirements.txt requirements1.txt
 ADD backend/app/requirements/requirements.txt requirements2.txt
 ADD .devcontainer/backend/requirements.txt requirements3.txt
-RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
+ADD etl/hubspot/requirements.txt requirements4.txt
+RUN cat requirements1.txt requirements2.txt requirements3.txt requirements4.txt > requirements.txt
 RUN pip install -r requirements.txt
 
 # 5) Workdir
diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
new file mode 100644
index 00000000..39cea6a1
--- /dev/null
+++ b/etl/hubspot/hubspotClient.py
@@ -0,0 +1,5 @@
+import hubspot
+
+class HubspotClient():
+
+    def
\ No newline at end of file
diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt
new file mode 100644
index 00000000..105cba07
--- /dev/null
+++ b/etl/hubspot/requirements.txt
@@ -0,0 +1 @@
+hubspot
\ No newline at end of file

From 76dbde602b1ff2d5cb29d4a946411283b951b7e2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 13:27:52 +0000
Subject: [PATCH 02/47] added tests and hubspot client

---
 .devcontainer/backend/requirements.txt        |   2 +-
 backend/app/config.py                         |   2 +
 conftest.py                                   |   1 +
 etl/hubspot/hubspotClient.py                  | 442 +++++++++++++++++-
 etl/hubspot/requirements.txt                  |   2 +-
 etl/hubspot/tests/__init__.py                 |   0
 .../tests/test_hubspot_client_integration.py  | 117 +++++
 pyrightconfig.json                            |   2 +-
 pytest.ini                                    |   2 +-
 9 files changed, 563 insertions(+), 7 deletions(-)
 create mode 100644 etl/hubspot/tests/__init__.py
 create mode 100644 etl/hubspot/tests/test_hubspot_client_integration.py

diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt
index 5cd40ced..f6e1f665 100644
--- a/.devcontainer/backend/requirements.txt
+++ b/.devcontainer/backend/requirements.txt
@@ -23,4 +23,4 @@ psycopg[binary]
 pytest-postgresql
 # Formatting
 black==26.1.0
-boto3-stubs
\ No newline at end of file
+boto3-stubs
diff --git a/backend/app/config.py b/backend/app/config.py
index 6604fec9..46301e30 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -65,6 +65,8 @@ class Settings(BaseSettings):
 
     ORDNANCE_SURVEY_API_KEY: str = "changeme"
 
+    HUBSPOT_API_KEY: Optional[str] = None
+
     # Optional AWS creds (only required in local)
     AWS_ACCESS_KEY_ID: Optional[str] = None
     AWS_SECRET_KEY_ID: Optional[str] = None
diff --git a/conftest.py b/conftest.py
index d93f0023..2ea20ebb 100644
--- a/conftest.py
+++ b/conftest.py
@@ -30,6 +30,7 @@ DEFAULT_ENV = {
     "HEATING_KWH_PREDICTIONS_BUCKET": "test",
     "HOTWATER_KWH_PREDICTIONS_BUCKET": "test",
     "ENERGY_ASSESSMENTS_BUCKET": "test",
+    "HUBSPOT_API_KEY": "changeme",
 }
 
 # runs immediately when pytest starts, BEFORE collection
diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index 39cea6a1..9c1cd31e 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -1,5 +1,441 @@
-import hubspot
+import os
+from enum import Enum
+from typing import Optional, cast
 
-class HubspotClient():
+from hubspot.client import Client  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations import ApiException  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects import SimplePublicObjectInput  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.models import (  # type: ignore[reportMissingTypeStubs]
+    CollectionResponsePipelineNoPaging as PipelinesResponse,
+)
+from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects.models import SimplePublicObject as HubspotObject  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4 import AssociationSpec  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4.models import (  # type: ignore[reportMissingTypeStubs]
+    CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse,
+    MultiAssociatedObjectWithLabel as AssociationsResult,
+    ForwardPaging as AssociationsPaging,
+    NextPage as AssociationsPagingNext,
+)
 
-    def
\ No newline at end of file
+
+from backend.app.config import get_settings
+from utils.logger import setup_logger
+
+import mimetypes
+import requests
+
+
+class Companies(Enum):
+    ABRI = "237615001799"
+    SOUTHERN_HOUSING_GROUP = "109343619305"
+    LIVEWEST = "86205872354"
+    SURESERVE = "301745289413"
+    HOMEGROUP = "94946071794"
+    APPLE = "184769046716"
+    THE_GUINESS_PARTNERSHIP = "86970043613"
+
+
+class DealStage(Enum):
+    SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914"
+    SURVEYED_NO_ACCESS_NEED_SIGN_OFF = "1617223915"
+    CUSTOMER_CONTACTED = "888730834"
+    SURVEYED_COMPLETED_SIGNED_OFF = "1617223916"
+    FILES_MISSING_FROM_ASSESSOR = "1887736000"
+
+
+class Pipeline(Enum):
+    OPERATIONS_SOCIAL_HOUSING = "1167582403"
+
+
+# TODO: get Guinness working from here
+
+
+class HubspotClient:
+
+    def __init__(self):
+        """
+        Hey Tech Team, the HubSpot library doesn't ship type hints.
+        We have type-hinted things ourselves, but Pylance is never fully happy.
+        However, because the type hints were added to the best of my ability,
+        you'll still get sensible IDE suggestions.
+        """
+        settings = get_settings()
+        access_token = settings.HUBSPOT_API_KEY
+        if access_token is None:
+            raise RuntimeError("Missing HUBSPOT_API_KEY in env")
+        self.access_token: str = access_token
+        self.logger = setup_logger()
+        self.client: Client = Client.create(access_token=self.access_token)  # type: ignore[reportUnknownMemberType]
+        # [Developer Only]
+        # Add a dot in front of client and see the wonders of ide suggestions
+        # This wouldn't work if we didn't add ': Client' to self.client.
+        # Sorry - not sorry but enjoy, Past Junte 13/03/2026
+        # self.client
+
+    def get_deal_ids_from_company(self, company_id: str) -> list[str]:
+        associations_api: AssociationsBasicApi = (  # type: ignore[reportUnknownMemberType]
+            self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
+        )
+
+        deal_ids: list[str] = []
+        after: Optional[str] = None
+
+        while True:
+            response: AssociationsPageResponse = associations_api.get_page(  # type: ignore[reportUnknownMemberType]
+                object_type="companies",
+                object_id=company_id,
+                to_object_type="deals",
+                limit=100,
+                after=after,
+            )
+
+            results: list[AssociationsResult] = cast(list[AssociationsResult], response.results)  # type: ignore[reportUnknownMemberType]
+            for assoc in results:
+                assoc: AssociationsResult
+                object_id: str = cast(str, assoc.to_object_id)  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]
+                deal_ids.append(object_id)
+
+            paging: Optional[AssociationsPaging] = cast(Optional[AssociationsPaging], response.paging)  # type: ignore[reportUnknownMemberType]
+            if not paging:
+                break
+
+            paging_next: Optional[AssociationsPagingNext] = cast(Optional[AssociationsPagingNext], paging.next)  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]
+            if not paging_next:
+                break
+
+            after = cast(str, paging_next.after)  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]
+
+        return deal_ids
+
+    def from_deal_id_get_associated_company_id(self, deal_id: str) -> Optional[str]:
+        """
+        Get the associated company ID from a given deal ID.
+        Returns the associated company ID, or None if not found.
+        """
+        try:
+            associations_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
+
+            # Fetch associations for this specific deal only
+            response: AssociationsPageResponse = associations_api.get_page(  # type: ignore[reportUnknownMemberType]
+                object_type="deals",
+                object_id=deal_id,
+                to_object_type="companies",
+                limit=1,  # Expect only one associated company
+            )
+
+            results: list[AssociationsResult] = cast(list[AssociationsResult], response.results)  # type: ignore[reportUnknownMemberType]
+            if not results:
+                self.logger.info(f"No company association found for deal {deal_id}")
+                return None
+
+            first: AssociationsResult = results[0]
+            company_id: str = cast(str, first.to_object_id)  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]
+            self.logger.info(f"Associated company ID for deal {deal_id}: {company_id}")
+            return company_id
+
+        except ApiException as e:
+            self.logger.error(
+                f"Error fetching associated company for deal {deal_id}: {e}"
+            )
+            return None
+
+    def from_deal_id_get_associated_listing(
+        self, deal_id: str
+    ) -> Optional[dict[str, str]]:
+        """
+        Get the associated listing information for a given deal.
+        Returns a dictionary of listing properties, or None if not found.
+        """
+        associations_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
+        listings_api: ObjectsBasicApi = self.client.crm.objects.basic_api  # type: ignore[reportUnknownMemberType]  # works for custom objects like "listing"
+
+        # Fetch associated listing(s)
+        response: AssociationsPageResponse = associations_api.get_page(  # type: ignore[reportUnknownMemberType]
+            object_type="deals",
+            object_id=deal_id,
+            to_object_type="0-420",  # object type ID of the HubSpot listing object
+            limit=1,
+        )
+
+        results: list[AssociationsResult] = cast(list[AssociationsResult], response.results)  # type: ignore[reportUnknownMemberType]
+        if not results:
+            self.logger.info(f"No listing association found for deal {deal_id}")
+            return None
+
+        first: AssociationsResult = results[0]
+        listing_id: str = cast(str, first.to_object_id)  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]
+        self.logger.info(f"Associated listing ID for deal {deal_id}: {listing_id}")
+
+        # Fetch listing details (the "listing information")
+        listing: HubspotObject = listings_api.get_by_id(  # type: ignore[reportUnknownMemberType]
+            object_type="0-420",  # again, must match your HubSpot object name
+            object_id=listing_id,
+            properties=[
+                "national_uprn",
+                "domna_property_id",
+                "owner_property_id",
+            ],
+        )
+
+        listing_info: dict[str, str] = cast(dict[str, str], listing.properties)  # type: ignore[reportUnknownMemberType]
+        self.logger.info(f"Listing info for deal {deal_id}: {listing_info}")
+        return listing_info
+
+    def from_deal_id_get_info(self, deal_id: str) -> dict[str, str]:
+        deals_api: DealsBasicApi = self.client.crm.deals.basic_api  # type: ignore[reportUnknownMemberType]
+
+        deal: HubspotObject = deals_api.get_by_id(  # type: ignore[reportUnknownMemberType]
+            deal_id,
+            properties=[
+                "dealname",
+                "dealstage",
+                "pipeline",
+                "outcome",  # outcome,
+                "outcome_notes",  # outcome notes
+                "project_code",
+                "major_condition_issue_description",
+                "major_condition_issue_photos",
+                "coordination_status__stage_1_",  # Coordination Status (Stage 1)
+                "retrofit_design_status",  # Retrofit Design Status
+            ],
+        )
+
+        deal_info: dict[str, str] = cast(dict[str, str], deal.properties)  # type: ignore[reportUnknownMemberType]
+        return deal_info
+
+    def get_deal_info_for_db(
+        self, deal_id: str
+    ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]:
+        deal: dict[str, str] = self.from_deal_id_get_info(deal_id)
+        company: Optional[str] = self.from_deal_id_get_associated_company_id(deal_id)
+        listing: Optional[dict[str, str]] = self.from_deal_id_get_associated_listing(
+            deal_id
+        )
+
+        return deal, company, listing
+
+    def get_company_information(self, company_id: str) -> dict[str, str]:
+        companies_api: CompaniesBasicApi = self.client.crm.companies.basic_api  # type: ignore[reportUnknownMemberType]
+
+        company: HubspotObject = companies_api.get_by_id(  # type: ignore[reportUnknownMemberType]
+            company_id,
+            properties=[
+                "name",
+            ],
+        )
+
+        company_info: dict[str, str] = cast(dict[str, str], company.properties)  # type: ignore[reportUnknownMemberType]
+        return company_info
+
+    def get_all_pipelines(self) -> list[dict[str, str]]:
+        """
+        Retrieve all pipelines for deals, returning a list of dicts with pipeline names and IDs.
+        """
+        try:
+            pipelines_api: PipelinesApi = self.client.crm.pipelines.pipelines_api  # type: ignore[reportUnknownMemberType]
+            response: PipelinesResponse = pipelines_api.get_all(object_type="deals")  # type: ignore[reportUnknownMemberType]
+
+            results: list[HubspotPipeline] = cast(list[HubspotPipeline], response.results)  # type: ignore[reportUnknownMemberType]
+            pipelines: list[dict[str, str]] = []
+            for pipeline in results:
+                pipeline: HubspotPipeline
+                pipelines.append(
+                    {
+                        "name": cast(str, pipeline.label),  # type: ignore[reportUnknownMemberType]
+                        "id": cast(str, pipeline.id),  # type: ignore[reportUnknownMemberType]
+                    }
+                )
+
+            self.logger.info(f"Retrieved {len(pipelines)} pipelines.")
+            return pipelines
+
+        except Exception as e:
+            self.logger.error(f"Error retrieving pipelines: {e}")
+            return []
+
+    def get_deal_stages_from_pipeline_id(
+        self, pipeline_id: Optional[str] = None
+    ) -> list[dict[str, str]]:
+        """
+        Retrieve all deal stages for a given pipeline.
+        If no pipeline_id is provided, retrieves all stages for all pipelines.
+        Returns a list of dicts with pipeline name, stage name, and stage ID.
+        """
+        try:
+            pipelines_api: PipelinesApi = self.client.crm.pipelines.pipelines_api  # type: ignore[reportUnknownMemberType]
+            response: PipelinesResponse = pipelines_api.get_all(object_type="deals")  # type: ignore[reportUnknownMemberType]
+
+            all_stages: list[dict[str, str]] = []
+
+            for pipeline in cast(list[HubspotPipeline], response.results):  # type: ignore[reportUnknownMemberType]
+                pipeline: HubspotPipeline
+                # Skip other pipelines if a specific one is requested
+                pipeline_id_str: str = cast(str, pipeline.id)  # type: ignore[reportUnknownMemberType]
+                if pipeline_id and pipeline_id_str != str(pipeline_id):
+                    continue
+
+                for stage in cast(list[HubspotPipelineStage], pipeline.stages):  # type: ignore[reportUnknownMemberType]
+                    stage: HubspotPipelineStage
+                    all_stages.append(
+                        {
+                            "pipeline_name": cast(str, pipeline.label),  # type: ignore[reportUnknownMemberType]
+                            "pipeline_id": pipeline_id_str,
+                            "stage_name": cast(str, stage.label),  # type: ignore[reportUnknownMemberType]
+                            "stage_id": cast(str, stage.id),  # type: ignore[reportUnknownMemberType]
+                        }
+                    )
+
+            if not all_stages:
+                self.logger.info(
+                    f"No deal stages found for pipeline {pipeline_id if pipeline_id else 'ALL'}"
+                )
+            else:
+                self.logger.info(f"Retrieved {len(all_stages)} deal stages.")
+
+            return all_stages
+
+        except Exception as e:
+            self.logger.error(f"Error retrieving deal stages: {e}")
+            return []
+
+    def download_file_from_url(
+        self, download_url: str, save_path: Optional[str] = None
+    ) -> str:
+        """
+        Download a file from a HubSpot file URL (public or private), keeping its original file type.
+        """
+
+        try:
+            headers: dict[str, str] = {}
+            if "hubspotusercontent" not in download_url:
+                headers["Authorization"] = f"Bearer {self.access_token}"
+
+            self.logger.info(f"Downloading HubSpot file: {download_url}")
+            response = requests.get(
+                download_url, headers=headers, stream=True, allow_redirects=True
+            )
+            response.raise_for_status()
+
+            # Try to infer filename from Content-Disposition header
+            content_disposition = response.headers.get("content-disposition")
+            if content_disposition and "filename=" in content_disposition:
+                filename = content_disposition.split("filename=")[1].strip('"')
+            else:
+                # fallback: extract from URL or content-type
+                filename = (
+                    os.path.basename(download_url.split("?")[0]) or "hubspot_download"
+                )
+                if "." not in filename:
+                    content_type = response.headers.get("content-type")
+                    ext = (
+                        mimetypes.guess_extension(content_type.split(";")[0])
+                        if content_type
+                        else None
+                    )
+                    if ext:
+                        filename += ext
+
+            # Make sure save_path is valid
+            if save_path is None:
+                save_path = os.path.abspath(filename)
+            elif os.path.isdir(save_path):
+                save_path = os.path.join(save_path, filename)
+            else:
+                # if user passes a file path directly, leave it
+                save_path = os.path.abspath(save_path)
+
+            with open(save_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            self.logger.info(f"File downloaded successfully → {save_path}")
+            return save_path
+
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Failed to download file from HubSpot: {e}")
+            raise
+
+    def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str:
+        products_api: ProductsBasicApi = self.client.crm.products.basic_api  # type: ignore[reportUnknownMemberType]
+
+        # Fetch product mapping
+        product: HubspotObject = products_api.get_by_id(  # type: ignore[reportUnknownMemberType]
+            product_id, properties=["name", "price", "hs_price"]
+        )
+        product_properties: dict[str, str] = cast(dict[str, str], product.properties)  # type: ignore[reportUnknownMemberType]
+
+        name: Optional[str] = product_properties.get("name")
+        price: str = product_properties.get("price") or product_properties.get("hs_price") or "0"
+
+        # Build line item payload
+        line_item_input = SimplePublicObjectInput(
+            properties={
+                "hs_product_id": product_id,
+                "name": name,
+                "quantity": str(quantity),
+                "price": price,
+                "amount": str(float(price) * quantity),
+                "invoiced": "Outstanding",
+            }
+        )
+
+        line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
+
+        # Create line item
+        line_item: HubspotObject = line_items_api.create(line_item_input)  # type: ignore[reportUnknownMemberType]
+        return cast(str, line_item.id)  # type: ignore[reportUnknownMemberType]
+
+    def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None:
+        self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}")
+
+        association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
+
+        association_api.create(  # type: ignore[reportUnknownMemberType]
+            "0-3",  # from object type (deals)
+            deal_id,  # from object id
+            "line_items",  # to object type
+            line_item_id,  # to object id
+            [
+                AssociationSpec(
+                    association_category="HUBSPOT_DEFINED",
+                    association_type_id=19,  # deal → line_item
+                )
+            ],
+        )
+
+    def add_product_line_item_to_deal(
+        self, deal_id: str, product_id: str, quantity: int = 1
+    ) -> str:
+        # Step 1: Create the line item from product mapping
+        line_item_id: str = self.create_line_item_from_product(product_id, quantity)
+
+        # Step 2: Associate the created line item to the deal
+        self.associate_line_item_to_deal(line_item_id, deal_id)
+
+        return line_item_id
+
+    def delete_line_item(self, line_item_id: str) -> bool:
+        """
+        Delete (archive) a line item in HubSpot by its ID.
+        """
+        try:
+            self.logger.info(f"Deleting line item {line_item_id}...")
+
+            line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
+            line_items_api.archive(line_item_id)  # type: ignore[reportUnknownMemberType]
+
+            self.logger.info(f"Line item {line_item_id} deleted successfully.")
+            return True
+
+        except ApiException as e:
+            self.logger.error(f"Failed to delete line item {line_item_id}: {e}")
+            return False
diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt
index 105cba07..ef8e3ebc 100644
--- a/etl/hubspot/requirements.txt
+++ b/etl/hubspot/requirements.txt
@@ -1 +1 @@
-hubspot
\ No newline at end of file
+hubspot-api-client
\ No newline at end of file
diff --git a/etl/hubspot/tests/__init__.py b/etl/hubspot/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py
new file mode 100644
index 00000000..d7cf46fd
--- /dev/null
+++ b/etl/hubspot/tests/test_hubspot_client_integration.py
@@ -0,0 +1,117 @@
+import os
+from typing import Optional
+
+import pytest
+from etl.hubspot.hubspotClient import HubspotClient, Companies, Pipeline, DealStage
+
+
+class TestHubspotClientIntegration:
+    """Integration tests using real HubSpot API calls."""
+
+    @pytest.fixture
+    def client(self):
+        """Initialize HubSpot client with env variables."""
+        return HubspotClient()
+
+    def test_client_initialization(self, client: HubspotClient):
+        """Test that client initializes successfully with API key."""
+        assert client.access_token is not None
+        assert client.client is not None
+        assert client.logger is not None
+
+    def test_get_deal_ids_from_company(self, client: HubspotClient):
+        """Test getting deal IDs from Apple company includes expected deal."""
+        company_id: str = Companies.APPLE.value
+
+        deal_ids: list[str] = client.get_deal_ids_from_company(company_id)
+
+        # https://app-eu1.hubspot.com/contacts/145275138/record/0-3/263490768079
+        assert "263490768079" in deal_ids
+
+    def test_get_company_id_from_deal_id(self, client: HubspotClient):
+        deal_id: str = "263490768079"
+
+        company_id: Optional[str] = client.from_deal_id_get_associated_company_id(
+            deal_id
+        )
+        # https://app-eu1.hubspot.com/contacts/145275138/record/0-3/263490768079
+        assert company_id == Companies.APPLE.value
+
+    def test_from_deal_id_get_associated_listing(self, client: HubspotClient):
+        deal_id: str = "263490768079"
+
+        listing_info: Optional[dict[str, str]] = (
+            client.from_deal_id_get_associated_listing(deal_id)
+        )
+
+        assert listing_info is not None
+        assert "hs_object_id" in listing_info
+        assert "national_uprn" in listing_info
+        assert "owner_property_id" in listing_info
+        assert "domna_property_id" in listing_info
+
+    def test_from_deal_id_get_info(self, client: HubspotClient):
+        deal_id: str = "263490768079"
+
+        deal_info: dict[str, str] = client.from_deal_id_get_info(deal_id)
+
+        assert "dealname" in deal_info
+        assert "dealstage" in deal_info
+        assert "pipeline" in deal_info
+        assert "outcome" in deal_info  # outcome
+        assert "outcome_notes" in deal_info  # outcome notes
+        assert "project_code" in deal_info
+        assert "major_condition_issue_description" in deal_info
+        assert "major_condition_issue_photos" in deal_info
+        assert (
+            "coordination_status__stage_1_" in deal_info
+        )  # Coordination Status (Stage 1)
+        assert "retrofit_design_status" in deal_info  # Retrofit Design Status
+
+    def test_get_deal_info_for_db(self, client: HubspotClient):
+        deal_id: str = "263490768079"
+
+        deal, company, listing = client.get_deal_info_for_db(deal_id)
+
+        assert "dealname" in deal
+        assert "dealstage" in deal
+        assert "pipeline" in deal
+
+        assert company == Companies.APPLE.value
+
+        assert listing is None or "hs_object_id" in listing
+
+    def test_get_company_information(self, client: HubspotClient):
+        company_id: str = Companies.APPLE.value
+
+        company_info: dict[str, str] = client.get_company_information(company_id)
+
+        assert "name" in company_info
+        assert company_info["name"].lower() == "Apple".lower()
+
+    def test_get_all_pipelines(self, client: HubspotClient):
+        pipelines: list[dict[str, str]] = client.get_all_pipelines()
+
+        assert len(pipelines) > 0
+        pipeline_ids: list[str] = [p["id"] for p in pipelines]
+        assert Pipeline.OPERATIONS_SOCIAL_HOUSING.value in pipeline_ids
+
+    def test_get_deal_stages_from_pipeline_id(self, client: HubspotClient):
+        stages: list[dict[str, str]] = client.get_deal_stages_from_pipeline_id(
+            Pipeline.OPERATIONS_SOCIAL_HOUSING.value
+        )
+
+        assert len(stages) > 0
+        stage_ids: list[str] = [s["stage_id"] for s in stages]
+        assert DealStage.SURVEYED_COMPLETE_NEEDS_SIGN_OFF.value in stage_ids
+
+    def test_download_file_from_url(
+        self, client: HubspotClient, tmp_path: Optional[str]
+    ):
+        deal_info: dict[str, str] = client.from_deal_id_get_info("254427203793")
+        download_url: str = deal_info["major_condition_issue_photos"]
+
+        save_path: str = client.download_file_from_url(download_url, str(tmp_path))
+
+        assert os.path.exists(save_path)
+        assert os.path.getsize(save_path) > 0
diff --git a/pyrightconfig.json b/pyrightconfig.json
index d4e0e2a4..18f578a5 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -2,7 +2,7 @@
   "typeCheckingMode": "strict",
   "venvPath": "/Users/khalimconn-kowlessar/opt/anaconda3/envs/",
   "venv": "Fastapi-backend",
-  "include": [
+"include": [
     "."
   ]
 }
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index 608d5e0c..c9dd8ca8 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -3,4 +3,4 @@ pythonpath = .
 log_cli = true
 log_cli_level = INFO
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
-testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests
+testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests

From 3970d70518f1432bc68f2af2532eec63308e5ff4 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 14:36:53 +0000
Subject: [PATCH 03/47] its now perfect

---
 etl/hubspot/hubspotClient.py | 64 ++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index 9c1cd31e..b41d71f8 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -2,24 +2,22 @@ import os
 from enum import Enum
 from typing import Optional, cast
 
-from hubspot.client import Client  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.associations import ApiException  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.objects import SimplePublicObjectInput  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.pipelines.models import (  # type: ignore[reportMissingTypeStubs]
+from hubspot.client import Client
+from hubspot.crm.associations import ApiException
+from hubspot.crm.objects import SimplePublicObjectInput
+from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi
+from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi
+from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi
+from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi
+from hubspot.crm.pipelines.models import (
     CollectionResponsePipelineNoPaging as PipelinesResponse,
 )
-from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.objects.models import SimplePublicObject as HubspotObject  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.associations.v4 import AssociationSpec  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi  # type: ignore[reportMissingTypeStubs]
-from hubspot.crm.associations.v4.models import (  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline
+from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage
+from hubspot.crm.objects.models import SimplePublicObject as HubspotObject
+from hubspot.crm.associations.v4 import AssociationSpec
+from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi
+from hubspot.crm.associations.v4.models import (
     CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse,
     MultiAssociatedObjectWithLabel as AssociationsResult,
     ForwardPaging as AssociationsPaging,
@@ -364,17 +362,16 @@ class HubspotClient:
             self.logger.error(f"Failed to download file from HubSpot: {e}")
             raise
 
-    def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str:
-        products_api: ProductsBasicApi = self.client.crm.products.basic_api  # type: ignore[reportUnknownMemberType]
-
+    def create_line_item_from_product(self, product_id: str, quantity: int = 1):
         # Fetch product mapping
-        product: HubspotObject = products_api.get_by_id(  # type: ignore[reportUnknownMemberType]
+        product = self.client.crm.products.basic_api.get_by_id(
             product_id, properties=["name", "price", "hs_price"]
         )
-        product_properties: dict[str, str] = cast(dict[str, str], product.properties)  # type: ignore[reportUnknownMemberType]
 
-        name: Optional[str] = product_properties.get("name")
-        price: str = product_properties.get("price") or product_properties.get("hs_price") or "0"
+        name = product.properties.get("name")
+        price = (
+            product.properties.get("price") or product.properties.get("hs_price") or "0"
+        )
 
         # Build line item payload
         line_item_input = SimplePublicObjectInput(
@@ -388,18 +385,16 @@ class HubspotClient:
             }
         )
 
-        line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
-
         # Create line item
-        line_item: HubspotObject = line_items_api.create(line_item_input)  # type: ignore[reportUnknownMemberType]
-        return cast(str, line_item.id)  # type: ignore[reportUnknownMemberType]
+        line_item = self.client.crm.line_items.basic_api.create(line_item_input)
+        return line_item.id
 
-    def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None:
+    def associate_line_item_to_deal(self, line_item_id: str, deal_id: str):
         self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}")
 
-        association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
+        association_api = self.client.crm.associations.v4.basic_api
 
-        association_api.create(  # type: ignore[reportUnknownMemberType]
+        association_api.create(
             "0-3",  # to object type
             deal_id,  # to object id
             "line_items",  # from object type
@@ -414,24 +409,23 @@ class HubspotClient:
 
     def add_product_line_item_to_deal(
         self, deal_id: str, product_id: str, quantity: int = 1
-    ) -> str:
+    ):
         # Step 1: Create the line item from product mapping
-        line_item_id: str = self.create_line_item_from_product(product_id, quantity)
+        line_item_id = self.create_line_item_from_product(product_id, quantity)
 
         # Step 2: Associate the created line item to the deal
         self.associate_line_item_to_deal(line_item_id, deal_id)
 
         return line_item_id
 
-    def delete_line_item(self, line_item_id: str) -> bool:
+    def delete_line_item(self, line_item_id: str):
         """
         Delete (archive) a line item in HubSpot by its ID.
         """
         try:
             self.logger.info(f"Deleting line item {line_item_id}...")
 
-            line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
-            line_items_api.archive(line_item_id)  # type: ignore[reportUnknownMemberType]
+            self.client.crm.line_items.basic_api.archive(line_item_id)
 
             self.logger.info(f"Line item {line_item_id} deleted successfully.")
             return True

From cca72928d91ebb03d4b5fc5aa92715f264221c5b Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 14:38:10 +0000
Subject: [PATCH 04/47] its now perfect

---
 etl/hubspot/hubspotClient.py | 60 +++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index b41d71f8..1946bcdf 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -2,22 +2,22 @@ import os
 from enum import Enum
 from typing import Optional, cast
 
-from hubspot.client import Client
-from hubspot.crm.associations import ApiException
-from hubspot.crm.objects import SimplePublicObjectInput
-from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi
-from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi
-from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi
-from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi
-from hubspot.crm.pipelines.models import (
+from hubspot.client import Client  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations import ApiException  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects import SimplePublicObjectInput  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.models import (  # type: ignore[reportMissingTypeStubs]
     CollectionResponsePipelineNoPaging as PipelinesResponse,
 )
-from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline
-from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage
-from hubspot.crm.objects.models import SimplePublicObject as HubspotObject
-from hubspot.crm.associations.v4 import AssociationSpec
-from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi
-from hubspot.crm.associations.v4.models import (
+from hubspot.crm.pipelines.models import Pipeline as HubspotPipeline  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.pipelines.models import PipelineStage as HubspotPipelineStage  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.objects.models import SimplePublicObject as HubspotObject  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4 import AssociationSpec  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4.api.basic_api import BasicApi as AssociationsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.associations.v4.models import (  # type: ignore[reportMissingTypeStubs]
     CollectionResponseMultiAssociatedObjectWithLabelForwardPaging as AssociationsPageResponse,
     MultiAssociatedObjectWithLabel as AssociationsResult,
     ForwardPaging as AssociationsPaging,
@@ -362,15 +362,17 @@ class HubspotClient:
             self.logger.error(f"Failed to download file from HubSpot: {e}")
             raise
 
-    def create_line_item_from_product(self, product_id: str, quantity: int = 1):
+    def create_line_item_from_product(self, product_id: str, quantity: int = 1) -> str:
         # Fetch product mapping
-        product = self.client.crm.products.basic_api.get_by_id(
+        products_api: ProductsBasicApi = self.client.crm.products.basic_api  # type: ignore[reportUnknownMemberType]
+        product: HubspotObject = products_api.get_by_id(  # type: ignore[reportUnknownMemberType]
             product_id, properties=["name", "price", "hs_price"]
         )
+        properties: dict[str, str] = cast(dict[str, str], product.properties)  # type: ignore[reportUnknownMemberType]
 
-        name = product.properties.get("name")
-        price = (
-            product.properties.get("price") or product.properties.get("hs_price") or "0"
+        name: str = properties.get("name") or ""
+        price: str = (
+            properties.get("price") or properties.get("hs_price") or "0"
         )
 
         # Build line item payload
@@ -386,15 +388,16 @@ class HubspotClient:
         )
 
         # Create line item
-        line_item = self.client.crm.line_items.basic_api.create(line_item_input)
-        return line_item.id
+        line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
+        line_item: HubspotObject = line_items_api.create(line_item_input)  # type: ignore[reportUnknownMemberType]
+        return cast(str, line_item.id)  # type: ignore[reportUnknownMemberType]
 
-    def associate_line_item_to_deal(self, line_item_id: str, deal_id: str):
+    def associate_line_item_to_deal(self, line_item_id: str, deal_id: str) -> None:
         self.logger.info(f"Associating line item {line_item_id} → deal {deal_id}")
 
-        association_api = self.client.crm.associations.v4.basic_api
+        association_api: AssociationsBasicApi = self.client.crm.associations.v4.basic_api  # type: ignore[reportUnknownMemberType]
 
-        association_api.create(
+        association_api.create(  # type: ignore[reportUnknownMemberType]
             "0-3",  # to object type
             deal_id,  # to object id
             "line_items",  # from object type
@@ -409,23 +412,24 @@ class HubspotClient:
 
     def add_product_line_item_to_deal(
         self, deal_id: str, product_id: str, quantity: int = 1
-    ):
+    ) -> str:
         # Step 1: Create the line item from product mapping
-        line_item_id = self.create_line_item_from_product(product_id, quantity)
+        line_item_id: str = self.create_line_item_from_product(product_id, quantity)
 
         # Step 2: Associate the created line item to the deal
         self.associate_line_item_to_deal(line_item_id, deal_id)
 
         return line_item_id
 
-    def delete_line_item(self, line_item_id: str):
+    def delete_line_item(self, line_item_id: str) -> bool:
         """
         Delete (archive) a line item in HubSpot by its ID.
         """
         try:
             self.logger.info(f"Deleting line item {line_item_id}...")
 
-            self.client.crm.line_items.basic_api.archive(line_item_id)
+            line_items_api: LineItemsBasicApi = self.client.crm.line_items.basic_api  # type: ignore[reportUnknownMemberType]
+            line_items_api.archive(line_item_id)  # type: ignore[reportUnknownMemberType]
 
             self.logger.info(f"Line item {line_item_id} deleted successfully.")
             return True

From 2349eba89e7239f7768d021171480c6e06e1cdfd Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 14:38:32 +0000
Subject: [PATCH 05/47] its now perfect

---
 etl/hubspot/hubspotClient.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index 1946bcdf..f93a736c 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -8,6 +8,8 @@ from hubspot.crm.objects import SimplePublicObjectInput  # type: ignore[reportMi
 from hubspot.crm.objects.api.basic_api import BasicApi as ObjectsBasicApi  # type: ignore[reportMissingTypeStubs]
 from hubspot.crm.deals.api.basic_api import BasicApi as DealsBasicApi  # type: ignore[reportMissingTypeStubs]
 from hubspot.crm.companies.api.basic_api import BasicApi as CompaniesBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.products.api.basic_api import BasicApi as ProductsBasicApi  # type: ignore[reportMissingTypeStubs]
+from hubspot.crm.line_items.api.basic_api import BasicApi as LineItemsBasicApi  # type: ignore[reportMissingTypeStubs]
 from hubspot.crm.pipelines.api.pipelines_api import PipelinesApi  # type: ignore[reportMissingTypeStubs]
 from hubspot.crm.pipelines.models import (  # type: ignore[reportMissingTypeStubs]
     CollectionResponsePipelineNoPaging as PipelinesResponse,

From f8187634058d05fa7f2961b9980f2ec720911b84 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 14:43:55 +0000
Subject: [PATCH 06/47] make tests work

---
 test.requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test.requirements.txt b/test.requirements.txt
index d8b8b777..4bd89caa 100644
--- a/test.requirements.txt
+++ b/test.requirements.txt
@@ -4,4 +4,5 @@ pytest-cov
 pytest-mock
 dotenv
 psycopg[binary]
-pytest-postgresql
\ No newline at end of file
+pytest-postgresql
+hubspot-api-client
\ No newline at end of file

From 6e8f29afc8dbd385c9e526f70b43cb4ec9613b04 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 14:52:41 +0000
Subject: [PATCH 07/47] added to rerun

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9268ba25..b470e12c 100644
--- a/README.md
+++ b/README.md
@@ -39,3 +39,4 @@ pytest --cov-config=model_data/.coveragerc --cov=model_data
 
 This will produce the test results and coverage reports
 
+

From 8294a80fdfd05be346ffcef8d38331ba5744b2a4 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:12:43 +0000
Subject: [PATCH 08/47] change the way the tests are run as I don't like
 makefile

---
 .github/workflows/unit_tests.yml | 18 ++++++++----------
 Makefile                         | 30 ------------------------------
 2 files changed, 8 insertions(+), 40 deletions(-)
 delete mode 100644 Makefile

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index cc6431b8..91ca7e26 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -14,17 +14,15 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Set up Python 3.11
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.11'
+      - name: Build test image
+        run: docker build -f Dockerfile.test -t model-test .
 
-      - name: Install tox via Makefile
-        run: |
-          make setup
-
-      - name: Run tests with tox via Makefile
+      - name: Run tests
         env:
           EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
+          
         run: |
-          make test
\ No newline at end of file
+          docker run --rm \
+            -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
+            -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
+            model-test pytest
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 00942acd..00000000
--- a/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# Project Makefile
-
-PYTHON = python
-
-.PHONY: setup test lint typecheck check clean
-
-# Install dev dependencies + tox
-setup:
-	$(PYTHON) -m pip install --upgrade pip
-	$(PYTHON) -m pip install tox black ruff mypy
-
-# Run tests (pass ARGS="..." for specific tests)
-test:
-	tox -- $(ARGS)
-
-# Code formatting check + linting
-lint:
-	ruff .
-	black --check .
-
-# Static type checks
-typecheck:
-	mypy .
-
-# Full quality check (all checks + tests)
-check: lint typecheck test
-
-# Clean up tox environments
-clean:
-	rm -rf .tox

From 81d84368cfd88232239ec3c92d8e77e6fc5d8417 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:15:14 +0000
Subject: [PATCH 09/47] we are going to use docker instead

---
 Dockerfile.test | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 Dockerfile.test

diff --git a/Dockerfile.test b/Dockerfile.test
new file mode 100644
index 00000000..d566c435
--- /dev/null
+++ b/Dockerfile.test
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+# Install PostgreSQL binaries — required by pytest-postgresql to spawn ephemeral test databases
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends postgresql \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy requirements first so Docker can cache the install layer
+COPY backend/engine/requirements.txt backend/engine/requirements.txt
+COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt
+COPY test.requirements.txt test.requirements.txt
+
+RUN pip install --no-cache-dir \
+    -r backend/engine/requirements.txt \
+    -r backend/app/requirements/requirements.txt \
+    -r test.requirements.txt
+
+# Copy source
+COPY . .
+
+CMD ["pytest"]

From 7fb8ee9202fc3c739942b96269a657e519a22d13 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:20:15 +0000
Subject: [PATCH 10/47]  re run

---
 Dockerfile.test | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.test b/Dockerfile.test
index d566c435..debbfa8b 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -6,6 +6,7 @@ RUN apt-get update \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
+ENV PYTHONPATH=/app
 
 # Copy requirements first so Docker can cache the install layer
 COPY backend/engine/requirements.txt backend/engine/requirements.txt

From 6f6aa62efee423692dcf6eb332a636c4d7bc6bff Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:26:44 +0000
Subject: [PATCH 11/47] add more requirements

---
 Dockerfile.test       | 2 ++
 test.requirements.txt | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.test b/Dockerfile.test
index debbfa8b..6091aa50 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -11,11 +11,13 @@ ENV PYTHONPATH=/app
 # Copy requirements first so Docker can cache the install layer
 COPY backend/engine/requirements.txt backend/engine/requirements.txt
 COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt
+COPY asset_list/requirements.txt asset_list/requirements.txt
 COPY test.requirements.txt test.requirements.txt
 
 RUN pip install --no-cache-dir \
     -r backend/engine/requirements.txt \
     -r backend/app/requirements/requirements.txt \
+    -r asset_list/requirements.txt \
     -r test.requirements.txt
 
 # Copy source
diff --git a/test.requirements.txt b/test.requirements.txt
index 4bd89caa..936e2f7d 100644
--- a/test.requirements.txt
+++ b/test.requirements.txt
@@ -5,4 +5,5 @@ pytest-mock
 dotenv
 psycopg[binary]
 pytest-postgresql
-hubspot-api-client
\ No newline at end of file
+hubspot-api-client
+fuzzywuzzy
\ No newline at end of file

From 27f17563d46ecf05a901e092d74b5d6654706179 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:41:43 +0000
Subject: [PATCH 12/47] pytest ini

---
 .github/workflows/unit_tests.yml |  1 +
 Dockerfile.test                  |  2 --
 Dockerfile.test.dockerignore     | 11 +++++++++++
 3 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile.test.dockerignore

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 91ca7e26..116bc265 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -20,6 +20,7 @@ jobs:
       - name: Run tests
         env:
           EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
+          HUBSPOT_API_KEY: ${{ secrets.HUBSPOT_API_KEY }}
           
         run: |
           docker run --rm \
diff --git a/Dockerfile.test b/Dockerfile.test
index 6091aa50..debbfa8b 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -11,13 +11,11 @@ ENV PYTHONPATH=/app
 # Copy requirements first so Docker can cache the install layer
 COPY backend/engine/requirements.txt backend/engine/requirements.txt
 COPY backend/app/requirements/requirements.txt backend/app/requirements/requirements.txt
-COPY asset_list/requirements.txt asset_list/requirements.txt
 COPY test.requirements.txt test.requirements.txt
 
 RUN pip install --no-cache-dir \
     -r backend/engine/requirements.txt \
     -r backend/app/requirements/requirements.txt \
-    -r asset_list/requirements.txt \
     -r test.requirements.txt
 
 # Copy source
diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore
new file mode 100644
index 00000000..8a846047
--- /dev/null
+++ b/Dockerfile.test.dockerignore
@@ -0,0 +1,11 @@
+# We need this file otherwise it'll use .dockerignore
+# Exclude large/irrelevant directories that are not needed for testing
+model_data/local_data/
+backend/node_modules/
+backend/.idea/
+infrastructure/
+data_collection/
+node_modules/
+conservation_areas/
+open_uprn/
+land_registry/

From 08478b17fb838584cc3a63641700da6586d3cfa5 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:44:16 +0000
Subject: [PATCH 13/47] run tests

---
 Dockerfile.test.dockerignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.test.dockerignore b/Dockerfile.test.dockerignore
index 8a846047..4f79c6ee 100644
--- a/Dockerfile.test.dockerignore
+++ b/Dockerfile.test.dockerignore
@@ -3,6 +3,7 @@
 model_data/local_data/
 backend/node_modules/
 backend/.idea/
+backend/.env
 infrastructure/
 data_collection/
 node_modules/

From ad189b4cacf56f2944b3a519cec4fff17b27c7fc Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Fri, 13 Mar 2026 15:56:13 +0000
Subject: [PATCH 14/47] postgres can't be run as root

---
 Dockerfile.test | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Dockerfile.test b/Dockerfile.test
index debbfa8b..802eb3a4 100644
--- a/Dockerfile.test
+++ b/Dockerfile.test
@@ -21,4 +21,8 @@ RUN pip install --no-cache-dir \
 # Copy source
 COPY . .
 
+# pg_ctl refuses to run as root — create an unprivileged user
+RUN useradd -m testuser && chown -R testuser /app
+USER testuser
+
 CMD ["pytest"]

From 1b53b47048500ef30142714c13211f5f740f43a1 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 12:37:50 +0000
Subject: [PATCH 15/47] add this in a sensible branch

---
 backend/address2UPRN/README.md | 14 ++++++++------
 backend/address2UPRN/main.py   |  9 ++++++---
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md
index 6d26f281..e34e45f6 100644
--- a/backend/address2UPRN/README.md
+++ b/backend/address2UPRN/README.md
@@ -5,10 +5,11 @@ Before you run:
 
 Step 1) Get the list and ensure the following columns exists
 
+I believe lower and upper case matter:
 * Address 1
 * Address 2
 * Address 3
-* postcode
+* Postcode
 
 And save it as a .csv file
 
@@ -23,16 +24,17 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal
 
 Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key
 
-task_id = a7b70a02-4df4-45b5-a50b-196e095910bb
-sub_task_id = 567cf73b-1210-4909-9ecc-36ae7e23420e
+task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 
+sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f
+s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv
 
 Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling
 postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev
 
 {
-    "task_id": "a7b70a02-4df4-45b5-a50b-196e095910bb",
-    "sub_task_id": "567cf73b-1210-4909-9ecc-36ae7e23420e",
-    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico Homes Full list EPC Properties(Sheet2) (1) (1).csv"
+    "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+    "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv"
 }
 Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
 
diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py
index d0ba36e6..c458e40d 100644
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@@ -351,9 +351,9 @@ def handler(event, context, local=False):
                 {
                     "body": json.dumps(
                         {
-                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
-                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
-                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
+                            "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
+                            "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
+                            "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv",
                         }
                     )
                 }
@@ -441,6 +441,9 @@ def handler(event, context, local=False):
             # Process the rows
             logger.info(f"Processing {len(df)} rows for task {task_id}")
 
+            df["postcode_clean"] = (
+                df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "")
+            )
             clean_df = df.dropna(subset=["postcode_clean"])
 
             postcode_to_addresses = {

From d3e9fd41e683001f360e042c14c08168b63bc720 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 12:40:44 +0000
Subject: [PATCH 16/47] fixed address 2 uprn now uses POSTCODE

---
 etl/hubspot/tests/test_hubspot_client_integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py
index d7cf46fd..a3d8ae54 100644
--- a/etl/hubspot/tests/test_hubspot_client_integration.py
+++ b/etl/hubspot/tests/test_hubspot_client_integration.py
@@ -14,7 +14,7 @@ class TestHubspotClientIntegration:
         return HubspotClient()
 
     def test_client_initialization(self, client: HubspotClient):
-        """Test that client initializes successfully with API key."""
+        """Checks initialisation of HubspotClient and fails early if env variables is not set"""
         assert client.access_token is not None
         assert client.client is not None
         assert client.logger is not None

From 547f50550bf4cba3493e1bdfa94579d9323cb3f5 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 12:58:35 +0000
Subject: [PATCH 17/47] re-added per Khalim's request

---
 Makefile | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..00942acd
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,30 @@
+# Project Makefile
+
+PYTHON = python
+
+.PHONY: setup test lint typecheck check clean
+
+# Install dev dependencies + tox
+setup:
+	$(PYTHON) -m pip install --upgrade pip
+	$(PYTHON) -m pip install tox black ruff mypy
+
+# Run tests (pass ARGS="..." for specific tests)
+test:
+	tox -- $(ARGS)
+
+# Code formatting check + linting
+lint:
+	ruff .
+	black --check .
+
+# Static type checks
+typecheck:
+	mypy .
+
+# Full quality check (all checks + tests)
+check: lint typecheck test
+
+# Clean up tox environments
+clean:
+	rm -rf .tox

From 6bfeeeb1b180e50247adad7401222730189860c7 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 13:14:16 +0000
Subject: [PATCH 18/47] go back to original

---
 backend/address2UPRN/main.py | 272 +++++++++++++++++++++++++++++------
 1 file changed, 226 insertions(+), 46 deletions(-)

diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py
index c458e40d..af29a095 100644
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@@ -1,11 +1,13 @@
-from typing import Optional
-
 from epc_api.client import EpcClient
 import os
 from urllib.parse import urlencode
 import pandas as pd
+from difflib import SequenceMatcher
 from utils.logger import setup_logger
+import re
+from typing import Set
 import json
+import requests
 from uuid import UUID
 import uuid
 from backend.app.db.functions.tasks.Tasks import SubTaskInterface
@@ -16,8 +18,6 @@ from utils.s3 import (
 )
 from datetime import datetime
 
-from backend.utils.addressMatch import AddressMatch
-
 logger = setup_logger()
 
 
@@ -29,6 +29,191 @@ if EPC_AUTH_TOKEN is None:
     raise RuntimeError("EPC_AUTH_TOKEN not defined in env")
 
 
+def is_valid_postcode(postcode_clean: str) -> bool:
+    """
+    Validate postcode using postcodes.io.
+
+    Expects a sanitised postcode (e.g. E84SQ).
+    Returns True if valid, False otherwise.
+    """
+    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
+    if not postcode_clean:
+        return False
+
+    try:
+        resp = requests.get(
+            POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
+            timeout=5,
+        )
+        resp.raise_for_status()
+        return resp.json().get("result", False)
+    except requests.RequestException:
+        # Network issues, rate limits, etc.
+        return False
+
+
+def levenshtein(a: str, b: str) -> float:
+    """
+    Address similarity score in [0, 1].
+
+    Strategy:
+    - Normalise
+    - Strongly penalise mismatched house/flat numbers
+    - Combine token overlap + character similarity
+    """
+
+    def extract_number_sequence(s: str) -> list[str]:
+        return re.findall(r"\d+[a-z]?", s)
+
+    def extract_numbers(s: str) -> Set[str]:
+        return set(extract_number_sequence(s))
+
+    def tokenise(s: str) -> Set[str]:
+        return set(s.split())
+
+    def extract_building_number(s: str) -> str | None:
+        """
+        Extract the main building number (NOT flat/unit).
+        Assumes formats like:
+        - '42 moreton road'
+        - 'flat 3 42 moreton road'
+        """
+        tokens = s.split()
+
+        # remove flat/unit context
+        cleaned = []
+        skip_next = False
+        for t in tokens:
+            if t in ("flat", "apt", "apartment", "unit"):
+                skip_next = True
+                continue
+            if skip_next:
+                skip_next = False
+                continue
+            cleaned.append(t)
+
+        # first remaining number is building number
+        for t in cleaned:
+            if re.fullmatch(r"\d+[a-z]?", t):
+                return t
+
+        return None
+
+    a_norm = normalise_address(a)
+    b_norm = normalise_address(b)
+
+    # --- hard signal: numbers ---
+    nums_a = extract_numbers(a_norm)
+    nums_b = extract_numbers(b_norm)
+
+    if nums_a and not nums_b:
+        return 0.0
+
+    # No shared numbers at all → impossible match
+    if nums_a and nums_b and nums_a.isdisjoint(nums_b):
+        return 0.0
+
+    # 🔒 HARD GUARD: building number must match
+    bld_a = extract_building_number(a_norm)
+    bld_b = extract_building_number(b_norm)
+
+    if bld_a and bld_b and bld_a != bld_b:
+        return 0.0
+
+    # --- order-sensitive flat/building guard ---
+    seq_a = extract_number_sequence(a_norm)
+    seq_b = extract_number_sequence(b_norm)
+
+    has_flat_token_user = any(
+        tok in a_norm for tok in ("flat", "apt", "apartment", "unit")
+    )
+    has_flat_token_epc = "flat" in b_norm
+
+    if (
+        len(seq_a) == 2
+        and len(seq_b) >= 2
+        and has_flat_token_epc
+        and not has_flat_token_user
+        and seq_a != seq_b[:2]
+    ):
+        return 0.0
+
+    # --- token similarity (order-independent) ---
+    toks_a = tokenise(a_norm)
+    toks_b = tokenise(b_norm)
+
+    if not toks_a or not toks_b:
+        token_score = 0.0
+    else:
+        token_score = len(toks_a & toks_b) / len(toks_a | toks_b)
+
+    # --- character similarity (soft signal) ---
+    char_score = SequenceMatcher(None, a_norm, b_norm).ratio()
+
+    # --- weighted blend ---
+    return round(
+        0.65 * token_score + 0.35 * char_score,
+        4,
+    )
+
+
+def normalise_address(s: str) -> str:
+    """
+    Canonical UK-focused address normalisation.
+
+    - Lowercases
+    - Removes punctuation (keeps / for flats)
+    - Normalises whitespace
+    - Applies synonym compression at token level
+    """
+
+    if not s:
+        return ""
+
+    ADDRESS_SYNONYMS = {
+        # street types
+        "rd": "road",
+        "rd.": "road",
+        "st": "street",
+        "st.": "street",
+        "ave": "avenue",
+        "ave.": "avenue",
+        "ln": "lane",
+        "ln.": "lane",
+        "cres": "crescent",
+        "ct": "court",
+        "dr": "drive",
+        # flats / units
+        "apt": "flat",
+        "apartment": "flat",
+        "unit": "flat",
+        "ste": "suite",
+        # numbering noise
+        "no": "",
+        "no.": "",
+    }
+    # 1. lowercase
+    s = s.lower()
+
+    # 1.5 split digit-letter suffixes
+    s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s)
+
+    # 2. remove punctuation except /
+    s = re.sub(r"[^\w\s/]", " ", s)
+
+    # 3. normalise whitespace
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # 4. tokenise + synonym normalisation
+    tokens = []
+    for tok in s.split():
+        replacement = ADDRESS_SYNONYMS.get(tok, tok)
+        if replacement:
+            tokens.append(replacement)
+
+    return " ".join(tokens)
+
+
 def score_addresses(
     df: pd.DataFrame,
     user_address: str,
@@ -37,7 +222,7 @@ def score_addresses(
     if column not in df.columns:
         raise ValueError(f"Missing column: {column}")
 
-    return df[column].apply(lambda x: AddressMatch.score(user_address, x))
+    return df[column].apply(lambda x: levenshtein(user_address, x))
 
 
 def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
@@ -129,11 +314,9 @@ def get_uprn_candidates(
 
     out = df.copy()
 
-    user_norm = AddressMatch.normalise_address(user_address)
+    user_norm = normalise_address(user_address)
 
-    out["lexiscore"] = out[address_column].apply(
-        lambda x: AddressMatch.levenshtein(user_norm, x)
-    )
+    out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x))
 
     # Normalise UPRN to string
     out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
@@ -297,10 +480,7 @@ def resolve_uprns_for_postcode_group(
 
 
 def save_results_to_s3(
-    results_df: pd.DataFrame,
-    task_id: str,
-    sub_task_id: str,
-    bucket_name: Optional[str] = None,
+    results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None
 ) -> bool:
     """
     Save results DataFrame to S3 as CSV.
@@ -351,9 +531,9 @@ def handler(event, context, local=False):
                 {
                     "body": json.dumps(
                         {
-                            "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
-                            "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
-                            "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv",
+                            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
+                            "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
+                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv",
                         }
                     )
                 }
@@ -441,9 +621,19 @@ def handler(event, context, local=False):
             # Process the rows
             logger.info(f"Processing {len(df)} rows for task {task_id}")
 
-            df["postcode_clean"] = (
-                df["Postcode"].astype(str).str.upper().str.strip().str.replace(" ", "")
-            )
+            # Create user_input column by concatenating Address columns if not already present
+            if "user_input" not in df.columns:
+                df["user_input"] = (
+                    df["Address 1"].fillna("")
+                    + " "
+                    + df["Address 2"].fillna("")
+                    + " "
+                    + df["Address 3"].fillna("")
+                ).str.strip()
+                logger.info(f"Created user_input column from Address 1 and Address 2")
+            else:
+                logger.info(f"user_input column already present in data")
+
             clean_df = df.dropna(subset=["postcode_clean"])
 
             postcode_to_addresses = {
@@ -463,7 +653,7 @@ def handler(event, context, local=False):
                 )
 
                 # Validate postcode before processing
-                if not AddressMatch.is_valid_postcode(postcode):
+                if not is_valid_postcode(postcode):
                     logger.warning(f"Postcode {postcode} is invalid, skipping")
                     continue
 
@@ -482,67 +672,57 @@ def handler(event, context, local=False):
                 # Process each address in this postcode with the same EPC data
                 for row in postcode_rows:
                     try:
-                        # Concatenate Address columns directly
-                        address2uprn_user_input = (
-                            str(row.get("Address 1", "")).strip()
-                            + " "
-                            + str(row.get("Address 2", "")).strip()
-                            + " "
-                            + str(row.get("Address 3", "")).strip()
-                        ).strip()
-
-                        if not address2uprn_user_input:
+                        user_input = row.get("user_input", "")
+                        if not user_input:
                             logger.warning(
-                                f"Skipping row with missing address components for postcode {postcode}"
+                                f"Skipping row with missing user_input for postcode {postcode}"
                             )
                             continue
 
                         # Get UPRN using the pre-fetched EPC data with all return options
                         result = get_uprn_with_epc_df(
-                            user_inputed_address=address2uprn_user_input,
-                            epc_df=epc_df,
-                            verbose=True,
+                            user_inputed_address=user_input, epc_df=epc_df, verbose=True
                         )
 
                         # Parse result tuple if successful
                         if result:
                             uprn, found_address, score = result
                             logger.info(
-                                f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})"
+                                f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})"
                             )
 
                             results_data.append(
                                 {
                                     **row,  # Include all original data
-                                    "address2uprn_uprn": uprn,
-                                    "address2uprn_address": found_address,
-                                    "address2uprn_lexiscore": score,
+                                    "uprn": uprn,
+                                    "domna_found_address": found_address,
+                                    "domna_lexiscore": score,
                                 }
                             )
                         else:
                             logger.warning(
-                                f"No UPRN found for {address2uprn_user_input} in {postcode}"
+                                f"No UPRN found for {user_input} in {postcode}"
                             )
                             results_data.append(
                                 {
                                     **row,  # Include all original data
-                                    "address2uprn_uprn": None,
-                                    "address2uprn_address": None,
-                                    "address2uprn_lexiscore": None,
+                                    "uprn": None,
+                                    "domna_found_address": None,
+                                    "domna_lexiscore": None,
                                 }
                             )
 
                     except Exception as e:
                         logger.error(
-                            f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}"
+                            f"Error processing address {row.get('user_input', 'unknown')}: {e}"
                         )
                         # Still add the row with error markers
                         results_data.append(
                             {
                                 **row,
-                                "address2uprn_uprn": None,
-                                "address2uprn_address": None,
-                                "address2uprn_lexiscore": None,
+                                "uprn": None,
+                                "domna_found_address": None,
+                                "domna_lexiscore": None,
                                 "error": str(e),
                             }
                         )

From f69a6151404f9d30c6ff85a91921c5eff563b050 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 17:13:49 +0000
Subject: [PATCH 19/47] revert to old one

---
 backend/address2UPRN/main.py | 265 ++++++-----------------------------
 1 file changed, 41 insertions(+), 224 deletions(-)

diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py
index af29a095..d0ba36e6 100644
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@@ -1,13 +1,11 @@
+from typing import Optional
+
 from epc_api.client import EpcClient
 import os
 from urllib.parse import urlencode
 import pandas as pd
-from difflib import SequenceMatcher
 from utils.logger import setup_logger
-import re
-from typing import Set
 import json
-import requests
 from uuid import UUID
 import uuid
 from backend.app.db.functions.tasks.Tasks import SubTaskInterface
@@ -18,6 +16,8 @@ from utils.s3 import (
 )
 from datetime import datetime
 
+from backend.utils.addressMatch import AddressMatch
+
 logger = setup_logger()
 
 
@@ -29,191 +29,6 @@ if EPC_AUTH_TOKEN is None:
     raise RuntimeError("EPC_AUTH_TOKEN not defined in env")
 
 
-def is_valid_postcode(postcode_clean: str) -> bool:
-    """
-    Validate postcode using postcodes.io.
-
-    Expects a sanitised postcode (e.g. E84SQ).
-    Returns True if valid, False otherwise.
-    """
-    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
-    if not postcode_clean:
-        return False
-
-    try:
-        resp = requests.get(
-            POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean),
-            timeout=5,
-        )
-        resp.raise_for_status()
-        return resp.json().get("result", False)
-    except requests.RequestException:
-        # Network issues, rate limits, etc.
-        return False
-
-
-def levenshtein(a: str, b: str) -> float:
-    """
-    Address similarity score in [0, 1].
-
-    Strategy:
-    - Normalise
-    - Strongly penalise mismatched house/flat numbers
-    - Combine token overlap + character similarity
-    """
-
-    def extract_number_sequence(s: str) -> list[str]:
-        return re.findall(r"\d+[a-z]?", s)
-
-    def extract_numbers(s: str) -> Set[str]:
-        return set(extract_number_sequence(s))
-
-    def tokenise(s: str) -> Set[str]:
-        return set(s.split())
-
-    def extract_building_number(s: str) -> str | None:
-        """
-        Extract the main building number (NOT flat/unit).
-        Assumes formats like:
-        - '42 moreton road'
-        - 'flat 3 42 moreton road'
-        """
-        tokens = s.split()
-
-        # remove flat/unit context
-        cleaned = []
-        skip_next = False
-        for t in tokens:
-            if t in ("flat", "apt", "apartment", "unit"):
-                skip_next = True
-                continue
-            if skip_next:
-                skip_next = False
-                continue
-            cleaned.append(t)
-
-        # first remaining number is building number
-        for t in cleaned:
-            if re.fullmatch(r"\d+[a-z]?", t):
-                return t
-
-        return None
-
-    a_norm = normalise_address(a)
-    b_norm = normalise_address(b)
-
-    # --- hard signal: numbers ---
-    nums_a = extract_numbers(a_norm)
-    nums_b = extract_numbers(b_norm)
-
-    if nums_a and not nums_b:
-        return 0.0
-
-    # No shared numbers at all → impossible match
-    if nums_a and nums_b and nums_a.isdisjoint(nums_b):
-        return 0.0
-
-    # 🔒 HARD GUARD: building number must match
-    bld_a = extract_building_number(a_norm)
-    bld_b = extract_building_number(b_norm)
-
-    if bld_a and bld_b and bld_a != bld_b:
-        return 0.0
-
-    # --- order-sensitive flat/building guard ---
-    seq_a = extract_number_sequence(a_norm)
-    seq_b = extract_number_sequence(b_norm)
-
-    has_flat_token_user = any(
-        tok in a_norm for tok in ("flat", "apt", "apartment", "unit")
-    )
-    has_flat_token_epc = "flat" in b_norm
-
-    if (
-        len(seq_a) == 2
-        and len(seq_b) >= 2
-        and has_flat_token_epc
-        and not has_flat_token_user
-        and seq_a != seq_b[:2]
-    ):
-        return 0.0
-
-    # --- token similarity (order-independent) ---
-    toks_a = tokenise(a_norm)
-    toks_b = tokenise(b_norm)
-
-    if not toks_a or not toks_b:
-        token_score = 0.0
-    else:
-        token_score = len(toks_a & toks_b) / len(toks_a | toks_b)
-
-    # --- character similarity (soft signal) ---
-    char_score = SequenceMatcher(None, a_norm, b_norm).ratio()
-
-    # --- weighted blend ---
-    return round(
-        0.65 * token_score + 0.35 * char_score,
-        4,
-    )
-
-
-def normalise_address(s: str) -> str:
-    """
-    Canonical UK-focused address normalisation.
-
-    - Lowercases
-    - Removes punctuation (keeps / for flats)
-    - Normalises whitespace
-    - Applies synonym compression at token level
-    """
-
-    if not s:
-        return ""
-
-    ADDRESS_SYNONYMS = {
-        # street types
-        "rd": "road",
-        "rd.": "road",
-        "st": "street",
-        "st.": "street",
-        "ave": "avenue",
-        "ave.": "avenue",
-        "ln": "lane",
-        "ln.": "lane",
-        "cres": "crescent",
-        "ct": "court",
-        "dr": "drive",
-        # flats / units
-        "apt": "flat",
-        "apartment": "flat",
-        "unit": "flat",
-        "ste": "suite",
-        # numbering noise
-        "no": "",
-        "no.": "",
-    }
-    # 1. lowercase
-    s = s.lower()
-
-    # 1.5 split digit-letter suffixes
-    s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s)
-
-    # 2. remove punctuation except /
-    s = re.sub(r"[^\w\s/]", " ", s)
-
-    # 3. normalise whitespace
-    s = re.sub(r"\s+", " ", s).strip()
-
-    # 4. tokenise + synonym normalisation
-    tokens = []
-    for tok in s.split():
-        replacement = ADDRESS_SYNONYMS.get(tok, tok)
-        if replacement:
-            tokens.append(replacement)
-
-    return " ".join(tokens)
-
-
 def score_addresses(
     df: pd.DataFrame,
     user_address: str,
@@ -222,7 +37,7 @@ def score_addresses(
     if column not in df.columns:
         raise ValueError(f"Missing column: {column}")
 
-    return df[column].apply(lambda x: levenshtein(user_address, x))
+    return df[column].apply(lambda x: AddressMatch.score(user_address, x))
 
 
 def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
@@ -314,9 +129,11 @@ def get_uprn_candidates(
 
     out = df.copy()
 
-    user_norm = normalise_address(user_address)
+    user_norm = AddressMatch.normalise_address(user_address)
 
-    out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x))
+    out["lexiscore"] = out[address_column].apply(
+        lambda x: AddressMatch.levenshtein(user_norm, x)
+    )
 
     # Normalise UPRN to string
     out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
@@ -480,7 +297,10 @@ def resolve_uprns_for_postcode_group(
 
 
 def save_results_to_s3(
-    results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None
+    results_df: pd.DataFrame,
+    task_id: str,
+    sub_task_id: str,
+    bucket_name: Optional[str] = None,
 ) -> bool:
     """
     Save results DataFrame to S3 as CSV.
@@ -533,7 +353,7 @@ def handler(event, context, local=False):
                         {
                             "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
                             "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d",
-                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv",
+                            "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv",
                         }
                     )
                 }
@@ -621,19 +441,6 @@ def handler(event, context, local=False):
             # Process the rows
             logger.info(f"Processing {len(df)} rows for task {task_id}")
 
-            # Create user_input column by concatenating Address columns if not already present
-            if "user_input" not in df.columns:
-                df["user_input"] = (
-                    df["Address 1"].fillna("")
-                    + " "
-                    + df["Address 2"].fillna("")
-                    + " "
-                    + df["Address 3"].fillna("")
-                ).str.strip()
-                logger.info(f"Created user_input column from Address 1 and Address 2")
-            else:
-                logger.info(f"user_input column already present in data")
-
             clean_df = df.dropna(subset=["postcode_clean"])
 
             postcode_to_addresses = {
@@ -653,7 +460,7 @@ def handler(event, context, local=False):
                 )
 
                 # Validate postcode before processing
-                if not is_valid_postcode(postcode):
+                if not AddressMatch.is_valid_postcode(postcode):
                     logger.warning(f"Postcode {postcode} is invalid, skipping")
                     continue
 
@@ -672,57 +479,67 @@ def handler(event, context, local=False):
                 # Process each address in this postcode with the same EPC data
                 for row in postcode_rows:
                     try:
-                        user_input = row.get("user_input", "")
-                        if not user_input:
+                        # Concatenate Address columns directly
+                        address2uprn_user_input = (
+                            str(row.get("Address 1", "")).strip()
+                            + " "
+                            + str(row.get("Address 2", "")).strip()
+                            + " "
+                            + str(row.get("Address 3", "")).strip()
+                        ).strip()
+
+                        if not address2uprn_user_input:
                             logger.warning(
-                                f"Skipping row with missing user_input for postcode {postcode}"
+                                f"Skipping row with missing address components for postcode {postcode}"
                             )
                             continue
 
                         # Get UPRN using the pre-fetched EPC data with all return options
                         result = get_uprn_with_epc_df(
-                            user_inputed_address=user_input, epc_df=epc_df, verbose=True
+                            user_inputed_address=address2uprn_user_input,
+                            epc_df=epc_df,
+                            verbose=True,
                         )
 
                         # Parse result tuple if successful
                         if result:
                             uprn, found_address, score = result
                             logger.info(
-                                f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})"
+                                f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})"
                             )
 
                             results_data.append(
                                 {
                                     **row,  # Include all original data
-                                    "uprn": uprn,
-                                    "domna_found_address": found_address,
-                                    "domna_lexiscore": score,
+                                    "address2uprn_uprn": uprn,
+                                    "address2uprn_address": found_address,
+                                    "address2uprn_lexiscore": score,
                                 }
                             )
                         else:
                             logger.warning(
-                                f"No UPRN found for {user_input} in {postcode}"
+                                f"No UPRN found for {address2uprn_user_input} in {postcode}"
                             )
                             results_data.append(
                                 {
                                     **row,  # Include all original data
-                                    "uprn": None,
-                                    "domna_found_address": None,
-                                    "domna_lexiscore": None,
+                                    "address2uprn_uprn": None,
+                                    "address2uprn_address": None,
+                                    "address2uprn_lexiscore": None,
                                 }
                             )
 
                     except Exception as e:
                         logger.error(
-                            f"Error processing address {row.get('user_input', 'unknown')}: {e}"
+                            f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}"
                         )
                         # Still add the row with error markers
                         results_data.append(
                             {
                                 **row,
-                                "uprn": None,
-                                "domna_found_address": None,
-                                "domna_lexiscore": None,
+                                "address2uprn_uprn": None,
+                                "address2uprn_address": None,
+                                "address2uprn_lexiscore": None,
                                 "error": str(e),
                             }
                         )

From fc425b8b66d38305967e22d4a040a316848ddf35 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 17 Mar 2026 17:18:09 +0000
Subject: [PATCH 20/47] better comments

---
 etl/hubspot/hubspotClient.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index f93a736c..ed456478 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -162,7 +162,7 @@ class HubspotClient:
         response: AssociationsPageResponse = associations_api.get_page(  # type: ignore[reportUnknownMemberType]
             object_type="deals",
             object_id=deal_id,
-            to_object_type="0-420",  # <-- use your exact custom object name slug here
+            to_object_type="0-420",  # <-- to get an listing object
             limit=1,
         )
 
@@ -373,9 +373,7 @@ class HubspotClient:
         properties: dict[str, str] = cast(dict[str, str], product.properties)  # type: ignore[reportUnknownMemberType]
 
         name: str = properties.get("name") or ""
-        price: str = (
-            properties.get("price") or properties.get("hs_price") or "0"
-        )
+        price: str = properties.get("price") or properties.get("hs_price") or "0"
 
         # Build line item payload
         line_item_input = SimplePublicObjectInput(

From e01b7225bbd62d501faeb85982ad873c6b26eedd Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 24 Mar 2026 13:01:56 +0000
Subject: [PATCH 21/47] save

---
 asset_list/app.py                             | 19 +++++++++----------
 backend/address2UPRN/README.md                | 15 ++++++++-------
 .../scripts/combine_address2uprn_outputs.py   | 12 ++++++------
 sfr/principal_pitch/2_export_data.py          |  6 +++---
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/asset_list/app.py b/asset_list/app.py
index 7858146d..02c94f10 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -74,24 +74,23 @@ def app():
     """
 
     data_folder = "/workspaces/model/asset_list"
-    # data_filename = "For Modelling - Final - reviewed.xlsx"
-    data_filename = "assests.xlsx"
-    sheet_name = "Sheet1"
-    postcode_column = "POSTCODE"
-    address1_column = "ADDRESS"
+    data_filename = "Calico ARA Upload Review.xlsx"
+    sheet_name = "Upload to Ara - Needs Sign Off"
+    postcode_column = "Postcode"
+    address1_column = "Address 1"
     address1_method = None
-    fulladdress_column = "ADDRESS"
+    fulladdress_column = "Address 1"
     address_cols_to_concat = []
     missing_postcodes_method = None
     landlord_year_built = None
-    landlord_os_uprn = None
-    landlord_property_type = None
-    landlord_built_form = None
+    landlord_os_uprn = "ara_found_uprn"
+    landlord_property_type = "Property Type"
+    landlord_built_form = "Property Type"
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-    landlord_property_id = "UPRN"
+    landlord_property_id = "Asset Reference"
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md
index e34e45f6..646fec01 100644
--- a/backend/address2UPRN/README.md
+++ b/backend/address2UPRN/README.md
@@ -9,7 +9,7 @@ I believe lower and upper case matter:
 * Address 1
 * Address 2
 * Address 3
-* Postcode
+* postcode
 
 And save it as a .csv file
 
@@ -24,18 +24,19 @@ For this example I'll be using "s3://retrofit-data-dev/ara_raw_inputs/calico/Cal
 
 Go to Ara DB and make a new task_id with a randomly generated uuid as the primarily key
 
-task_id = 169ea9b0-01b5-48dc-9f90-ae1989491d09 
-sub_task_id = e5704f9e-29fe-43c8-8913-05be09f2440f
-s3 => s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching Rerun After Address Fix.csv
+task_id = ea615ac3-ac28-46c4-8bff-2431c5b9c13d
+sub_task_id = 85a23b67-8f18-4299-9bf0-69bfb87adbc7
+s3 => s3://retrofit-data-dev/ara_raw_inputs/eon/North Tyneside Council.csv 
 
 Step 3) Alright, now lets make the input for postcode-splitter sqs to get the ball rolling
 postcode-splitter-sqs => https://eu-west-2.console.aws.amazon.com/sqs/v3/home?region=eu-west-2#/queues/https%3A%2F%2Fsqs.eu-west-2.amazonaws.com%2F337213553626%2Fpostcode-splitter-queue-dev
 
 {
-    "task_id": "169ea9b0-01b5-48dc-9f90-ae1989491d09",
-    "sub_task_id": "e5704f9e-29fe-43c8-8913-05be09f2440f",
-    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/calico/Calico UPRN Matching (1)(Sheet1).csv"
+    "task_id": "ea615ac3-ac28-46c4-8bff-2431c5b9c13d",
+    "sub_task_id": "85a23b67-8f18-4299-9bf0-69bfb87adbc7",
+    "s3_uri": "s3://retrofit-data-dev/ara_raw_inputs/eon/eon(Sheet1).csv"
 }
+
 Each batch of csv should be saved in retrofit-data-dev/ara_postcode_splitter_batches/<task-id>/<sub-task-id>/<timestamp:uuid4>.csv
 
 outputs of address2uprn ( which is automatically triggered on postcodesplitter) will be saved on retrofit-data-dev/ara_raw_outputs/<task-id>/<subtask-id>/<timestamp:uuid4>.csv
diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py
index be17f610..f065c676 100644
--- a/backend/scripts/combine_address2uprn_outputs.py
+++ b/backend/scripts/combine_address2uprn_outputs.py
@@ -55,11 +55,11 @@ def main(task_id, output):
     print(f"Total rows: {len(combined)}")
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("task_id", help="Task ID folder in S3")
-    parser.add_argument("--output", default="combined.csv")
+# if __name__ == "__main__":
+#     parser = argparse.ArgumentParser()
+#     parser.add_argument("task_id", help="Task ID folder in S3")
+#     parser.add_argument("--output", default="combined.csv")
 
-    args = parser.parse_args()
+#     args = parser.parse_args()
 
-    main(args.task_id, args.output)
+#     main(args.task_id, args.output)
diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py
index 519636be..df54749e 100644
--- a/sfr/principal_pitch/2_export_data.py
+++ b/sfr/principal_pitch/2_export_data.py
@@ -28,10 +28,10 @@ from sqlalchemy import func
 
 # PORTFOLIO_ID = 206
 # SCENARIOS = [389]
-PORTFOLIO_ID = 581
-SCENARIOS = [1124]
+PORTFOLIO_ID = 633
+SCENARIOS = [1146]
 scenario_names = {
-    1124: "EPC C - Solar Focused",
+    1146: "Most Economic",
 }
 
 project_name = "WCHG EPC D rated properties"

From a362e1dd99f83352414e5663679ebbc125740716 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 24 Mar 2026 15:34:33 +0000
Subject: [PATCH 22/47] added company information

---
 backend/app/db/models/organisation.py         | 13 +++++
 etl/hubspot/hubspotDataTodB.py                | 50 +++++++++++++++++++
 .../scripts/onboarding/new_organisation.py    | 19 +++++++
 3 files changed, 82 insertions(+)
 create mode 100644 backend/app/db/models/organisation.py
 create mode 100644 etl/hubspot/hubspotDataTodB.py
 create mode 100644 etl/hubspot/scripts/onboarding/new_organisation.py

diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py
new file mode 100644
index 00000000..774a05af
--- /dev/null
+++ b/backend/app/db/models/organisation.py
@@ -0,0 +1,13 @@
+from sqlmodel import SQLModel, Field
+from datetime import datetime, timezone
+from typing import Optional
+import uuid
+
+
+class Organisation(SQLModel, table=True):
+    __tablename__ = "organisation"
+    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
+    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+    updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+    hubspot_company_id: Optional[str] = None
+    name: Optional[str] = None
diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py
new file mode 100644
index 00000000..24df240e
--- /dev/null
+++ b/etl/hubspot/hubspotDataTodB.py
@@ -0,0 +1,50 @@
+from backend.app.db.connection import db_session
+from backend.app.db.models.organisation import Organisation
+from sqlmodel import select
+from datetime import datetime, timezone
+from typing import TypedDict
+
+
+class CompanyData(TypedDict):
+    hs_object_id: str
+    name: str
+
+
+class HubspotDataToDb:
+    def __init__(self):
+        pass
+
+    def read_org_table(self, limit: int = 10):
+        with db_session() as session:
+            records = session.exec(select(Organisation).limit(limit)).all()
+            return records
+
+    def upsert_company(self, company_data: CompanyData) -> Organisation:
+        """Upserts a company record. Updates if hubspot_company_id exists, otherwise creates new."""
+        with db_session() as session:
+            hubspot_id = company_data.get("hs_object_id")
+            company_name = company_data.get("name")
+
+            # Check if company already exists
+            existing = session.exec(
+                select(Organisation).where(
+                    Organisation.hubspot_company_id == hubspot_id
+                )
+            ).first()
+
+            if existing:
+                # Update existing record
+                existing.name = company_name
+                existing.updated_at = datetime.now(timezone.utc)
+                session.add(existing)
+                record = existing
+            else:
+                # Create new record
+                record = Organisation(
+                    hubspot_company_id=hubspot_id,
+                    name=company_name,
+                )
+                session.add(record)
+
+            session.commit()
+            return record
diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py
new file mode 100644
index 00000000..f5faead3
--- /dev/null
+++ b/etl/hubspot/scripts/onboarding/new_organisation.py
@@ -0,0 +1,19 @@
+from etl.hubspot.hubspotClient import HubspotClient, Companies
+
+from etl.hubspot.hubspotDataTodB import HubspotDataToDb
+
+hubspot = HubspotClient()
+
+companies_to_add_or_ensure_it_exists = [
+    Companies.THE_GUINESS_PARTNERSHIP,
+    Companies.SOUTHERN_HOUSING_GROUP,
+]
+
+for company in companies_to_add_or_ensure_it_exists:
+    company_info = hubspot.get_company_information(company.value)
+    company_info
+    break
+
+dbRead = HubspotDataToDb()
+
+dbRead.read_org_table()

From 29ab9ecfd778940cfac77161dbb8859c9fba9394 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 24 Mar 2026 15:43:10 +0000
Subject: [PATCH 23/47] added hubspot company data

---
 etl/hubspot/hubspotClient.py                       |  5 +++--
 etl/hubspot/hubspotDataTodB.py                     | 11 ++++++++---
 etl/hubspot/scripts/onboarding/new_organisation.py | 14 +++++++-------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index ed456478..c87ea872 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -25,6 +25,7 @@ from hubspot.crm.associations.v4.models import (  # type: ignore[reportMissingTy
     ForwardPaging as AssociationsPaging,
     NextPage as AssociationsPagingNext,
 )
+from etl.hubspot.hubspotDataTodB import CompanyData
 
 
 from backend.app.config import get_settings
@@ -223,7 +224,7 @@ class HubspotClient:
 
         return deal, company, listing
 
-    def get_company_information(self, company_id: str) -> dict[str, str]:
+    def get_company_information(self, company_id: str) -> CompanyData:
         companies_api: CompaniesBasicApi = self.client.crm.companies.basic_api  # type: ignore[reportUnknownMemberType]
 
         company: HubspotObject = companies_api.get_by_id(  # type: ignore[reportUnknownMemberType]
@@ -233,7 +234,7 @@ class HubspotClient:
             ],
         )
 
-        company_info: dict[str, str] = cast(dict[str, str], company.properties)  # type: ignore[reportUnknownMemberType]
+        company_info: CompanyData = company.properties  # type: ignore[reportUnknownMemberType]
         return company_info
 
     def get_all_pipelines(self) -> list[dict[str, str]]:
diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py
index 24df240e..8fe61a3e 100644
--- a/etl/hubspot/hubspotDataTodB.py
+++ b/etl/hubspot/hubspotDataTodB.py
@@ -1,4 +1,4 @@
-from backend.app.db.connection import db_session
+from backend.app.db.connection import db_read_session
 from backend.app.db.models.organisation import Organisation
 from sqlmodel import select
 from datetime import datetime, timezone
@@ -15,13 +15,18 @@ class HubspotDataToDb:
         pass
 
     def read_org_table(self, limit: int = 10):
-        with db_session() as session:
+        with db_read_session() as session:
             records = session.exec(select(Organisation).limit(limit)).all()
             return records
 
+    def get_org_names(self, limit: int = 10) -> list[str]:
+        """Returns a list of organisation names."""
+        records = self.read_org_table(limit)
+        return [org.name for org in records if org.name]
+
     def upsert_company(self, company_data: CompanyData) -> Organisation:
         """Upserts a company record. Updates if hubspot_company_id exists, otherwise creates new."""
-        with db_session() as session:
+        with db_read_session() as session:
             hubspot_id = company_data.get("hs_object_id")
             company_name = company_data.get("name")
 
diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py
index f5faead3..5a11266f 100644
--- a/etl/hubspot/scripts/onboarding/new_organisation.py
+++ b/etl/hubspot/scripts/onboarding/new_organisation.py
@@ -1,19 +1,19 @@
 from etl.hubspot.hubspotClient import HubspotClient, Companies
 
-from etl.hubspot.hubspotDataTodB import HubspotDataToDb
+from etl.hubspot.hubspotDataTodB import HubspotDataToDb, CompanyData
 
 hubspot = HubspotClient()
-
+dbRead = HubspotDataToDb()
 companies_to_add_or_ensure_it_exists = [
     Companies.THE_GUINESS_PARTNERSHIP,
     Companies.SOUTHERN_HOUSING_GROUP,
 ]
 
 for company in companies_to_add_or_ensure_it_exists:
-    company_info = hubspot.get_company_information(company.value)
-    company_info
-    break
+    company_info: CompanyData = hubspot.get_company_information(company.value)
+    dbRead.upsert_company(company_info)
+
 
 dbRead = HubspotDataToDb()
-
-dbRead.read_org_table()
+names = dbRead.get_org_names()
+print(f"Organisations in database: {names}")

From da039b91b2f3340ba27048af9b0f27004e1378b7 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 24 Mar 2026 15:55:17 +0000
Subject: [PATCH 24/47] hubspot etl for organisation complete

---
 etl/hubspot/hubspotClient.py                       | 1 +
 etl/hubspot/scripts/onboarding/new_organisation.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index c87ea872..6fd11bed 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -43,6 +43,7 @@ class Companies(Enum):
     HOMEGROUP = "94946071794"
     APPLE = "184769046716"
     THE_GUINESS_PARTNERSHIP = "86970043613"
+    CALICO_HOMES = "86975437046"
 
 
 class DealStage(Enum):
diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py
index 5a11266f..f2ff8bda 100644
--- a/etl/hubspot/scripts/onboarding/new_organisation.py
+++ b/etl/hubspot/scripts/onboarding/new_organisation.py
@@ -7,6 +7,7 @@ dbRead = HubspotDataToDb()
 companies_to_add_or_ensure_it_exists = [
     Companies.THE_GUINESS_PARTNERSHIP,
     Companies.SOUTHERN_HOUSING_GROUP,
+    Companies.CALICO_HOMES,
 ]
 
 for company in companies_to_add_or_ensure_it_exists:

From 934e666357d7339131b70548651440ae30e9ccf2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 14:41:38 +0000
Subject: [PATCH 25/47] added hubspot to add one deal with deal id

---
 asset_list/app.py                             |  16 +-
 backend/app/db/models/organisation.py         |  47 ++-
 etl/hubspot/hubspotDataTodB.py                | 290 +++++++++++++++++-
 etl/hubspot/requirements.txt                  |   2 +-
 etl/hubspot/s3_uploader.py                    | 116 +++++++
 etl/hubspot/scripts/scraper/README.md         |  15 +
 etl/hubspot/scripts/scraper/__init__.py       |   0
 .../scripts/scraper/handler/Dockerfile        |  38 +++
 .../scripts/scraper/handler/requirements.txt  |  12 +
 .../scraper/local_handler/docker-compose.yml  |  11 +
 .../local_handler/invoke_local_lambda.py      |  28 ++
 .../scraper/local_handler/run_local.sh        |   2 +
 etl/hubspot/scripts/scraper/main.py           |  45 +++
 13 files changed, 610 insertions(+), 12 deletions(-)
 create mode 100644 etl/hubspot/s3_uploader.py
 create mode 100644 etl/hubspot/scripts/scraper/README.md
 create mode 100644 etl/hubspot/scripts/scraper/__init__.py
 create mode 100644 etl/hubspot/scripts/scraper/handler/Dockerfile
 create mode 100644 etl/hubspot/scripts/scraper/handler/requirements.txt
 create mode 100644 etl/hubspot/scripts/scraper/local_handler/docker-compose.yml
 create mode 100644 etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py
 create mode 100644 etl/hubspot/scripts/scraper/local_handler/run_local.sh
 create mode 100644 etl/hubspot/scripts/scraper/main.py

diff --git a/asset_list/app.py b/asset_list/app.py
index 02c94f10..5794eaf3 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -75,22 +75,22 @@ def app():
 
     data_folder = "/workspaces/model/asset_list"
     data_filename = "Calico ARA Upload Review.xlsx"
-    sheet_name = "Upload to Ara - Needs Sign Off"
+    sheet_name = "Sheet1"
     postcode_column = "Postcode"
-    address1_column = "Address 1"
+    address1_column = "Units"
     address1_method = None
-    fulladdress_column = "Address 1"
-    address_cols_to_concat = []
+    fulladdress_column = "Units"
+    address_cols_to_concat = ["Units"]
     missing_postcodes_method = None
     landlord_year_built = None
-    landlord_os_uprn = "ara_found_uprn"
-    landlord_property_type = "Property Type"
-    landlord_built_form = "Property Type"
+    landlord_os_uprn = None
+    landlord_property_type = None  # Good to include if landlord gave
+    landlord_built_form = None  # Good to include if landlord gave
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-    landlord_property_id = "Asset Reference"
+    landlord_property_id = "llid"
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py
index 774a05af..a3c79e3c 100644
--- a/backend/app/db/models/organisation.py
+++ b/backend/app/db/models/organisation.py
@@ -1,6 +1,8 @@
-from sqlmodel import SQLModel, Field
+from sqlmodel import SQLModel, Field, Column, text
 from datetime import datetime, timezone
 from typing import Optional
+from sqlalchemy import DateTime
+from sqlalchemy.sql import func
 import uuid
 
 
@@ -11,3 +13,46 @@ class Organisation(SQLModel, table=True):
     updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
     hubspot_company_id: Optional[str] = None
     name: Optional[str] = None
+
+
+class HubspotDealData(SQLModel, table=True):
+    __tablename__ = "hubspot_deal_data"
+
+    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
+
+    # HubSpot Deal identifiers
+    deal_id: str = Field(index=True, nullable=False)
+    dealname: Optional[str] = Field(default=None)
+    dealstage: Optional[str] = Field(default=None)
+    company_id: Optional[str] = Field(default=None)
+    project_code: Optional[str] = Field(default=None)
+
+    # HubSpot custom properties
+    landlord_property_id: Optional[str] = Field(default=None)
+    uprn: Optional[str] = Field(default=None)
+    outcome: Optional[str] = Field(default=None)
+    outcome_notes: Optional[str] = Field(default=None)
+
+    major_condition_issue_description: Optional[str] = Field(default=None)
+    major_condition_issue_photos: Optional[str] = Field(default=None)
+    major_condition_issue_evidence_s3_url: Optional[str] = Field(default=None)
+
+    coordination_status: Optional[str] = Field(default=None)
+    design_status: Optional[str] = Field(default=None)
+    # DB-side timestamps: now() is already timestamptz, so no AT TIME ZONE cast.
+    created_at: datetime = Field(
+        sa_column=Column(
+            DateTime(timezone=True),
+            server_default=func.now(),
+            nullable=False,
+        )
+    )
+
+    updated_at: datetime = Field(
+        sa_column=Column(
+            DateTime(timezone=True),
+            server_default=func.now(),
+            onupdate=func.now(),
+            nullable=False,
+        )
+    )
diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py
index 8fe61a3e..4ed579e9 100644
--- a/etl/hubspot/hubspotDataTodB.py
+++ b/etl/hubspot/hubspotDataTodB.py
@@ -1,8 +1,10 @@
 from backend.app.db.connection import db_read_session
-from backend.app.db.models.organisation import Organisation
+from backend.app.db.models.organisation import Organisation, HubspotDealData
 from sqlmodel import select
 from datetime import datetime, timezone
 from typing import TypedDict
+from etl.hubspot.s3_uploader import S3Uploader
+import hashlib
 
 
 class CompanyData(TypedDict):
@@ -12,7 +14,7 @@ class CompanyData(TypedDict):
 
 class HubspotDataToDb:
     def __init__(self):
-        pass
+        self.s3 = S3Uploader()
 
     def read_org_table(self, limit: int = 10):
         with db_read_session() as session:
@@ -53,3 +55,287 @@ class HubspotDataToDb:
 
             session.commit()
             return record
+
+    ###
+    # Check from here
+    ###
+
+    def new_record_to_hubspot_data(self, deal_data, company, listing, hubspot_client):
+        print("⚠️ Deprecated — use the new interface instead.")
+        return self.upsert_hubspot_deal(deal_data, company, listing, hubspot_client)
+
+    def find_all_deals_with_company_id(self, company_id):
+        """Returns a list of deals for a given company_id."""
+        with db_read_session() as session:
+            return (
+                session.query(HubspotDealData)
+                .filter(HubspotDealData.company_id == company_id)
+                .all()
+            )
+
+    def find_deal_with_deal_id(self, deal_id):
+        with db_read_session() as session:
+            return (
+                session.query(HubspotDealData)
+                .filter(HubspotDealData.deal_id == deal_id)
+                .one_or_none()
+            )
+
+    def _sha256(self, file_path: str) -> str:
+        """Compute SHA-256 checksum of a file."""
+        sha256 = hashlib.sha256()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(8192), b""):
+                sha256.update(chunk)
+        return sha256.hexdigest()
+
+    def update_deal(self, deal_in_db, hubspot_client):
+        """
+        Checks if a deal needs updating and syncs it with HubSpot.
+        Also handles major_condition_issue_photos file upload to S3 with integrity check.
+        """
+
+        def soft_assert(condition, message="Assertion Failed"):
+            if not condition:
+                print(f"⚠️ Soft Assert Failed: {message}")
+                return False
+            return True
+
+        print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})")
+
+        hs_deal, hs_company_id, hs_listing = hubspot_client.get_deal_info_for_db(
+            deal_in_db.deal_id
+        )
+
+        # Soft compare key fields
+        checks = [
+            soft_assert(
+                deal_in_db.deal_id == hs_deal.get("hs_object_id"), "deal_id mismatch"
+            ),
+            soft_assert(deal_in_db.company_id == hs_company_id, "company_id mismatch"),
+            soft_assert(
+                deal_in_db.landlord_property_id == hs_listing.get("owner_property_id"),
+                "landlord_property_id mismatch",
+            ),
+            soft_assert(
+                deal_in_db.outcome == hs_deal.get("outcome"), "outcome mismatch"
+            ),
+            soft_assert(
+                deal_in_db.dealstage == hs_deal.get("dealstage"), "dealstage mismatch"
+            ),
+            soft_assert(
+                deal_in_db.dealname == hs_deal.get("dealname"), "dealname mismatch"
+            ),
+            soft_assert(
+                deal_in_db.project_code == hs_deal.get("project_code"),
+                "project_code mismatch",
+            ),
+            soft_assert(
+                deal_in_db.uprn == hs_listing.get("national_uprn"), "uprn mismatch"
+            ),
+            soft_assert(
+                deal_in_db.outcome_notes == hs_deal.get("outcome_notes"),
+                "outcome_notes mismatch",
+            ),
+            soft_assert(
+                deal_in_db.major_condition_issue_description
+                == hs_deal.get("major_condition_issue_description"),
+                "major condition description mismatch",
+            ),
+            soft_assert(
+                deal_in_db.major_condition_issue_photos
+                == hs_deal.get("major_condition_issue_photos"),
+                "major condition issue photos mismatch",
+            ),
+            soft_assert(
+                deal_in_db.coordination_status
+                == hs_deal.get("coordination_status__stage_1_"),
+                "coordination stage 1 status mismatch",
+            ),
+            soft_assert(
+                deal_in_db.design_status == hs_deal.get("retrofit_design_status"),
+                "retrofit design mismatch",
+            ),
+        ]
+
+        # If discrepancies found, update from HubSpot
+        if not all(checks):
+            print(
+                f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot."
+            )
+            self.upsert_hubspot_deal(hs_deal, hs_company_id, hs_listing, hubspot_client)
+            return False
+
+        # Handle photo upload if it exists but S3 URL is missing
+        if (
+            deal_in_db.major_condition_issue_photos
+            and not deal_in_db.major_condition_issue_evidence_s3_url
+        ):
+            print(
+                f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..."
+            )
+
+            photo_url = hs_deal.get("major_condition_issue_photos")
+            if photo_url:
+                try:
+                    # Download from HubSpot using fresh URL from hs_deal (not stale DB URL)
+                    local_file = hubspot_client.download_file_from_url(photo_url)
+
+                    # Upload to S3
+                    bucket = "retrofit-data-dev"
+                    s3_url = self.s3.upload_file(
+                        local_file, bucket, prefix="hubspot/awaabs_law_evidence/"
+                    )
+
+                    # Download again to verify integrity
+                    downloaded = self.s3.download_from_url(s3_url)
+                    if self._sha256(local_file) == self._sha256(downloaded):
+                        print("✅ SHA256 match verified — upload successful.")
+                    else:
+                        print("❌ SHA256 mismatch — integrity check failed.")
+                        raise ValueError("File integrity check failed after S3 upload.")
+
+                    # Update DB record with S3 URL
+                    with db_read_session() as session:
+                        db_record = session.get(HubspotDealData, deal_in_db.id)
+                        db_record.major_condition_issue_evidence_s3_url = s3_url
+                        session.add(db_record)
+                        session.commit()
+                        print(
+                            f"✅ Updated DB with S3 URL for deal_id={deal_in_db.deal_id}"
+                        )
+                    return False
+                except Exception as e:
+                    print(
+                        f"⚠️ Failed to download/upload photo for deal_id {deal_in_db.deal_id}: {e}"
+                    )
+                    # Continue without the file — don't crash the entire update
+            else:
+                print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}")
+
+        else:
+            print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.")
+
+        return True
+
+    def upsert_hubspot_deal(self, deal_data, company, listing, hubspot_client):
+        """
+        Inserts or updates a deal record.
+        Also uploads photos if present and adds S3 URL.
+        """
+        with db_read_session() as session:
+            deal_id = deal_data.get("hs_object_id")
+
+            statement = select(HubspotDealData).where(
+                HubspotDealData.deal_id == deal_id
+            )
+            existing = session.exec(statement).first()
+
+            if existing:
+                print(f"🔄 Updating existing deal (deal_id={deal_id})")
+
+                for attr, value in {
+                    "dealname": deal_data.get("dealname"),
+                    "dealstage": deal_data.get("dealstage"),
+                    "landlord_property_id": listing.get("owner_property_id"),
+                    "uprn": listing.get("national_uprn"),
+                    "outcome": deal_data.get("outcome"),
+                    "outcome_notes": deal_data.get("outcome_notes"),
+                    "project_code": deal_data.get("project_code"),
+                    "company_id": company,
+                    "major_condition_issue_description": deal_data.get(
+                        "major_condition_issue_description"
+                    ),
+                    "major_condition_issue_photos": deal_data.get(
+                        "major_condition_issue_photos"
+                    ),
+                    "major_condition_issue_description": deal_data.get(
+                        "major_condition_issue_description"
+                    ),
+                    "major_condition_issue_photos": deal_data.get(
+                        "major_condition_issue_photos"
+                    ),
+                    "coordination_status": deal_data.get(
+                        "coordination_status__stage_1_"
+                    ),
+                    "design_status": deal_data.get("retrofit_design_status"),
+                }.items():
+                    setattr(existing, attr, value or getattr(existing, attr))
+
+                # Upload if photo exists but S3 link missing
+                if (
+                    existing.major_condition_issue_photos
+                    and not existing.major_condition_issue_evidence_s3_url
+                ):
+                    # Fetch fresh URL from HubSpot instead of using potentially expired stored URL
+                    fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id)
+                    photo_url = fresh_deal.get("major_condition_issue_photos")
+
+                    if photo_url:
+                        try:
+                            local_file = hubspot_client.download_file_from_url(
+                                photo_url
+                            )
+                            s3_url = self.s3.upload_file(
+                                local_file,
+                                "retrofit-data-dev",
+                                prefix="hubspot/awaabs_law_evidence/",
+                            )
+                            existing.major_condition_issue_evidence_s3_url = s3_url
+                        except Exception as e:
+                            print(
+                                f"⚠️ Failed to download photo for deal_id {existing.deal_id}: {e}"
+                            )
+                            # Continue without the file — don't crash the update
+                    else:
+                        print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}")
+
+                session.add(existing)
+                session.commit()
+                session.refresh(existing)
+                return existing
+
+            else:
+                print(f"🆕 Inserting new deal (deal_id={deal_id})")
+                new_record = HubspotDealData(
+                    deal_id=deal_id,
+                    dealname=deal_data.get("dealname"),
+                    dealstage=deal_data.get("dealstage"),
+                    landlord_property_id=listing.get("owner_property_id"),
+                    uprn=listing.get("national_uprn"),
+                    outcome=deal_data.get("outcome"),
+                    outcome_notes=deal_data.get("outcome_notes"),
+                    project_code=deal_data.get("project_code"),
+                    company_id=company,
+                    major_condition_issue_description=deal_data.get(
+                        "major_condition_issue_description"
+                    ),
+                    major_condition_issue_photos=deal_data.get(
+                        "major_condition_issue_photos"
+                    ),
+                    coordination_status=deal_data.get("coordination_status__stage_1_"),
+                    design_status=deal_data.get("retrofit_design_status"),
+                )
+
+                # Handle upload at insert time
+                if new_record.major_condition_issue_photos:
+                    try:
+                        local_file = hubspot_client.download_file_from_url(
+                            new_record.major_condition_issue_photos
+                        )
+                        s3_url = self.s3.upload_file(
+                            local_file,
+                            "retrofit-data-dev",
+                            prefix="hubspot/awaabs_law_evidence/",
+                        )
+                        new_record.major_condition_issue_evidence_s3_url = s3_url
+                    except Exception as e:
+                        print(
+                            f"⚠️ Failed to download photo for deal_id {new_record.deal_id}: {e}"
+                        )
+                        # Continue without the file — don't crash the insert
+
+                session.add(new_record)
+                session.commit()
+                session.refresh(new_record)
+                return new_record
diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt
index ef8e3ebc..44a58f77 100644
--- a/etl/hubspot/requirements.txt
+++ b/etl/hubspot/requirements.txt
@@ -1 +1 @@
-hubspot-api-client
\ No newline at end of file
+hubspot-api-client
diff --git a/etl/hubspot/s3_uploader.py b/etl/hubspot/s3_uploader.py
new file mode 100644
index 00000000..0d217bd2
--- /dev/null
+++ b/etl/hubspot/s3_uploader.py
@@ -0,0 +1,116 @@
+import os
+import boto3
+from botocore.exceptions import ClientError
+from urllib.parse import urlparse
+from datetime import datetime
+import requests
+
+
+class S3Uploader:
+    """
+    Simple helper to upload local files to S3 and return their S3 HTTPS URI.
+    """
+
+    def __init__(
+        self,
+        aws_access_key: str | None = None,
+        aws_secret_key: str | None = None,
+        region: str = "eu-west-2",
+    ):
+        self.aws_access_key = aws_access_key
+        self.aws_secret_key = aws_secret_key
+        self.region = region
+        # No embedded secrets: None lets boto3 use its default credential chain.
+        self.s3 = boto3.client(
+            "s3",
+            aws_access_key_id=self.aws_access_key,
+            aws_secret_access_key=self.aws_secret_key,
+            region_name=self.region,
+        )
+
+    def upload_file(self, file_path: str, bucket: str, prefix: str = "uploads/") -> str:
+        """
+        Upload a local file to an S3 bucket and return its HTTPS URI.
+
+        Args:
+            file_path (str): Path to the local file.
+            bucket (str): S3 bucket name.
+            prefix (str): Folder/prefix in the bucket.
+
+        Returns:
+            str: HTTPS-style S3 URI (not signed).
+        """
+        try:
+            filename = os.path.basename(file_path)
+            timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+            s3_key = os.path.join(prefix, f"{timestamp}_{filename}")
+
+            self.s3.upload_file(file_path, bucket, s3_key)
+
+            s3_uri = f"https://{bucket}.s3.{self.region}.amazonaws.com/{s3_key}"
+            return s3_uri
+
+        except ClientError as e:
+            raise RuntimeError(f"❌ S3 upload failed: {e}")
+
+    def print_bucket(self):
+        print(self.s3.head_bucket(Bucket="retrofit-data-dev"))
+
+    def generate_presigned_url(
+        self, bucket: str, key: str, expires_in: int = 3600
+    ) -> str:
+        """
+        Generate a temporary presigned URL for an S3 object.
+        """
+        try:
+            return self.s3.generate_presigned_url(
+                "get_object",
+                Params={"Bucket": bucket, "Key": key},
+                ExpiresIn=expires_in,
+            )
+        except ClientError as e:
+            raise RuntimeError(f"❌ Failed to generate signed URL: {e}")
+
+    def download_from_url(
+        self, s3_url: str, local_dir: str = ".", expires_in: int = 3600
+    ) -> str:
+        """
+        Download a file from a public or private S3 URL.
+        A presigned URL is always generated, so private objects work too.
+
+        Args:
+            s3_url (str): Full S3 HTTPS URL (e.g., https://bucket.s3.region.amazonaws.com/path/file.txt)
+            local_dir (str): Folder to save the file in.
+            expires_in (int): Presigned URL lifetime (seconds).
+
+        Returns:
+            str: Local file path of the downloaded file.
+        """
+        parsed = urlparse(s3_url)
+        host_parts = parsed.netloc.split(".")
+        if len(host_parts) < 3 or host_parts[1] != "s3":
+            raise ValueError("❌ Not a valid S3 HTTPS URL")
+
+        bucket = host_parts[0]
+        key = parsed.path.lstrip("/")
+
+        # Generate presigned URL (whether public or private)
+        presigned_url = self.generate_presigned_url(bucket, key, expires_in)
+
+        filename = os.path.basename(key)
+        local_path = os.path.join(local_dir, filename)
+        # Bounded timeout so a stalled S3 transfer cannot hang the caller forever.
+        try:
+            response = requests.get(presigned_url, stream=True, timeout=30)
+            response.raise_for_status()
+
+            os.makedirs(local_dir, exist_ok=True)
+            with open(local_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            print(f"✅ Downloaded: {local_path}")
+            return local_path
+
+        except requests.exceptions.RequestException as e:
+            raise RuntimeError(f"❌ Failed to download file: {e}")
diff --git a/etl/hubspot/scripts/scraper/README.md b/etl/hubspot/scripts/scraper/README.md
new file mode 100644
index 00000000..2d7fe975
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/README.md
@@ -0,0 +1,15 @@
+Input:
+
+<Hubspot Deal ID>
+
+
+Function:
+
+<Add hubspot deal/update to hubspot_deal_data>
+
+
+Used in:
+
+When changes are made in HubSpot, they trigger a workflow in Make.
+
+That workflow in turn triggers the SQS queue, which is being built from this directory.
\ No newline at end of file
diff --git a/etl/hubspot/scripts/scraper/__init__.py b/etl/hubspot/scripts/scraper/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/etl/hubspot/scripts/scraper/handler/Dockerfile b/etl/hubspot/scripts/scraper/handler/Dockerfile
new file mode 100644
index 00000000..bbcc3e22
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/handler/Dockerfile
@@ -0,0 +1,38 @@
+FROM public.ecr.aws/lambda/python:3.10
+# FROM python:3.11.10-bullseye
+
+
+ARG DEV_DB_HOST
+ARG DEV_DB_PORT
+ARG DEV_DB_NAME
+
+ENV DB_HOST=${DEV_DB_HOST}
+ENV DB_PORT=${DEV_DB_PORT}
+ENV DB_NAME=${DEV_DB_NAME}
+
+
+# Set working directory (Lambda task root)
+WORKDIR /var/task
+
+# -----------------------------
+# Copy requirements FIRST (for Docker layer caching)
+# -----------------------------
+COPY etl/hubspot/scripts/scraper/handler/requirements.txt .
+
+# Install dependencies into Lambda runtime
+RUN pip install --no-cache-dir -r requirements.txt
+
+
+# Copy necessary files for database and utility imports
+COPY backend/ backend/
+COPY utils/ utils/
+COPY datatypes/ datatypes/
+COPY etl/hubspot etl/hubspot
+
+# Copy the handler
+COPY etl/hubspot/scripts/scraper/main.py .
+
+# -----------------------------
+# Lambda handler
+# -----------------------------
+CMD ["main.handler"]
\ No newline at end of file
diff --git a/etl/hubspot/scripts/scraper/handler/requirements.txt b/etl/hubspot/scripts/scraper/handler/requirements.txt
new file mode 100644
index 00000000..230b460e
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/handler/requirements.txt
@@ -0,0 +1,12 @@
+pandas==2.2.2
+numpy<2.0
+requests
+tqdm
+openpyxl
+epc-api-python==1.0.2
+boto3==1.35.44
+sqlmodel
+sqlalchemy==2.0.36
+psycopg2-binary==2.9.10
+pydantic-settings==2.6.0
+hubspot-api-client
\ No newline at end of file
diff --git a/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml b/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml
new file mode 100644
index 00000000..77679650
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/local_handler/docker-compose.yml
@@ -0,0 +1,11 @@
+version: "3.9"
+
+services:
+  hubspot-scraper:
+    build:
+      context: ../../../../../
+      dockerfile: etl/hubspot/scripts/scraper/handler/Dockerfile
+    ports:
+      - "9000:8080"
+    env_file:
+      - ../../../../../.env
\ No newline at end of file
diff --git a/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py b/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py
new file mode 100644
index 00000000..69580a93
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/local_handler/invoke_local_lambda.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import json
+import requests
+
+HOST = "localhost"
+PORT = "9000"
+
+LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations"
+
+payload = {
+    "Records": [
+        {
+            "body": json.dumps(
+                {
+                    "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
+                    "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
+                    "hubspot_deal_id": "254427203793",
+                }
+            )
+        }
+    ]
+}
+
+response = requests.post(LAMBDA_URL, json=payload)
+
+print("Status code:", response.status_code)
+print("Response:")
+print(response.text)
diff --git a/etl/hubspot/scripts/scraper/local_handler/run_local.sh b/etl/hubspot/scripts/scraper/local_handler/run_local.sh
new file mode 100644
index 00000000..17474bdb
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/local_handler/run_local.sh
@@ -0,0 +1,2 @@
+docker compose build --no-cache
+docker compose up --force-recreate
diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
new file mode 100644
index 00000000..a51cd4a4
--- /dev/null
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -0,0 +1,45 @@
+"""
+TODO:
+
+1) [completed]Get hubspot deal properties from one deal
+2) Put it in some class
+3) [completed] Load the db and check if upsert it into the table
+4) Getting working on a AWS lambda
+5) [completed] subtask and tasks history
+6) The new sexy deal properties, move it over
+"""
+
+from backend.utils.subtasks import subtask_handler
+from etl.hubspot.hubspotClient import HubspotClient
+from etl.hubspot.hubspotDataTodB import HubspotDataToDb
+from typing import Any
+
+
+@subtask_handler()
+def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
+    if local is True:
+        body = {
+            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
+            "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
+            "hubspot_deal_id": "254427203793",
+        }
+
+    hubspot_deal_id = body.get("hubspot_deal_id", "")
+
+    if hubspot_deal_id == "":
+        raise RuntimeError(
+            "Missing Hubspot Deal ID in SQS body request, 'hubspot_deal_id'"
+        )
+
+    hubspot = HubspotClient()
+    dbloader = HubspotDataToDb()
+
+    deal = dbloader.find_deal_with_deal_id(hubspot_deal_id)
+
+    if deal:
+        dbloader.update_deal(deal, hubspot)
+    else:
+        deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id)
+        dbloader.upsert_hubspot_deal(deal, company, listing, hubspot)
+
+    print("Finsihed running")

From 1abc53f3e3156a5da53b00adafe6a6fd67072b2d Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 14:42:11 +0000
Subject: [PATCH 26/47] removed hashlib as its from the standard library

---
 etl/hubspot/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/hubspot/requirements.txt b/etl/hubspot/requirements.txt
index 44a58f77..ef8e3ebc 100644
--- a/etl/hubspot/requirements.txt
+++ b/etl/hubspot/requirements.txt
@@ -1 +1 @@
-hubspot-api-client
+hubspot-api-client
\ No newline at end of file

From d6f9b4879709a3867caf3a8eb466fc41d0e8f05c Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 15:44:52 +0000
Subject: [PATCH 27/47] deploy hubspot etl registry

---
 infrastructure/terraform/shared/main.tf | 31 +++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf
index 486f79ca..5a396b3a 100644
--- a/infrastructure/terraform/shared/main.tf
+++ b/infrastructure/terraform/shared/main.tf
@@ -574,3 +574,34 @@ output "cdn_certificate_state_bucket" {
   value = module.cdn_certificate_state_bucket.bucket_name
 }
 
+
+################################################
+# Hubspot ETL Lambda
+################################################
+module "hubspot_etl_bucket" {
+  source      = "../modules/tf_state_bucket"
+  bucket_name = "hubspot-etl-bucket-terraform-state"
+
+}
+
+module "hubspot_etl_registry" {
+  source = "../modules/container_registry"
+  name   = "hubspot_etl"
+  stage = var.stage
+
+}
+
+# S3 policy for the Hubspot ETL Lambda to read from and write to the retrofit data bucket
+module "hubspot_etl_s3_read_and_write" {
+  source = "../modules/s3_iam_policy"
+
+  policy_name        = "HubspotETLReadandWriteS3"
+  policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket"
+  bucket_arns        = ["arn:aws:s3:::retrofit-data-${var.stage}"]
+  actions            = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"]
+  resource_paths     = ["/*"]
+}
+
+output "ordnance_s3_read_and_write_arn" {
+  value = module.hubspot_etl_s3_read_and_write.policy_arn
+}

From 764ee81dad1447be2008ecdad77283412cd0026c Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 15:52:28 +0000
Subject: [PATCH 28/47] hubspot etl

---
 .github/workflows/deploy_terraform.yml  | 42 ++++++++++++++++++++++++-
 infrastructure/terraform/shared/main.tf |  2 +-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index e41534e6..500b2435 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -433,4 +433,44 @@ jobs:
       - name: Terraform Apply
         if: env.TERRAFORM_APPLY == 'true'
         working-directory: infrastructure/terraform/cdn
-        run: terraform apply -auto-approve tfplan
\ No newline at end of file
+        run: terraform apply -auto-approve tfplan
+
+  # ============================================================
+  # Build Hubspot ETL  image
+  # ============================================================
+  hubspot_etl_image:
+    needs: [determine_stage, shared_terraform]
+    uses: ./.github/workflows/_build_image.yml
+    with:
+      ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }}
+      dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile
+      build_context: .
+      build_args: |
+        DEV_DB_HOST=$DEV_DB_HOST
+        DEV_DB_PORT=$DEV_DB_PORT
+        DEV_DB_NAME=$DEV_DB_NAME
+    secrets:
+      AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
+      AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
+      DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
+      DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
+      DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
+
+  # ============================================================
+  # Deploy OrdanceSurvey Lambda
+  # ============================================================
+  ordnanceSurvey_lambda:
+    needs: [hubspot_etl_image, determine_stage]
+    uses: ./.github/workflows/_deploy_lambda.yml
+    with:
+      lambda_name: hubspotETLtoAraDb
+      lambda_path: infrastructure/terraform/lambda/hubspot_deal_etl
+      stage: ${{ needs.determine_stage.outputs.stage }}
+      ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }}
+      image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }}
+      terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
+    secrets:
+      AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
+      AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf
index 5a396b3a..25c40b7a 100644
--- a/infrastructure/terraform/shared/main.tf
+++ b/infrastructure/terraform/shared/main.tf
@@ -586,7 +586,7 @@ module "hubspot_etl_bucket" {
 
 module "hubspot_etl_registry" {
   source = "../modules/container_registry"
-  name   = "hubspot_etl"
+  name   = "hubspot-etl"
   stage = var.stage
 
 }

From 3ebe04423f10c5930d6a7cfd696613c3c3bf9eac Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 15:53:30 +0000
Subject: [PATCH 29/47] deployed

---
 etl/hubspot/scripts/scraper/main.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
index a51cd4a4..f862948b 100644
--- a/etl/hubspot/scripts/scraper/main.py
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -1,12 +1,10 @@
 """
-TODO:
-
 1) [completed]Get hubspot deal properties from one deal
 2) Put it in some class
 3) [completed] Load the db and check if upsert it into the table
-4) Getting working on a AWS lambda
+4) [completed]Getting working on a AWS lambda
 5) [completed] subtask and tasks history
-6) The new sexy deal properties, move it over
+6) [TODO]The new sexy deal properties, move it over
 """
 
 from backend.utils.subtasks import subtask_handler

From a249ba13748c293212362d0b6def15e6ca9e3ac0 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 17:15:31 +0000
Subject: [PATCH 30/47] got rid of tox

---
 .github/workflows/unit_tests.yml | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index f09988b0..0b0b68ea 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -7,7 +7,8 @@ on:
 
 
 jobs:
-  test:
+  test-docker:
+    name: Tests (Docker)
     runs-on: ubuntu-latest
 
     steps:
@@ -18,14 +19,8 @@ jobs:
         run: docker build -f Dockerfile.test -t model-test .
 
       - name: Run tests
-        env:
-          EPC_AUTH_TOKEN: ${{ secrets.DEV_EPC_AUTH_TOKEN }}
-          HUBSPOT_API_KEY: ${{ secrets.HUBSPOT_API_KEY }}
-          
         run: |
-          # docker run --rm \
-          #   -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
-          #   -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
-          #   model-test pytest -m 'not integration'
-
-          make test ARGS="-m 'not integration'"
+          docker run --rm \
+            -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
+            -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
+            model-test pytest -m 'not integration'

From 56fe3a1be00e3fdc4d435bf9acf038d056818c2f Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Mon, 30 Mar 2026 17:22:54 +0000
Subject: [PATCH 31/47] get rid of parser as it doesn't work

---
 backend/scripts/combine_address2uprn_outputs.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/backend/scripts/combine_address2uprn_outputs.py b/backend/scripts/combine_address2uprn_outputs.py
index f065c676..105b8639 100644
--- a/backend/scripts/combine_address2uprn_outputs.py
+++ b/backend/scripts/combine_address2uprn_outputs.py
@@ -53,13 +53,3 @@ def main(task_id, output):
 
     print(f"Combined CSV saved to {output}")
     print(f"Total rows: {len(combined)}")
-
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser()
-#     parser.add_argument("task_id", help="Task ID folder in S3")
-#     parser.add_argument("--output", default="combined.csv")
-
-#     args = parser.parse_args()
-
-#     main(args.task_id, args.output)

From be09749c0a23ba6abd1680170cc9547d3232b2a2 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 09:02:18 +0000
Subject: [PATCH 32/47] got company updates to work

---
 etl/hubspot/hubspotClient.py                       |  9 ++++++++-
 etl/hubspot/hubspotDataTodB.py                     |  2 +-
 etl/hubspot/scripts/onboarding/new_organisation.py | 10 ++++++++++
 etl/hubspot/scripts/scraper/main.py                |  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py
index 6fd11bed..8bbe8a63 100644
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@@ -25,7 +25,7 @@ from hubspot.crm.associations.v4.models import (  # type: ignore[reportMissingTy
     ForwardPaging as AssociationsPaging,
     NextPage as AssociationsPagingNext,
 )
-from etl.hubspot.hubspotDataTodB import CompanyData
+from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb
 
 
 from backend.app.config import get_settings
@@ -217,8 +217,15 @@ class HubspotClient:
     def get_deal_info_for_db(
         self, deal_id: str
     ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]:
+
         deal: dict[str, str] = self.from_deal_id_get_info(deal_id)
         company: Optional[str] = self.from_deal_id_get_associated_company_id(deal_id)
+
+        if company:
+            company_data: CompanyData = self.get_company_information(company)
+            dbloader: HubspotDataToDb = HubspotDataToDb()
+            dbloader.upsert_company(company_data)
+
         listing: Optional[dict[str, str]] = self.from_deal_id_get_associated_listing(
             deal_id
         )
diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py
index fb88422c..58da4036 100644
--- a/etl/hubspot/hubspotDataTodB.py
+++ b/etl/hubspot/hubspotDataTodB.py
@@ -90,7 +90,7 @@ class HubspotDataToDb:
                 sha256.update(chunk)
         return sha256.hexdigest()
 
-    def update_deal(self, deal_in_db, hubspot_client):
+    def update_deal(self, deal_in_db, hubspot_client) -> bool:
         """
         Checks if a deal needs updating and syncs it with HubSpot.
         Also handles major_condition_issue_photos file upload to S3 with integrity check.
diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py
index f2ff8bda..f8c6ba7a 100644
--- a/etl/hubspot/scripts/onboarding/new_organisation.py
+++ b/etl/hubspot/scripts/onboarding/new_organisation.py
@@ -1,3 +1,13 @@
+"""
+README.md
+
+This is a simple script to showcase how a new organisation can be
+added to AraDb.
+
+This has been made redundant because the same process now runs
+whenever HubSpot fires a webhook
+"""
+
 from etl.hubspot.hubspotClient import HubspotClient, Companies
 
 from etl.hubspot.hubspotDataTodB import HubspotDataToDb, CompanyData
diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
index f862948b..aa9a9502 100644
--- a/etl/hubspot/scripts/scraper/main.py
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -29,8 +29,8 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
             "Missing Hubspot Deal ID in SQS body request, 'hubspot_deal_id'"
         )
 
-    hubspot = HubspotClient()
-    dbloader = HubspotDataToDb()
+    hubspot: HubspotClient = HubspotClient()
+    dbloader: HubspotDataToDb = HubspotDataToDb()
 
     deal = dbloader.find_deal_with_deal_id(hubspot_deal_id)
 

From b928689c79643fe86dda1a0870c455c93321b190 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:35:43 +0000
Subject: [PATCH 33/47] add db properly

---
 .../scripts/scraper/handler/Dockerfile        | 10 -----
 .../terraform/lambda/hubspot_deal_etl/main.tf | 44 +++++++++++++++++++
 .../lambda/hubspot_deal_etl/provider.tf       | 16 +++++++
 .../lambda/hubspot_deal_etl/variables.tf      | 37 ++++++++++++++++
 infrastructure/terraform/shared/main.tf       |  2 +-
 5 files changed, 98 insertions(+), 11 deletions(-)
 create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
 create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
 create mode 100644 infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf

diff --git a/etl/hubspot/scripts/scraper/handler/Dockerfile b/etl/hubspot/scripts/scraper/handler/Dockerfile
index bbcc3e22..012da376 100644
--- a/etl/hubspot/scripts/scraper/handler/Dockerfile
+++ b/etl/hubspot/scripts/scraper/handler/Dockerfile
@@ -1,16 +1,6 @@
 FROM public.ecr.aws/lambda/python:3.10
 # FROM python:3.11.10-bullseye
 
-
-ARG DEV_DB_HOST
-ARG DEV_DB_PORT
-ARG DEV_DB_NAME
-
-ENV DB_HOST=${DEV_DB_HOST}
-ENV DB_PORT=${DEV_DB_PORT}
-ENV DB_NAME=${DEV_DB_NAME}
-
-
 # Set working directory (Lambda task root)
 WORKDIR /var/task
 
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
new file mode 100644
index 00000000..ec2b18e3
--- /dev/null
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
@@ -0,0 +1,44 @@
+data "terraform_remote_state" "shared" {
+  backend = "s3"
+  config = {
+    bucket = "assessment-model-terraform-state"
+    key = "env:/${var.stage}/terraform.tfstate"
+    region = "eu-west-2"
+  }
+}
+
+
+data "aws_secretsmanager_secret_version" "db_credentials" {
+  secret_id = "${var.stage}/assessment_model/db_credentials"
+}
+
+locals {
+  db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)
+}
+
+
+module "lambda" {
+  source = "../../modules/lambda_with_sqs"
+
+  name  = REPLACE ME #"address2uprn" for example
+  stage = var.stage
+
+  image_uri = local.image_uri
+
+  # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000)
+  maximum_concurrency = var.maximum_concurrency
+
+  batch_size = var.batch_size
+
+  environment = {
+    STAGE = var.stage
+    LOG_LEVEL = "info"
+    DB_USERNAME = local.db_credentials.db_assessment_model_username
+    DB_PASSWORD = local.db_credentials.db_assessment_model_password
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "lambda_s3_policy" {
+  role       = module.lambda.role_name
+  policy_arn = data.terraform_remote_state.shared.outputs.hubspot_etl_s3_read_and_write_arn
+}
\ No newline at end of file
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
new file mode 100644
index 00000000..3d66f392
--- /dev/null
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
@@ -0,0 +1,16 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+
+  backend "s3" {
+    bucket = REPLACE_ME
+    key    = "terraform.tfstate"
+    region = "eu-west-2"
+  }
+
+  required_version = ">= 1.2.0"
+}
\ No newline at end of file
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
new file mode 100644
index 00000000..e7646811
--- /dev/null
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
@@ -0,0 +1,37 @@
+variable "lambda_name" {
+  type        = string
+  description = "Logical name of the lambda (e.g. address2uprn)"
+}
+
+variable "stage" {
+  description = "Deployment stage (e.g. dev, prod)"
+  type        = string
+}
+variable "ecr_repo_url" {
+  type        = string
+  description = "ECR repository URL (no tag, no digest)"
+}
+
+variable "image_digest" {
+  type        = string
+  description = "Image digest (sha256:...)"
+}
+
+variable "maximum_concurrency" {
+  type        = number
+  default     = null
+  description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit."
+}
+
+variable "batch_size" {
+  type    = number
+  default = 1
+}
+
+locals {
+  image_uri = "${var.ecr_repo_url}@${var.image_digest}"
+}
+
+output "resolved_image_uri" {
+  value = local.image_uri
+}
diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf
index 8d645522..bc16dc70 100644
--- a/infrastructure/terraform/shared/main.tf
+++ b/infrastructure/terraform/shared/main.tf
@@ -670,6 +670,6 @@ module "hubspot_etl_s3_read_and_write" {
   resource_paths     = ["/*"]
 }
 
-output "ordnance_s3_read_and_write_arn" {
+output "hubspot_etl_s3_read_and_write_arn" {
   value = module.hubspot_etl_s3_read_and_write.policy_arn
 }
\ No newline at end of file

From ecba9264485ed19ba2983973322d18cbecd59a41 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:36:19 +0000
Subject: [PATCH 34/47] added db host and name

---
 infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
index ec2b18e3..effcada6 100644
--- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
@@ -17,7 +17,7 @@ locals {
 }
 
 
-module "lambda" {
+module "hubspot_deal_etl" {
   source = "../../modules/lambda_with_sqs"
 
   name  = REPLACE ME #"address2uprn" for example

From 0f9d031944874cf9ca75005a213f5e01ea4541ec Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:38:46 +0000
Subject: [PATCH 35/47] removed subtask handler as need to do that differently

---
 etl/hubspot/scripts/scraper/main.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
index aa9a9502..94342497 100644
--- a/etl/hubspot/scripts/scraper/main.py
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -7,18 +7,15 @@
 6) [TODO]The new sexy deal properties, move it over
 """
 
-from backend.utils.subtasks import subtask_handler
 from etl.hubspot.hubspotClient import HubspotClient
 from etl.hubspot.hubspotDataTodB import HubspotDataToDb
 from typing import Any
 
 
-@subtask_handler()
+# @subtask_handler() TODO: Do this without subtask_handler but task_handler() that creates task_id and subtask_id
 def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
     if local is True:
         body = {
-            "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917",
-            "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0",
             "hubspot_deal_id": "254427203793",
         }
 

From 5d6f4b3aead6f46a1c3ea21ab41fe0c9f4509e01 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:41:03 +0000
Subject: [PATCH 36/47] added checks

---
 etl/hubspot/hubspotDataTodB.py      | 8 ++++----
 etl/hubspot/scripts/scraper/main.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py
index 58da4036..f7f79e46 100644
--- a/etl/hubspot/hubspotDataTodB.py
+++ b/etl/hubspot/hubspotDataTodB.py
@@ -63,7 +63,7 @@ class HubspotDataToDb:
 
     def new_record_to_hubspot_data(self, deal_data, company, listing, hubspot_client):
         print("⚠️ Deprecated — use the new interface instead.")
-        return self.upsert_hubspot_deal(deal_data, company, listing, hubspot_client)
+        return self.upsert_deal(deal_data, company, listing, hubspot_client)
 
     def find_all_deals_with_company_id(self, company_id):
         """Returns a list of deals for a given company_id."""
@@ -90,7 +90,7 @@ class HubspotDataToDb:
                 sha256.update(chunk)
         return sha256.hexdigest()
 
-    def update_deal(self, deal_in_db, hubspot_client) -> bool:
+    def update_deal_with_checks(self, deal_in_db, hubspot_client) -> bool:
         """
         Checks if a deal needs updating and syncs it with HubSpot.
         Also handles major_condition_issue_photos file upload to S3 with integrity check.
@@ -164,7 +164,7 @@ class HubspotDataToDb:
             print(
                 f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot."
             )
-            self.upsert_hubspot_deal(hs_deal, hs_company_id, hs_listing, hubspot_client)
+            self.upsert_deal(hs_deal, hs_company_id, hs_listing, hubspot_client)
             return False
 
         # Handle photo upload if it exists but S3 URL is missing
@@ -219,7 +219,7 @@ class HubspotDataToDb:
 
         return True
 
-    def upsert_hubspot_deal(self, deal_data, company, listing, hubspot_client):
+    def upsert_deal(self, deal_data, company, listing, hubspot_client):
         """
         Inserts or updates a deal record.
         Also uploads photos if present and adds S3 URL.
diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
index 94342497..48864b22 100644
--- a/etl/hubspot/scripts/scraper/main.py
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -32,9 +32,9 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None:
     deal = dbloader.find_deal_with_deal_id(hubspot_deal_id)
 
     if deal:
-        dbloader.update_deal(deal, hubspot)
+        dbloader.update_deal_with_checks(deal, hubspot)
     else:
         deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id)
-        dbloader.upsert_hubspot_deal(deal, company, listing, hubspot)
+        dbloader.upsert_deal(deal, company, listing, hubspot)
 
     print("Finsihed running")

From 3ae78816a599871772ca26cb94309f9532e58dd7 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:42:02 +0000
Subject: [PATCH 37/47] removed keys

---
 etl/hubspot/s3_uploader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/etl/hubspot/s3_uploader.py b/etl/hubspot/s3_uploader.py
index 0d217bd2..f5cc0ec9 100644
--- a/etl/hubspot/s3_uploader.py
+++ b/etl/hubspot/s3_uploader.py
@@ -13,8 +13,8 @@ class S3Uploader:
 
     def __init__(
         self,
-        aws_access_key: str = "AKIAU5A36PPNK7RXX52V",
-        aws_secret_key: str = "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",
+        aws_access_key: str,
+        aws_secret_key: str,
         region: str = "eu-west-2",
     ):
         self.aws_access_key = aws_access_key

From 21fa5aad45438b3f1bcd2228308c3e1810f69e87 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:42:42 +0000
Subject: [PATCH 38/47] update policy description

---
 infrastructure/terraform/shared/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf
index bc16dc70..9d272eb6 100644
--- a/infrastructure/terraform/shared/main.tf
+++ b/infrastructure/terraform/shared/main.tf
@@ -664,7 +664,7 @@ module "hubspot_etl_s3_read_and_write" {
   source = "../modules/s3_iam_policy"
 
   policy_name        = "HubspotETLReadandWriteS3"
-  policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket"
+  policy_description = "Allow hubspot_etl_lambda Lambda to read and write from retrofit-data bucket"
   bucket_arns        = ["arn:aws:s3:::retrofit-data-${var.stage}"]
   actions            = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"]
   resource_paths     = ["/*"]

From 2ccb6ddbcf9d23e29df66b1b46c5b7d530d075de Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:45:32 +0000
Subject: [PATCH 39/47] revert back to main

---
 pyrightconfig.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyrightconfig.json b/pyrightconfig.json
index 18f578a5..d4e0e2a4 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -2,7 +2,7 @@
   "typeCheckingMode": "strict",
   "venvPath": "/Users/khalimconn-kowlessar/opt/anaconda3/envs/",
   "venv": "Fastapi-backend",
-"include": [
+  "include": [
     "."
   ]
 }
\ No newline at end of file

From 72bf64cd8e3635f4fd98424665d48ca193802982 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 10:56:00 +0000
Subject: [PATCH 40/47] verbose

---
 .github/workflows/unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 0b0b68ea..9f7ed83b 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -23,4 +23,4 @@ jobs:
           docker run --rm \
             -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
             -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
-            model-test pytest -m 'not integration'
+            model-test pytest -vv -m 'not integration'

From 1f66e1b17f86e20a68f8bcf73114ce49d0e6ef5e Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:00:19 +0000
Subject: [PATCH 41/47] db details

---
 .github/workflows/unit_tests.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 9f7ed83b..a6673c34 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -23,4 +23,7 @@ jobs:
           docker run --rm \
             -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
             -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
+            -e DB_HOST=${{ secrets.DEV_DB_HOST }} \
+            -e DB_NAME=${{ secrets.DEV_DB_NAME }} \
+            -e DB_PORT=${{ secrets.DEV_DB_PORT }} \
             model-test pytest -vv -m 'not integration'

From ba331d44dc44d70c30cf6028c74a23e99b61f568 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:06:57 +0000
Subject: [PATCH 42/47] db details

---
 .github/workflows/deploy_terraform.yml                     | 7 -------
 infrastructure/terraform/lambda/hubspot_deal_etl/main.tf   | 2 +-
 .../terraform/lambda/hubspot_deal_etl/provider.tf          | 2 +-
 3 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index 1208ee7b..fe95e3d6 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -496,17 +496,10 @@ jobs:
       ecr_repo: hubspot-etl-${{ needs.determine_stage.outputs.stage }}
       dockerfile_path: etl/hubspot/scripts/scraper/handler/Dockerfile
       build_context: .
-      build_args: |
-        DEV_DB_HOST=$DEV_DB_HOST
-        DEV_DB_PORT=$DEV_DB_PORT
-        DEV_DB_NAME=$DEV_DB_NAME
     secrets:
       AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
       AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
-      DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }}
-      DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }}
-      DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }}
 
   # ============================================================
   # Deploy Hubspot ETL Lambda
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
index effcada6..051c7154 100644
--- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf
@@ -20,7 +20,7 @@ locals {
 module "hubspot_deal_etl" {
   source = "../../modules/lambda_with_sqs"
 
-  name  = REPLACE ME #"address2uprn" for example
+  name  = "hubspot_deal_etl"
   stage = var.stage
 
   image_uri = local.image_uri
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
index 3d66f392..c8a3972c 100644
--- a/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/provider.tf
@@ -7,7 +7,7 @@ terraform {
   }
 
   backend "s3" {
-    bucket = REPLACE_ME
+    bucket = "hubspot-etl-bucket-terraform-state"
     key    = "terraform.tfstate"
     region = "eu-west-2"
   }

From b991ab73f7aad1fd05d7302f8f77905a1c56e707 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:19:16 +0000
Subject: [PATCH 43/47] add postgres

---
 .github/workflows/deploy_terraform.yml        |  3 +++
 .github/workflows/unit_tests.yml              | 24 ++++++++++++++++---
 .../terraform/lambda/_template/variables.tf   |  1 +
 .../lambda/hubspot_deal_etl/variables.tf      | 13 ++++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml
index fe95e3d6..cbcd88c4 100644
--- a/.github/workflows/deploy_terraform.yml
+++ b/.github/workflows/deploy_terraform.yml
@@ -515,6 +515,9 @@ jobs:
       image_digest: ${{ needs.hubspot_etl_image.outputs.image_digest }}
       terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }}
     secrets:
+      TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }}
+      TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }}
+      TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }}
       AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
       AWS_REGION: ${{ secrets.DEV_AWS_REGION }}
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index a6673c34..740f88f7 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -11,6 +11,21 @@ jobs:
     name: Tests (Docker)
     runs-on: ubuntu-latest
 
+    services:
+      postgres:
+        image: postgres:15
+        env:
+          POSTGRES_USER: test
+          POSTGRES_PASSWORD: test
+          POSTGRES_DB: test
+        ports:
+          - 5432:5432
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -21,9 +36,12 @@ jobs:
       - name: Run tests
         run: |
           docker run --rm \
+            --network host \
             -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \
             -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \
-            -e DB_HOST=${{ secrets.DEV_DB_HOST }} \
-            -e DB_NAME=${{ secrets.DEV_DB_NAME }} \
-            -e DB_PORT=${{ secrets.DEV_DB_PORT }} \
+            -e DB_HOST=localhost \
+            -e DB_NAME=test \
+            -e DB_USERNAME=test \
+            -e DB_PASSWORD=test \
+            -e DB_PORT=5432 \
             model-test pytest -vv -m 'not integration'
diff --git a/infrastructure/terraform/lambda/_template/variables.tf b/infrastructure/terraform/lambda/_template/variables.tf
index e7646811..ae588840 100644
--- a/infrastructure/terraform/lambda/_template/variables.tf
+++ b/infrastructure/terraform/lambda/_template/variables.tf
@@ -35,3 +35,4 @@ locals {
 output "resolved_image_uri" {
   value = local.image_uri
 }
+
diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
index e7646811..2e7da609 100644
--- a/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
+++ b/infrastructure/terraform/lambda/hubspot_deal_etl/variables.tf
@@ -35,3 +35,16 @@ locals {
 output "resolved_image_uri" {
   value = local.image_uri
 }
+
+
+variable "db_host" {
+  type = string
+}
+
+variable "db_name" {
+  type = string
+}
+
+variable "db_port" {
+  type = string
+}
\ No newline at end of file

From a946eb295921bb30bbbb31722d7c34620b6ec068 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:27:23 +0000
Subject: [PATCH 44/47] create SQLModel tables in export test engine fixture

---
 backend/export/tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/export/tests/conftest.py b/backend/export/tests/conftest.py
index 10bfa971..80344c5e 100644
--- a/backend/export/tests/conftest.py
+++ b/backend/export/tests/conftest.py
@@ -2,6 +2,8 @@ import pytest
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from backend.app.db.base import Base
+from sqlmodel import SQLModel
+import backend.app.db.models.organisation  # noqa: F401 — registers Organisation with SQLModel.metadata
 
 
 @pytest.fixture(scope="function")
@@ -25,12 +27,14 @@ def engine(postgresql):
 
     # Create tables once per test session
     Base.metadata.create_all(engine)
+    SQLModel.metadata.create_all(engine)
 
     # Yeild will split this function into two phase. 1) setup and 2) teardown, the latter of which will run after all
     # tests have completed
     yield engine
 
     # Clean-up after entire test session
+    SQLModel.metadata.drop_all(engine)
     Base.metadata.drop_all(engine)
     engine.dispose()
 

From f8736d3574707bb38b7c234a91faa41b35472ef8 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:42:20 +0000
Subject: [PATCH 45/47] add db schema init step to unit tests; update principal pitch export ids

---
 .github/workflows/unit_tests.yml     | 11 +++++++++++
 sfr/principal_pitch/2_export_data.py | 10 ++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 740f88f7..436428f9 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -33,6 +33,17 @@ jobs:
       - name: Build test image
         run: docker build -f Dockerfile.test -t model-test .
 
+      - name: Initialise database schema
+        run: |
+          docker run --rm \
+            --network host \
+            -e DB_HOST=localhost \
+            -e DB_NAME=test \
+            -e DB_USERNAME=test \
+            -e DB_PASSWORD=test \
+            -e DB_PORT=5432 \
+            model-test python scripts/init_db.py
+
       - name: Run tests
         run: |
           docker run --rm \
diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py
index df54749e..c89560cb 100644
--- a/sfr/principal_pitch/2_export_data.py
+++ b/sfr/principal_pitch/2_export_data.py
@@ -26,15 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials
 from collections import defaultdict
 from sqlalchemy import func
 
-# PORTFOLIO_ID = 206
-# SCENARIOS = [389]
-PORTFOLIO_ID = 633
-SCENARIOS = [1146]
+PORTFOLIO_ID = 639
+SCENARIOS = [1157]
 scenario_names = {
-    1146: "Most Economic",
+    1157: "EPC C - no EWI solid floor",
 }
 
-project_name = "WCHG EPC D rated properties"
+project_name = "Instagroup Sample"
 
 
 def get_data(portfolio_id, scenario_ids):

From c498dc19511a6289eb25cb216d0afd9342888cb8 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:45:59 +0000
Subject: [PATCH 46/47] add scripts/init_db.py to create SQLModel tables

---
 scripts/init_db.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 scripts/init_db.py

diff --git a/scripts/init_db.py b/scripts/init_db.py
new file mode 100644
index 00000000..69edf777
--- /dev/null
+++ b/scripts/init_db.py
@@ -0,0 +1,5 @@
+from sqlmodel import SQLModel
+import backend.app.db.models.organisation  # noqa: F401
+from backend.app.db.connection import db_engine
+
+SQLModel.metadata.create_all(db_engine)

From bba88bc077de2746e5b854aaa3a773d268c0e2fc Mon Sep 17 00:00:00 2001
From: Jun-te Kim <juntekim@googlemail.com>
Date: Tue, 31 Mar 2026 11:51:54 +0000
Subject: [PATCH 47/47] parenthesise utc server_default on HubspotDealData timestamps

---
 backend/app/db/models/organisation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py
index a3c79e3c..e8649cdd 100644
--- a/backend/app/db/models/organisation.py
+++ b/backend/app/db/models/organisation.py
@@ -43,7 +43,7 @@ class HubspotDealData(SQLModel, table=True):
     created_at: datetime = Field(
         sa_column=Column(
             DateTime(timezone=True),
-            server_default=text("NOW() AT TIME ZONE 'utc'"),
+            server_default=text("(NOW() AT TIME ZONE 'utc')"),
             nullable=False,
         )
     )
@@ -51,7 +51,7 @@ class HubspotDealData(SQLModel, table=True):
     updated_at: datetime = Field(
         sa_column=Column(
             DateTime(timezone=True),
-            server_default=text("NOW() AT TIME ZONE 'utc'"),
+            server_default=text("(NOW() AT TIME ZONE 'utc')"),
             onupdate=func.now(),
             nullable=False,
         )