diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile
index 6a1cc120..59aa0cb6 100644
--- a/.devcontainer/backend/Dockerfile
+++ b/.devcontainer/backend/Dockerfile
@@ -64,4 +64,13 @@ RUN apt install -y wget gnupg2 lsb-release
RUN echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
RUN wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
RUN apt update
-RUN apt install -y postgresql-14
\ No newline at end of file
+RUN apt install -y postgresql-14
+
+# Install Claude
+USER ${USER}
+RUN curl -fsSL https://claude.ai/install.sh | bash \
+ && export PATH="/home/${USER}/.local/bin:${PATH}" \
+ && claude plugin marketplace add JuliusBrussee/caveman \
+ && claude plugin install caveman@caveman
+ENV PATH="/home/vscode/.local/bin:${PATH}"
+USER root
\ No newline at end of file
diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py
index 71763790..a516a1df 100644
--- a/backend/app/db/models/uploaded_file.py
+++ b/backend/app/db/models/uploaded_file.py
@@ -16,6 +16,7 @@ class FileTypeEnum(enum.Enum):
PAS_2023_OCCUPANCY = "pas_2023_occupancy"
ECMK_SITE_NOTE = "ecmk_site_note"
ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note"
+ ECMK_SURVEY_XML = "ecmk_survey_xml"
class FileSourceEnum(enum.Enum):
@@ -37,6 +38,7 @@ class UploadedFile(Base):
landlord_property_id = Column(Text, nullable=True)
uprn = Column(BigInteger, nullable=True)
hubspot_listing_id = Column(BigInteger, nullable=True)
+ hubspot_deal_id = Column(Text, nullable=True)
file_type = Column(
SqlEnum(
diff --git a/backend/ecmk_fetcher/excel_writer.py b/backend/ecmk_fetcher/excel_writer.py
new file mode 100644
index 00000000..f290614b
--- /dev/null
+++ b/backend/ecmk_fetcher/excel_writer.py
@@ -0,0 +1,53 @@
+import os
+from typing import Any
+
+from openpyxl import Workbook, load_workbook
+from openpyxl.worksheet.worksheet import Worksheet
+
+
+def write_row(file_path: str, row_data: dict[str, Any]) -> None:
+ new_keys = list(row_data.keys())
+
+ if not os.path.exists(file_path):
+ wb = Workbook()
+ ws: Worksheet = wb.active # type: ignore[assignment]
+ ws.append(new_keys)
+ ws.append(list(row_data.values()))
+ wb.save(file_path)
+ return
+
+ wb = load_workbook(file_path)
+ ws = wb.active # type: ignore[assignment]
+
+ # Build a mutable header list and insert new columns using insert_cols so
+ # that existing row data shifts along with the headers.
+ # Filter out None to guard against blank columns in the source file.
+ headers: list[str] = [cell.value for cell in ws[1] if cell.value is not None] # type: ignore[misc]
+
+ for key in new_keys:
+ if key in headers:
+ continue
+
+ # Find the first key that comes after this one in new_keys that already
+ # exists in headers — insert before it to keep columns logically grouped.
+ insert_before: str | None = None
+ found = False
+ for k in new_keys:
+ if k == key:
+ found = True
+ continue
+ if found and k in headers:
+ insert_before = k
+ break
+
+ if insert_before is not None:
+ col_idx = headers.index(insert_before) + 1 # 1-based
+ ws.insert_cols(col_idx)
+ ws.cell(row=1, column=col_idx, value=key)
+ headers.insert(col_idx - 1, key)
+ else:
+ headers.append(key)
+ ws.cell(row=1, column=len(headers), value=key)
+
+ ws.append([row_data.get(col) for col in headers])
+ wb.save(file_path)
diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py
index 2f122080..4f8c24ea 100644
--- a/backend/ecmk_fetcher/processor.py
+++ b/backend/ecmk_fetcher/processor.py
@@ -26,13 +26,17 @@ from backend.ecmk_fetcher.browser import (
)
from backend.ecmk_fetcher.reports import (
REPORT_TYPES,
+ FileDownloadButtonType,
build_property_id,
map_report_type_to_db_file_type,
)
+from backend.ecmk_fetcher.excel_writer import write_row
from backend.ecmk_fetcher.upload import (
+ upload_excel_to_sharepoint,
upload_file_to_s3_and_update_db,
upload_file_to_sharepoint,
)
+from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap
from utils.logger import setup_logger
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
from utils.sharepoint.domna_sites import DomnaSites
@@ -59,6 +63,15 @@ def run_job() -> None:
)
sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments"
+ sharepoint_excel_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Modelling"
+
+ DIMENSIONS_FILENAME: str = "Dimensions.xlsx"
+ local_dimensions_path: str = os.path.join(BASE_DIR, DIMENSIONS_FILENAME)
+
+ sharepoint_client.download_file(
+ sharepoint_path=f"{sharepoint_excel_path}/{DIMENSIONS_FILENAME}",
+ local_path=local_dimensions_path,
+ )
s3_bucket: str = "retrofit-energy-assessments-dev"
@@ -141,15 +154,30 @@ def run_job() -> None:
)
try:
- upload_file_to_sharepoint(
- client=sharepoint_client,
- file_path=file_path,
- base_path=sharepoint_base_path,
- subpath=sharepoint_address,
- )
- logger.info(
- f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}"
- )
+ if report_type == FileDownloadButtonType.RAW_XML.value:
+ with open(file_path, "r", encoding="utf-8") as f:
+ xml_string = f.read()
+ details = parse_rdsap(xml_string)
+ row_data = flatten_sap_property(details)
+ write_row(local_dimensions_path, row_data)
+ upload_excel_to_sharepoint(
+ client=sharepoint_client,
+ file_path=local_dimensions_path,
+ sharepoint_path=sharepoint_excel_path,
+ )
+ logger.info(
+ f"Written dimensions row and uploaded Dimensions.xlsx for {address}"
+ )
+ else:
+ upload_file_to_sharepoint(
+ client=sharepoint_client,
+ file_path=file_path,
+ base_path=sharepoint_base_path,
+ subpath=sharepoint_address,
+ )
+ logger.info(
+ f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}"
+ )
# Upload to s3 and update db
upload_file_to_s3_and_update_db(
diff --git a/backend/ecmk_fetcher/reports.py b/backend/ecmk_fetcher/reports.py
index d8d11d50..d2f8ea52 100644
--- a/backend/ecmk_fetcher/reports.py
+++ b/backend/ecmk_fetcher/reports.py
@@ -14,6 +14,7 @@ class FileDownloadButtonType(Enum):
REPORT_TYPES = [
FileDownloadButtonType.ASSESSOR_HUB_SITENOTE_REPORT.value,
FileDownloadButtonType.SITENOTE_REPORT.value,
+ FileDownloadButtonType.RAW_XML.value,
]
@@ -23,6 +24,8 @@ def map_report_type_to_db_file_type(report_type: int) -> FileTypeEnum:
return FileTypeEnum.ECMK_SITE_NOTE
case FileDownloadButtonType.SITENOTE_REPORT.value:
return FileTypeEnum.ECMK_RD_SAP_SITE_NOTE
+ case FileDownloadButtonType.RAW_XML.value:
+ return FileTypeEnum.ECMK_SURVEY_XML
case _:
raise ValueError("Unknown report type")
diff --git a/backend/ecmk_fetcher/tests/test_excel_writer.py b/backend/ecmk_fetcher/tests/test_excel_writer.py
new file mode 100644
index 00000000..3f730951
--- /dev/null
+++ b/backend/ecmk_fetcher/tests/test_excel_writer.py
@@ -0,0 +1,123 @@
+import os
+import pathlib
+import pytest
+from openpyxl import load_workbook
+from openpyxl.worksheet.worksheet import Worksheet
+
+from backend.ecmk_fetcher.excel_writer import write_row
+
+
+@pytest.fixture
+def xlsx_path(tmp_path: pathlib.Path) -> str:
+ return str(tmp_path / "output.xlsx")
+
+
+def _active_sheet(file_path: str) -> Worksheet:
+ ws = load_workbook(file_path).active
+ assert isinstance(ws, Worksheet)
+ return ws
+
+
+def test_write_row_creates_file(xlsx_path: str):
+ # arrange
+ row = {
+ "address": "1 Fake Avenue, AB24 5CD",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 43.61,
+ }
+
+ # act
+ write_row(xlsx_path, row)
+
+ # assert
+ assert os.path.exists(xlsx_path)
+ ws = _active_sheet(xlsx_path)
+ assert [c.value for c in ws[1]] == list(row.keys())
+ assert [c.value for c in ws[2]] == list(row.values())
+
+
+def test_write_row_appends_to_existing(xlsx_path: str):
+ # arrange
+ row_a = {
+ "address": "1 Fake Avenue, AB24 5CD",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 43.61,
+ }
+ row_b = {
+ "address": "2 Other Street, XY1 2AB",
+ "property_type": "Flat",
+ "main_dwelling_floor_1_area_m2": 30.0,
+ }
+
+ # act
+ write_row(xlsx_path, row_a)
+ write_row(xlsx_path, row_b)
+
+ # assert
+ ws = _active_sheet(xlsx_path)
+ assert ws.max_row == 3 # 1 header + 2 data rows
+ assert [c.value for c in ws[1]] == list(row_a.keys())
+ assert [c.value for c in ws[2]] == list(row_a.values())
+ assert [c.value for c in ws[3]] == list(row_b.values())
+
+
+def test_write_row_inserts_new_columns_at_logical_positions(xlsx_path: str):
+ # arrange
+ # First row: main_dwelling floor 1 + roof
+ # Second row: also has main_dwelling floor 2 — should be inserted between floor 1 and roof,
+ # not appended to the end
+ row_a = {
+ "address": "1 Fake Avenue, AB24 5CD",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 43.61,
+ "main_dwelling_floor_1_height_m": 2.46,
+ "main_dwelling_roof_construction": 4,
+ }
+ row_b = {
+ "address": "2 Other Street, XY1 2AB",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 50.0,
+ "main_dwelling_floor_1_height_m": 2.5,
+ "main_dwelling_floor_2_area_m2": 48.0,
+ "main_dwelling_floor_2_height_m": 2.4,
+ "main_dwelling_roof_construction": 4,
+ }
+
+ # act
+ write_row(xlsx_path, row_a)
+ write_row(xlsx_path, row_b)
+
+ # assert
+ ws = _active_sheet(xlsx_path)
+
+ assert [c.value for c in ws[1]] == [
+ "address",
+ "property_type",
+ "main_dwelling_floor_1_area_m2",
+ "main_dwelling_floor_1_height_m",
+ "main_dwelling_floor_2_area_m2", # inserted before roof, not at end
+ "main_dwelling_floor_2_height_m",
+ "main_dwelling_roof_construction",
+ ]
+
+ # row_a had no floor_2 data — those cells should be empty
+ assert [c.value for c in ws[2]] == [
+ "1 Fake Avenue, AB24 5CD",
+ "House",
+ 43.61,
+ 2.46,
+ None, # main_dwelling_floor_2_area_m2
+ None, # main_dwelling_floor_2_height_m
+ 4,
+ ]
+
+ # row_b should be fully populated
+ assert [c.value for c in ws[3]] == [
+ "2 Other Street, XY1 2AB",
+ "House",
+ 50.0,
+ 2.5,
+ 48.0,
+ 2.4,
+ 4,
+ ]
diff --git a/backend/ecmk_fetcher/tests/test_xml_processor.py b/backend/ecmk_fetcher/tests/test_xml_processor.py
new file mode 100644
index 00000000..3695b09d
--- /dev/null
+++ b/backend/ecmk_fetcher/tests/test_xml_processor.py
@@ -0,0 +1,329 @@
+from backend.ecmk_fetcher.xml_processor import (
+ SapPropertyDetails,
+ flatten_sap_property,
+ parse_rdsap,
+)
+
+
+SAMPLE_XML = """
+
+
+
+ 1
+ Fake Avenue
+ Random
+ AB24 5CD
+
+
+
+
+
+
+ 0
+
+
+
+
+ 1
+ Main Dwelling
+ C
+ 7
+
+ 4
+ 2
+ 100mm
+
+ 4
+ 4
+
+
+
+ 25.31
+ 2.46
+ 43.61
+ 0
+ 0
+
+
+
+ 26.16
+ 2.44
+ 42.33
+ 1
+ 0
+
+
+
+
+
+
+ 2
+ Extension
+ C
+
+ 8
+ 7
+ AB
+
+ 3
+ 4
+
+
+
+ 6.85
+ 2.24
+ 4.46
+ 0
+ 0
+
+
+
+
+
+
+
+
+
+
+"""
+
+
+NO_ROOF_XML = """
+
+
+
+ 5
+ Somewhere
+ XY1 2AB
+
+
+
+
+
+ 0
+
+
+ Main Dwelling
+
+
+ 10.0
+ 2.5
+ 50.0
+ 0
+ 3.0
+
+
+
+
+
+
+
+"""
+
+
+def test_parse_rdsap_contract():
+ # arrange + act
+ result: SapPropertyDetails = parse_rdsap(SAMPLE_XML)
+
+ # assert
+ assert result == {
+ "reference": "1AB245CD",
+ "address": "1, Fake Avenue, Random, AB24 5CD",
+ "property_type": "House",
+ "building_parts": [
+ {
+ "identifier": "Main Dwelling",
+ "floors": [
+ {
+ "area_m2": 43.61,
+ "height_m": 2.46,
+ "heat_loss_perimeter_m": 25.31,
+ "party_wall_length_m": 0.0,
+ },
+ {
+ "area_m2": 42.33,
+ "height_m": 2.44,
+ "heat_loss_perimeter_m": 26.16,
+ "party_wall_length_m": 0.0,
+ },
+ ],
+ "roof": {
+ "construction": 4,
+ "insulation_location": 2,
+ "insulation_thickness_mm": 100.0,
+ },
+ },
+ {
+ "identifier": "Extension",
+ "floors": [
+ {
+ "area_m2": 4.46,
+ "height_m": 2.24,
+ "heat_loss_perimeter_m": 6.85,
+ "party_wall_length_m": 0.0,
+ }
+ ],
+ "roof": {
+ "construction": 8,
+ "insulation_location": 7,
+ },
+ },
+ ],
+ }
+
+
+ND_THICKNESS_XML = """
+
+
+
+ 1
+ Somewhere
+ AB1 2CD
+
+
+
+
+
+ 0
+
+
+ Main Dwelling
+ 4
+ 2
+ ND
+
+
+ 10.0
+ 2.5
+ 50.0
+ 0
+ 0
+
+
+
+
+
+
+
+"""
+
+ND_INSULATION_LOCATION_XML = """
+
+
+
+ 1
+ Somewhere
+ AB1 2CD
+
+
+
+
+
+ 0
+
+
+ Main Dwelling
+ 4
+ ND
+ 250
+
+
+ 10.0
+ 2.5
+ 50.0
+ 0
+ 0
+
+
+
+
+
+
+
+"""
+
+
+def test_parse_rdsap_nd_thickness():
+ # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Thickness
+ # — it should be retained as-is rather than raising
+
+ # arrange + act
+ result: SapPropertyDetails = parse_rdsap(ND_THICKNESS_XML)
+
+ # assert
+ assert result["building_parts"][0]["roof"] == {
+ "construction": 4,
+ "insulation_location": 2,
+ "insulation_thickness_mm": "ND",
+ }
+
+
+def test_parse_rdsap_nd_location():
+ # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Location
+ # — it should be retained as-is rather than raising
+
+ # arrange + act
+ result: SapPropertyDetails = parse_rdsap(ND_INSULATION_LOCATION_XML)
+
+ # assert
+ assert result["building_parts"][0]["roof"] == {
+ "construction": 4,
+ "insulation_location": "ND",
+ "insulation_thickness_mm": 250,
+ }
+
+
+def test_flatten_full():
+ # Two building parts; Main Dwelling has two floors + full roof,
+ # Extension has one floor + partial roof (no thickness)
+
+ # arrange
+ details: SapPropertyDetails = parse_rdsap(SAMPLE_XML)
+
+ # act
+ result = flatten_sap_property(details)
+
+ # assert
+ assert result == {
+ "reference": "1AB245CD",
+ "address": "1, Fake Avenue, Random, AB24 5CD",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 43.61,
+ "main_dwelling_floor_1_height_m": 2.46,
+ "main_dwelling_floor_1_heat_loss_perimeter_m": 25.31,
+ "main_dwelling_floor_1_party_wall_length_m": 0.0,
+ "main_dwelling_floor_2_area_m2": 42.33,
+ "main_dwelling_floor_2_height_m": 2.44,
+ "main_dwelling_floor_2_heat_loss_perimeter_m": 26.16,
+ "main_dwelling_floor_2_party_wall_length_m": 0.0,
+ "main_dwelling_roof_construction": 4,
+ "main_dwelling_roof_insulation_location": 2,
+ "main_dwelling_roof_insulation_thickness_mm": 100.0,
+ "extension_floor_1_area_m2": 4.46,
+ "extension_floor_1_height_m": 2.24,
+ "extension_floor_1_heat_loss_perimeter_m": 6.85,
+ "extension_floor_1_party_wall_length_m": 0.0,
+ "extension_roof_construction": 8,
+ "extension_roof_insulation_location": 7,
+ }
+
+
+def test_flatten_no_roof():
+ # Single building part with no roof — roof keys must be absent entirely
+
+ # arrange
+ details: SapPropertyDetails = parse_rdsap(NO_ROOF_XML)
+
+ # act
+ result = flatten_sap_property(details)
+
+ # assert
+ assert result == {
+ "reference": "5XY12AB",
+ "address": "5, Somewhere, XY1 2AB",
+ "property_type": "House",
+ "main_dwelling_floor_1_area_m2": 50.0,
+ "main_dwelling_floor_1_height_m": 2.5,
+ "main_dwelling_floor_1_heat_loss_perimeter_m": 10.0,
+ "main_dwelling_floor_1_party_wall_length_m": 3.0,
+ }
diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py
index 0a744e53..cc2c908d 100644
--- a/backend/ecmk_fetcher/upload.py
+++ b/backend/ecmk_fetcher/upload.py
@@ -28,6 +28,19 @@ def upload_file_to_sharepoint(
)
+def upload_excel_to_sharepoint(
+ client: DomnaSharepointClient,
+ file_path: str,
+ sharepoint_path: str,
+) -> None:
+ client.upload_file(
+ file_path=file_path,
+ sharepoint_path=sharepoint_path,
+ file_name=os.path.basename(file_path),
+ )
+
+
+# TODO: this should be moved to somewhere common and called by pashub fetcher
def upload_file_to_s3_and_update_db(
bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum
) -> None:
diff --git a/backend/ecmk_fetcher/xml_processor.py b/backend/ecmk_fetcher/xml_processor.py
new file mode 100644
index 00000000..f993038b
--- /dev/null
+++ b/backend/ecmk_fetcher/xml_processor.py
@@ -0,0 +1,226 @@
+import xml.etree.ElementTree as ET
+from typing import Any, List, Optional, TypedDict
+
+
+from backend.ecmk_fetcher.reports import build_property_id
+from datatypes.epc.domain.field_mappings import PROPERTY_TYPE_LOOKUP
+
+
+# This file should ultimately live somewhere different, probably
+class Floor(TypedDict):
+ area_m2: float
+ height_m: float
+ heat_loss_perimeter_m: float
+ party_wall_length_m: float
+
+
+class Roof(TypedDict, total=False):
+ construction: int # TODO: map to str
+ insulation_location: int | str # TODO: map to str
+ insulation_thickness_mm: float | str
+
+
+class BuildingPart(TypedDict):
+ identifier: str # e.g. "Main Dwelling", "Extension"
+ floors: List[Floor]
+ roof: Optional[Roof]
+
+
+class SapPropertyDetails(TypedDict):
+ reference: str
+ address: str
+ property_type: str
+ building_parts: List[BuildingPart]
+
+
+def _get_namespace(tag: str) -> str:
+ return tag.split("}")[0].strip("{")
+
+
+def _require_text(value: Optional[str], field: str) -> str:
+ if value is None:
+ raise ValueError(f"Missing required field: {field}")
+ return value
+
+
+def _parse_float(value: Optional[str], field: str) -> float:
+ if value is None:
+ raise ValueError(f"Missing float field: {field}")
+ return float(value)
+
+
+def _parse_int(value: Optional[str], field: str) -> int:
+ if value is None:
+ raise ValueError(f"Missing int field: {field}")
+ return int(value)
+
+
+def _parse_thickness_mm(value: Optional[str]) -> Optional[float | str]:
+ if value is None:
+ return None
+ stripped = value.replace("mm", "").strip()
+ try:
+ return float(stripped)
+ except ValueError:
+ return stripped
+
+
+def parse_rdsap(xml_string: str) -> SapPropertyDetails:
+ root = ET.fromstring(xml_string)
+
+ ns_uri: str = _get_namespace(root.tag)
+ ns: dict[str, str] = {"r": ns_uri}
+
+ # --- Address ---
+ addr_elem = root.find(".//r:Address", ns)
+ if addr_elem is None:
+ raise ValueError("Address element not found")
+
+ address_line_1: str = addr_elem.findtext(
+ "r:Address-Line-1", default="", namespaces=ns
+ )
+ postcode: str = addr_elem.findtext("r:Postcode", default="", namespaces=ns)
+
+ address_parts: List[str] = [
+ address_line_1,
+ addr_elem.findtext("r:Address-Line-2", default="", namespaces=ns),
+ addr_elem.findtext("r:Post-Town", default="", namespaces=ns),
+ postcode,
+ ]
+
+ address: str = ", ".join(part for part in address_parts if part)
+ reference: str = build_property_id(address_line_1, postcode)
+
+ # --- Property Type ---
+ prop_type_text = root.findtext(".//r:Property-Type", namespaces=ns)
+ prop_type_code: int = _parse_int(prop_type_text, "Property-Type")
+ property_type: str = PROPERTY_TYPE_LOOKUP[prop_type_code]
+
+ # --- Building Parts ---
+ building_parts: List[BuildingPart] = []
+
+ for bp in root.findall(".//r:SAP-Building-Part", ns):
+
+ identifier_text = bp.findtext("r:Identifier", namespaces=ns)
+ identifier: str = _require_text(identifier_text, "Identifier")
+
+ # Floors
+ floors: List[Floor] = []
+
+ for f in bp.findall(".//r:SAP-Floor-Dimension", ns):
+
+ area = _parse_float(
+ f.findtext("r:Total-Floor-Area", namespaces=ns),
+ "Total-Floor-Area",
+ )
+
+ height = _parse_float(
+ f.findtext("r:Room-Height", namespaces=ns),
+ "Room-Height",
+ )
+
+ heat_loss = _parse_float(
+ f.findtext("r:Heat-Loss-Perimeter", namespaces=ns),
+ "Heat-Loss-Perimeter",
+ )
+
+ party_wall = _parse_float(
+ f.findtext("r:Party-Wall-Length", namespaces=ns),
+ "Party-Wall-Length",
+ )
+
+ floor: Floor = {
+ "area_m2": area,
+ "height_m": height,
+ "heat_loss_perimeter_m": heat_loss,
+ "party_wall_length_m": party_wall,
+ }
+
+ floors.append(floor)
+
+ # Roof (optional)
+ roof: Optional[Roof] = None
+
+ roof_construction_text = bp.findtext("r:Roof-Construction", namespaces=ns)
+ roof_ins_loc_text = bp.findtext("r:Roof-Insulation-Location", namespaces=ns)
+ roof_thickness_text = bp.findtext("r:Roof-Insulation-Thickness", namespaces=ns)
+
+ if (
+ roof_construction_text is not None
+ or roof_ins_loc_text is not None
+ or roof_thickness_text is not None
+ ):
+ roof_dict: Roof = {}
+
+ if roof_construction_text is not None:
+ roof_dict["construction"] = _parse_int(
+ roof_construction_text, "Roof-Construction"
+ )
+
+ if roof_ins_loc_text is not None:
+ try:
+ roof_dict["insulation_location"] = _parse_int(
+ roof_ins_loc_text, "Roof-Insulation-Location"
+ )
+ except ValueError:
+ roof_dict["insulation_location"] = roof_ins_loc_text
+
+ thickness = _parse_thickness_mm(roof_thickness_text)
+ if thickness is not None:
+ roof_dict["insulation_thickness_mm"] = thickness
+
+ roof = roof_dict
+
+ building_part: BuildingPart = {
+ "identifier": identifier,
+ "floors": floors,
+ "roof": roof,
+ }
+
+ building_parts.append(building_part)
+
+ result: SapPropertyDetails = {
+ "reference": reference,
+ "address": address,
+ "property_type": property_type,
+ "building_parts": building_parts,
+ }
+
+ return result
+
+
+def _normalise_identifier(identifier: str) -> str:
+ return identifier.lower().replace(" ", "_").replace("-", "_")
+
+
+def flatten_sap_property(details: SapPropertyDetails) -> dict[str, Any]:
+ row: dict[str, Any] = {}
+
+ row["reference"] = details["reference"]
+ row["address"] = details["address"]
+ row["property_type"] = details["property_type"]
+
+ for bp in details["building_parts"]:
+ prefix = _normalise_identifier(bp["identifier"])
+
+ for i, floor in enumerate(bp["floors"], start=1):
+ floor_prefix = f"{prefix}_floor_{i}"
+ row[f"{floor_prefix}_area_m2"] = floor["area_m2"]
+ row[f"{floor_prefix}_height_m"] = floor["height_m"]
+ row[f"{floor_prefix}_heat_loss_perimeter_m"] = floor[
+ "heat_loss_perimeter_m"
+ ]
+ row[f"{floor_prefix}_party_wall_length_m"] = floor["party_wall_length_m"]
+
+ roof = bp.get("roof")
+ if roof:
+ if "construction" in roof:
+ row[f"{prefix}_roof_construction"] = roof["construction"]
+ if "insulation_location" in roof:
+ row[f"{prefix}_roof_insulation_location"] = roof["insulation_location"]
+ if "insulation_thickness_mm" in roof:
+ row[f"{prefix}_roof_insulation_thickness_mm"] = roof[
+ "insulation_thickness_mm"
+ ]
+
+ return row
diff --git a/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx b/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx
index a6478e3b..beb679c1 100644
Binary files a/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx and b/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx differ
diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile
index d045becd..e450d340 100644
--- a/backend/pashub_fetcher/handler/Dockerfile
+++ b/backend/pashub_fetcher/handler/Dockerfile
@@ -22,5 +22,5 @@ ENTRYPOINT ["python", "-m", "awslambdaric"]
# -----------------------------
# Lambda handler
# -----------------------------
-CMD ["backend.pashub_fetcher.handler.test_handler.handler"]
+CMD ["backend.pashub_fetcher.handler.handler"]
# CMD ["backend.pashub_fetcher.handler.handler.handler"]
\ No newline at end of file
diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py
index 3689efe9..60b946c1 100644
--- a/backend/pashub_fetcher/handler/handler.py
+++ b/backend/pashub_fetcher/handler/handler.py
@@ -1,8 +1,7 @@
from datetime import datetime, timezone
-import json
import os
import re
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Dict, List, Optional
from openpyxl import load_workbook
from backend.app.config import get_settings
@@ -104,10 +103,19 @@ def upload_job_to_sharepoint(
)
-def upload_job_to_s3_and_update_db(job_files: List[str], uprn: str) -> None:
+def upload_job_to_s3_and_update_db(
+ job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str]
+) -> None:
bucket = "retrofit-energy-assessments-dev"
- base_path = f"documents/uprn/{uprn}"
+ if not uprn and not hubspot_deal_id:
+ return
+
+ base_path = (
+ f"documents/uprn/{uprn}"
+ if uprn
+ else f"documents/hubspot_deal_id/{hubspot_deal_id}"
+ )
uploaded_files: List[UploadedFile] = []
@@ -118,12 +126,14 @@ def upload_job_to_s3_and_update_db(job_files: List[str], uprn: str) -> None:
upload_file_to_s3(file_path, bucket, file_key)
# load row to db
+ # TODO: use same upload_file_to_s3_and_update_db method as ecmk fetcher does
uploaded_files.append(
UploadedFile(
s3_file_bucket=bucket,
s3_file_key=file_key,
s3_upload_timestamp=datetime.now(timezone.utc),
- uprn=int(uprn),
+ uprn=int(uprn) if uprn else None,
+ hubspot_deal_id=hubspot_deal_id,
file_source=FileSourceEnum.PAS_HUB.value,
file_type=infer_file_type(filename),
)
@@ -144,6 +154,7 @@ def process_job(
job_id = job.pashub_job_id
uprn: Optional[str] = job.uprn or pashub_client.get_uprn_by_job_id(job_id)
+ hubspot_deal_id: Optional[str] = job.hubspot_deal_id
if uprn:
logger.info(f"Got UPRN {uprn} for job {job_id}")
@@ -152,9 +163,9 @@ def process_job(
job_files: List[str] = pashub_client.get_core_evidence_files_by_job_id(job_id)
- if uprn:
+ if uprn or hubspot_deal_id:
logger.info("Uploading files to s3")
- upload_job_to_s3_and_update_db(job_files, uprn)
+ upload_job_to_s3_and_update_db(job_files, uprn, hubspot_deal_id)
# # Comment out sharepoint loading for now:
# Seems like the sharepoint link in pas hub is inconsistent in terms
@@ -167,9 +178,8 @@ def process_job(
@task_handler()
-def handler(event: Mapping[str, Any], context: Any) -> None:
+def handler(body: Dict[str, Any], context: Any) -> List[str]:
logger.info("Received message")
- logger.info(f"Number of events: {len(event.get('Records', []))}")
settings = get_settings()
@@ -185,48 +195,34 @@ def handler(event: Mapping[str, Any], context: Any) -> None:
sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3
)
- saved_file_paths: List[str] = []
+ logger.debug("Validating request body")
+ payload = PashubToAraTriggerRequest.model_validate(body)
+ logger.debug("Successfully validated request body")
- for record in event.get("Records", []):
- try:
- body_dict = json.loads(record["body"])
- logger.debug("Validating request body")
+ try:
+ files: List[str] = process_job(
+ payload,
+ pashub_client,
+ sharepoint_client,
+ )
+ except UnauthorizedError:
+ logger.warning("Token expired - refreshing")
- payload = PashubToAraTriggerRequest.model_validate(body_dict)
+ pashub_client = get_pashub_client(
+ pas_hub_email,
+ pas_hub_password,
+ )
- logger.debug("Successfully validated request body")
+ # retry once
+ files = process_job(
+ payload,
+ pashub_client,
+ sharepoint_client,
+ )
- try:
- files: List[str] = process_job(
- payload,
- pashub_client,
- sharepoint_client,
- )
- saved_file_paths.extend(files)
+ logger.info(f"Saved {len(files)} files")
- except UnauthorizedError:
- logger.warning("Token expired - refreshing")
-
- pashub_client = get_pashub_client(
- pas_hub_email,
- pas_hub_password,
- )
-
- # retry once
- files: List[str] = process_job(
- payload,
- pashub_client,
- sharepoint_client,
- )
- saved_file_paths.extend(files)
-
- except Exception as e:
- logger.info("Handler exception")
- logger.error(f"Failed to process record: {e}")
-
- logger.info("Successfully loaded jobs from spreadsheet")
-
- logger.info(f"Saved {len(saved_file_paths)} files")
+ return files
if __name__ == "__main__":
diff --git a/backend/pashub_fetcher/local_handler/invoke_local_lambda.py b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py
index 463ef9d8..219446fd 100644
--- a/backend/pashub_fetcher/local_handler/invoke_local_lambda.py
+++ b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py
@@ -12,7 +12,8 @@ payload = {
{
"body": json.dumps(
{
- "uprn": 123456,
+ "pashub_link": "https://pashub.net/jobs/00000000-0000-0000-0000-000000000000/details",
+ "uprn": "123456",
}
)
}
diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
index 2e4f8380..518a8dc3 100644
--- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
+++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py
@@ -12,6 +12,8 @@ class PashubToAraTriggerRequest(BaseModel):
uprn: Optional[str] = None
landlord_property_id: Optional[str] = None
deal_stage: Optional[str] = None
+ hubspot_listing_id: Optional[int] = None
+ hubspot_deal_id: Optional[str] = None
@property
def pashub_job_id(self) -> str:
diff --git a/backend/pashub_fetcher/trigger_lambda_from_file.py b/backend/pashub_fetcher/trigger_lambda_from_file.py
new file mode 100644
index 00000000..fb9d1cbf
--- /dev/null
+++ b/backend/pashub_fetcher/trigger_lambda_from_file.py
@@ -0,0 +1,63 @@
+import json
+import os
+import re
+from typing import Any, Dict, List
+
+from openpyxl import load_workbook
+
+from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
+ PashubToAraTriggerRequest,
+)
+from backend.pashub_fetcher.handler.handler import handler
+
+
+if __name__ == "__main__":
+ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
+ filepath: str = os.path.join(
+ BASE_DIR,
+ "pashub_fetcher",
+ "The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx",
+ )
+
+ wb = load_workbook(filepath, data_only=True)
+ ws = wb["filtered_2"]
+
+ HEADER_ROW = 3
+
+ headers: Dict[str, int] = {}
+ for col in range(1, ws.max_column + 1):
+ value = str(ws.cell(row=HEADER_ROW, column=col).value)
+ if value:
+ headers[value.strip()] = col
+
+ name_col = headers["Name"]
+ link_col = headers["PasHub Link"]
+ hubspot_deal_id_col = headers["HubSpot ID"]
+
+ trigger_requests: List[PashubToAraTriggerRequest] = []
+
+ for row in range(HEADER_ROW + 1, ws.max_row + 1):
+ name = ws.cell(row=row, column=name_col).value
+ link = ws.cell(row=row, column=link_col).value
+ hubspot_deal_id = ws.cell(row=row, column=hubspot_deal_id_col).value
+
+ if not name or not link or not hubspot_deal_id:
+ continue
+
+ match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link))
+ if not match:
+ continue
+
+ trigger_requests.append(
+ PashubToAraTriggerRequest(
+ pashub_link=str(link), hubspot_deal_id=str(hubspot_deal_id)
+ )
+ )
+
+ # ---- Build fake SQS event ----
+ event: Dict[str, Any] = {
+ "Records": [{"body": json.dumps(req.model_dump())} for req in trigger_requests]
+ }
+
+ context = None
+ handler(event, context)
diff --git a/datatypes/epc/domain/field_mappings.py b/datatypes/epc/domain/field_mappings.py
new file mode 100644
index 00000000..cc0f9067
--- /dev/null
+++ b/datatypes/epc/domain/field_mappings.py
@@ -0,0 +1,3 @@
+PROPERTY_TYPE_LOOKUP = {0: "House", 1: "Bungalow", 2: "Flat", 3: "Maisonette"}
+ROOF_CONSTRUCTION_LOOKUP = {}
+ROOF_INSULATION_LOCATION_LOOKUP = {}
diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py
index b95b544c..74c8264d 100644
--- a/etl/hubspot/hubspot_deal_differ.py
+++ b/etl/hubspot/hubspot_deal_differ.py
@@ -6,9 +6,9 @@ from etl.hubspot.utils import parse_hs_date
class HubspotDealDiffer:
COORDINATION_COMPLETE: List[str] = [
- "v1 ioe/mtp complete",
- "v2 ioe/mtp complete",
- "v3 ioe/mtp complete",
+ "(v1) ioe/mtp complete",
+ "(v2) ioe/mtp complete",
+ "(v3) ioe/mtp complete",
]
RETROFIT_DESIGN_COMPLETE = "uploaded"
LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"]
@@ -149,19 +149,19 @@ class HubspotDealDiffer:
def _coordination_completed(
new_deal: Dict[str, str], old_deal: HubspotDealData
) -> bool:
- new_status: str = new_deal.get("coordination_status", "")
+ new_status: str = new_deal.get("coordination_status") or ""
return (
new_status != ""
- and new_status in HubspotDealDiffer.COORDINATION_COMPLETE
+ and new_status.lower() in HubspotDealDiffer.COORDINATION_COMPLETE
and new_status != old_deal.coordination_status
)
@staticmethod
def _design_completed(new_deal: Dict[str, str], old_deal: HubspotDealData) -> bool:
- new_status: str = new_deal.get("design_status", "")
+ new_status: str = new_deal.get("coordination_status") or ""
return (
new_status != ""
- and new_status == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE
+ and new_status.lower() == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE
and new_status != old_deal.design_status
)
@@ -169,9 +169,9 @@ class HubspotDealDiffer:
def _lodgement_completed(
new_deal: Dict[str, str], old_deal: HubspotDealData
) -> bool:
- new_status: str = new_deal.get("lodgement_status", "")
+ new_status: str = new_deal.get("coordination_status") or ""
return (
new_status != ""
- and new_status in HubspotDealDiffer.LODGEMENT_COMPLETE
+ and new_status.lower() in HubspotDealDiffer.LODGEMENT_COMPLETE
and new_status != old_deal.lodgement_status
)
diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py
index e6c788ea..f7dc1076 100644
--- a/etl/hubspot/scripts/scraper/main.py
+++ b/etl/hubspot/scripts/scraper/main.py
@@ -104,10 +104,10 @@ def _trigger_pashub_fetcher(sqs_client: Any, hubspot_deal: Dict[str, str]) -> No
message_body: Dict[str, Optional[str]] = {
"pashub_link": hubspot_deal["pashub_link"],
"address": None, # potentially available from Listing, leave as None for now
- "sharepoint_link": hubspot_deal["sharepoint_link"],
- "uprn": hubspot_deal["national_uprn"],
- "landlord_property_id": hubspot_deal["owner_property_id"],
- "deal_stage": hubspot_deal["deal_stage"],
+ "sharepoint_link": hubspot_deal.get("sharepoint_link", None),
+ "uprn": hubspot_deal.get("national_uprn", None),
+ "landlord_property_id": hubspot_deal.get("owner_property_id", None),
+ "deal_stage": hubspot_deal.get("deal_stage", None),
}
response = sqs_client.send_message(
@@ -121,5 +121,5 @@ def _trigger_pashub_fetcher(sqs_client: Any, hubspot_deal: Dict[str, str]) -> No
if __name__ == "__main__":
- handler({"hubspot_deal_id": "371470706915"}, "")
+ handler({"hubspot_deal_id": "498926855369"}, "")
print("beep")
diff --git a/pytest.ini b/pytest.ini
index 6cb3b611..33231c61 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -3,6 +3,6 @@ pythonpath = .
log_cli = true
log_cli_level = INFO
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
-testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/documents_parser/tests
+testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/documents_parser/tests
markers =
integration: mark a test as an integration test
diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py
index 67e079ed..5e0255ac 100644
--- a/utils/sharepoint/domna_sharepoint_client.py
+++ b/utils/sharepoint/domna_sharepoint_client.py
@@ -90,6 +90,41 @@ class DomnaSharepointClient:
file_name, get_file_stream(file_path), sharepoint_path
)
+ def download_file(self, sharepoint_path: str, local_path: str) -> bool:
+ """
+ Download a file from SharePoint to a local path.
+
+ Returns True if the file was downloaded, False if it does not exist yet.
+ Raises on any other error.
+ """
+ sharepoint_client = SharePointClient(
+ tenant_id=self.sharepoint_tenant_id,
+ client_id=self.sharepoint_client_id,
+ client_secret=self.sharepoint_client_secret,
+ site_id=self.sharepoint_drive.value,
+ )
+
+ try:
+ metadata: Dict[str, Any] = sharepoint_client.get_file_metadata(sharepoint_path)
+ except ValueError:
+ return False
+
+ download_url: Optional[str] = metadata.get("@microsoft.graph.downloadUrl")
+ if not download_url:
+ return False
+
+ content: BytesIO = SharePointClient.download_sharepoint_file(download_url)
+
+ parent_dir = os.path.dirname(local_path)
+ if parent_dir:
+ os.makedirs(parent_dir, exist_ok=True)
+
+ with open(local_path, "wb") as f:
+ f.write(content.getvalue())
+
+ self.logger.debug(f"Downloaded SharePoint file to: {local_path}")
+ return True
+
def create_temp_file(self, content: BytesIO, path: str):
# Ensure the path is under /tmp/
new_path = os.path.join("/tmp/sharepoint", path)
diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py
index 71f82b68..5807c3bd 100644
--- a/utils/sharepoint/sharepoint_client.py
+++ b/utils/sharepoint/sharepoint_client.py
@@ -278,6 +278,17 @@ class SharePointClient:
# logger.debug(f"Listing folder contents from URL: {url}")
return "GET", url, None
+ @api_call_decorator
+ def get_file_metadata(self, file_path: str) -> Dict[str, Any]:
+ """
+ GET /drives/{drive-id}/root:/{file_path}
+
+ Returns file metadata, including '@microsoft.graph.downloadUrl'.
+ Raises ValueError if the file does not exist (404).
+ """
+ url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{file_path}"
+ return "GET", url, None
+
@api_call_decorator
def create_folder(self, file_name: str, folder_path: str) -> Dict[str, Any]:
"""
@@ -325,7 +336,7 @@ class SharePointClient:
return self.upload_file(file_name, sharepoint_parent_id, file_stream)
@staticmethod
- def download_sharepoint_file(download_url):
+ def download_sharepoint_file(download_url: str) -> BytesIO:
"""
Downloads a file from the given URL and returns its content.