diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index 6a1cc120..59aa0cb6 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -64,4 +64,13 @@ RUN apt install -y wget gnupg2 lsb-release RUN echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list RUN wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - RUN apt update -RUN apt install -y postgresql-14 \ No newline at end of file +RUN apt install -y postgresql-14 + +# Install Claude +USER ${USER} +RUN curl -fsSL https://claude.ai/install.sh | bash \ + && export PATH="/home/${USER}/.local/bin:${PATH}" \ + && claude plugin marketplace add JuliusBrussee/caveman \ + && claude plugin install caveman@caveman +ENV PATH="/home/vscode/.local/bin:${PATH}" +USER root \ No newline at end of file diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index 71763790..a516a1df 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -16,6 +16,7 @@ class FileTypeEnum(enum.Enum): PAS_2023_OCCUPANCY = "pas_2023_occupancy" ECMK_SITE_NOTE = "ecmk_site_note" ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note" + ECMK_SURVEY_XML = "ecmk_survey_xml" class FileSourceEnum(enum.Enum): @@ -37,6 +38,7 @@ class UploadedFile(Base): landlord_property_id = Column(Text, nullable=True) uprn = Column(BigInteger, nullable=True) hubspot_listing_id = Column(BigInteger, nullable=True) + hubspot_deal_id = Column(Text, nullable=True) file_type = Column( SqlEnum( diff --git a/backend/ecmk_fetcher/excel_writer.py b/backend/ecmk_fetcher/excel_writer.py new file mode 100644 index 00000000..f290614b --- /dev/null +++ b/backend/ecmk_fetcher/excel_writer.py @@ -0,0 +1,53 @@ +import os +from typing import Any + +from openpyxl import Workbook, load_workbook +from openpyxl.worksheet.worksheet import Worksheet + + +def write_row(file_path: str, row_data: dict[str, Any]) -> None: + new_keys = list(row_data.keys()) + + if not os.path.exists(file_path): + wb = Workbook() + ws: Worksheet = wb.active # type: ignore[assignment] + ws.append(new_keys) + ws.append(list(row_data.values())) + wb.save(file_path) + return + + wb = load_workbook(file_path) + ws = wb.active # type: ignore[assignment] + + # Build a mutable header list and insert new columns using insert_cols so + # that existing row data shifts along with the headers. + # Filter out None to guard against blank columns in the source file. + headers: list[str] = [cell.value for cell in ws[1] if cell.value is not None] # type: ignore[misc] + + for key in new_keys: + if key in headers: + continue + + # Find the first key that comes after this one in new_keys that already + # exists in headers — insert before it to keep columns logically grouped. + insert_before: str | None = None + found = False + for k in new_keys: + if k == key: + found = True + continue + if found and k in headers: + insert_before = k + break + + if insert_before is not None: + col_idx = headers.index(insert_before) + 1 # 1-based + ws.insert_cols(col_idx) + ws.cell(row=1, column=col_idx, value=key) + headers.insert(col_idx - 1, key) + else: + headers.append(key) + ws.cell(row=1, column=len(headers), value=key) + + ws.append([row_data.get(col) for col in headers]) + wb.save(file_path) diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index 2f122080..4f8c24ea 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -26,13 +26,17 @@ from backend.ecmk_fetcher.browser import ( ) from backend.ecmk_fetcher.reports import ( REPORT_TYPES, + FileDownloadButtonType, build_property_id, map_report_type_to_db_file_type, ) +from backend.ecmk_fetcher.excel_writer import write_row from backend.ecmk_fetcher.upload import ( + upload_excel_to_sharepoint, upload_file_to_s3_and_update_db, upload_file_to_sharepoint, ) +from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap from utils.logger import setup_logger from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites @@ -59,6 +63,15 @@ def run_job() -> None: ) sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" + sharepoint_excel_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Modelling" + + DIMENSIONS_FILENAME: str = "Dimensions.xlsx" + local_dimensions_path: str = os.path.join(BASE_DIR, DIMENSIONS_FILENAME) + + sharepoint_client.download_file( + sharepoint_path=f"{sharepoint_excel_path}/{DIMENSIONS_FILENAME}", + local_path=local_dimensions_path, + ) s3_bucket: str = "retrofit-energy-assessments-dev" @@ -141,15 +154,30 @@ def run_job() -> None: ) try: - upload_file_to_sharepoint( - client=sharepoint_client, - file_path=file_path, - base_path=sharepoint_base_path, - subpath=sharepoint_address, - ) - logger.info( - f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" - ) + if report_type == FileDownloadButtonType.RAW_XML.value: + with open(file_path, "r", encoding="utf-8") as f: + xml_string = f.read() + details = parse_rdsap(xml_string) + row_data = flatten_sap_property(details) + write_row(local_dimensions_path, row_data) + upload_excel_to_sharepoint( + client=sharepoint_client, + file_path=local_dimensions_path, + sharepoint_path=sharepoint_excel_path, + ) + logger.info( + f"Written dimensions row and uploaded Dimensions.xlsx for {address}" + ) + else: + upload_file_to_sharepoint( + client=sharepoint_client, + file_path=file_path, + base_path=sharepoint_base_path, + subpath=sharepoint_address, + ) + logger.info( + f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" + ) # Upload to s3 and update db upload_file_to_s3_and_update_db( diff --git a/backend/ecmk_fetcher/reports.py b/backend/ecmk_fetcher/reports.py index d8d11d50..d2f8ea52 100644 --- a/backend/ecmk_fetcher/reports.py +++ b/backend/ecmk_fetcher/reports.py @@ -14,6 +14,7 @@ class FileDownloadButtonType(Enum): REPORT_TYPES = [ FileDownloadButtonType.ASSESSOR_HUB_SITENOTE_REPORT.value, FileDownloadButtonType.SITENOTE_REPORT.value, + FileDownloadButtonType.RAW_XML.value, ] @@ -23,6 +24,8 @@ def map_report_type_to_db_file_type(report_type: int) -> FileTypeEnum: return FileTypeEnum.ECMK_SITE_NOTE case FileDownloadButtonType.SITENOTE_REPORT.value: return FileTypeEnum.ECMK_RD_SAP_SITE_NOTE + case FileDownloadButtonType.RAW_XML.value: + return FileTypeEnum.ECMK_SURVEY_XML case _: raise ValueError("Unknown report type") diff --git a/backend/ecmk_fetcher/tests/test_excel_writer.py b/backend/ecmk_fetcher/tests/test_excel_writer.py new file mode 100644 index 00000000..3f730951 --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_excel_writer.py @@ -0,0 +1,123 @@ +import os +import pathlib +import pytest +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet + +from backend.ecmk_fetcher.excel_writer import write_row + + +@pytest.fixture +def xlsx_path(tmp_path: pathlib.Path) -> str: + return str(tmp_path / "output.xlsx") + + +def _active_sheet(file_path: str) -> Worksheet: + ws = load_workbook(file_path).active + assert isinstance(ws, Worksheet) + return ws + + +def test_write_row_creates_file(xlsx_path: str): + # arrange + row = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + } + + # act + write_row(xlsx_path, row) + + # assert + assert os.path.exists(xlsx_path) + ws = _active_sheet(xlsx_path) + assert [c.value for c in ws[1]] == list(row.keys()) + assert [c.value for c in ws[2]] == list(row.values()) + + +def test_write_row_appends_to_existing(xlsx_path: str): + # arrange + row_a = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + } + row_b = { + "address": "2 Other Street, XY1 2AB", + "property_type": "Flat", + "main_dwelling_floor_1_area_m2": 30.0, + } + + # act + write_row(xlsx_path, row_a) + write_row(xlsx_path, row_b) + + # assert + ws = _active_sheet(xlsx_path) + assert ws.max_row == 3 # 1 header + 2 data rows + assert [c.value for c in ws[1]] == list(row_a.keys()) + assert [c.value for c in ws[2]] == list(row_a.values()) + assert [c.value for c in ws[3]] == list(row_b.values()) + + +def test_write_row_inserts_new_columns_at_logical_positions(xlsx_path: str): + # arrange + # First row: main_dwelling floor 1 + roof + # Second row: also has main_dwelling floor 2 — should be inserted between floor 1 and roof, + # not appended to the end + row_a = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + "main_dwelling_floor_1_height_m": 2.46, + "main_dwelling_roof_construction": 4, + } + row_b = { + "address": "2 Other Street, XY1 2AB", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 50.0, + "main_dwelling_floor_1_height_m": 2.5, + "main_dwelling_floor_2_area_m2": 48.0, + "main_dwelling_floor_2_height_m": 2.4, + "main_dwelling_roof_construction": 4, + } + + # act + write_row(xlsx_path, row_a) + write_row(xlsx_path, row_b) + + # assert + ws = _active_sheet(xlsx_path) + + assert [c.value for c in ws[1]] == [ + "address", + "property_type", + "main_dwelling_floor_1_area_m2", + "main_dwelling_floor_1_height_m", + "main_dwelling_floor_2_area_m2", # inserted before roof, not at end + "main_dwelling_floor_2_height_m", + "main_dwelling_roof_construction", + ] + + # row_a had no floor_2 data — those cells should be empty + assert [c.value for c in ws[2]] == [ + "1 Fake Avenue, AB24 5CD", + "House", + 43.61, + 2.46, + None, # main_dwelling_floor_2_area_m2 + None, # main_dwelling_floor_2_height_m + 4, + ] + + # row_b should be fully populated + assert [c.value for c in ws[3]] == [ + "2 Other Street, XY1 2AB", + "House", + 50.0, + 2.5, + 48.0, + 2.4, + 4, + ] diff --git a/backend/ecmk_fetcher/tests/test_xml_processor.py b/backend/ecmk_fetcher/tests/test_xml_processor.py new file mode 100644 index 00000000..3695b09d --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_xml_processor.py @@ -0,0 +1,329 @@ +from backend.ecmk_fetcher.xml_processor import ( + SapPropertyDetails, + flatten_sap_property, + parse_rdsap, +) + + +SAMPLE_XML = """ + + +
+ 1 + Fake Avenue + Random + AB24 5CD +
+
+
+ + + + 0 + + + + + 1 + Main Dwelling + C + 7 + + 4 + 2 + 100mm + + 4 + 4 + + + + 25.31 + 2.46 + 43.61 + 0 + 0 + + + + 26.16 + 2.44 + 42.33 + 1 + 0 + + + + + + + 2 + Extension + C + + 8 + 7 + AB + + 3 + 4 + + + + 6.85 + 2.24 + 4.46 + 0 + 0 + + + + + + + + + +
+""" + + +NO_ROOF_XML = """ + + +
+ 5 + Somewhere + XY1 2AB +
+
+
+ + + 0 + + + Main Dwelling + + + 10.0 + 2.5 + 50.0 + 0 + 3.0 + + + + + + +
+""" + + +def test_parse_rdsap_contract(): + # arrange + act + result: SapPropertyDetails = parse_rdsap(SAMPLE_XML) + + # assert + assert result == { + "reference": "1AB245CD", + "address": "1, Fake Avenue, Random, AB24 5CD", + "property_type": "House", + "building_parts": [ + { + "identifier": "Main Dwelling", + "floors": [ + { + "area_m2": 43.61, + "height_m": 2.46, + "heat_loss_perimeter_m": 25.31, + "party_wall_length_m": 0.0, + }, + { + "area_m2": 42.33, + "height_m": 2.44, + "heat_loss_perimeter_m": 26.16, + "party_wall_length_m": 0.0, + }, + ], + "roof": { + "construction": 4, + "insulation_location": 2, + "insulation_thickness_mm": 100.0, + }, + }, + { + "identifier": "Extension", + "floors": [ + { + "area_m2": 4.46, + "height_m": 2.24, + "heat_loss_perimeter_m": 6.85, + "party_wall_length_m": 0.0, + } + ], + "roof": { + "construction": 8, + "insulation_location": 7, + }, + }, + ], + } + + +ND_THICKNESS_XML = """ + + +
+ 1 + Somewhere + AB1 2CD +
+
+
+ + + 0 + + + Main Dwelling + 4 + 2 + ND + + + 10.0 + 2.5 + 50.0 + 0 + 0 + + + + + + +
+""" + +ND_INSULATION_LOCATION_XML = """ + + +
+ 1 + Somewhere + AB1 2CD +
+
+
+ + + 0 + + + Main Dwelling + 4 + ND + 250 + + + 10.0 + 2.5 + 50.0 + 0 + 0 + + + + + + +
+""" + + +def test_parse_rdsap_nd_thickness(): + # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Thickness + # — it should be retained as-is rather than raising + + # arrange + act + result: SapPropertyDetails = parse_rdsap(ND_THICKNESS_XML) + + # assert + assert result["building_parts"][0]["roof"] == { + "construction": 4, + "insulation_location": 2, + "insulation_thickness_mm": "ND", + } + + +def test_parse_rdsap_nd_location(): + # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Location + # — it should be retained as-is rather than raising + + # arrange + act + result: SapPropertyDetails = parse_rdsap(ND_INSULATION_LOCATION_XML) + + # assert + assert result["building_parts"][0]["roof"] == { + "construction": 4, + "insulation_location": "ND", + "insulation_thickness_mm": 250, + } + + +def test_flatten_full(): + # Two building parts; Main Dwelling has two floors + full roof, + # Extension has one floor + partial roof (no thickness) + + # arrange + details: SapPropertyDetails = parse_rdsap(SAMPLE_XML) + + # act + result = flatten_sap_property(details) + + # assert + assert result == { + "reference": "1AB245CD", + "address": "1, Fake Avenue, Random, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + "main_dwelling_floor_1_height_m": 2.46, + "main_dwelling_floor_1_heat_loss_perimeter_m": 25.31, + "main_dwelling_floor_1_party_wall_length_m": 0.0, + "main_dwelling_floor_2_area_m2": 42.33, + "main_dwelling_floor_2_height_m": 2.44, + "main_dwelling_floor_2_heat_loss_perimeter_m": 26.16, + "main_dwelling_floor_2_party_wall_length_m": 0.0, + "main_dwelling_roof_construction": 4, + "main_dwelling_roof_insulation_location": 2, + "main_dwelling_roof_insulation_thickness_mm": 100.0, + "extension_floor_1_area_m2": 4.46, + "extension_floor_1_height_m": 2.24, + "extension_floor_1_heat_loss_perimeter_m": 6.85, + "extension_floor_1_party_wall_length_m": 0.0, + "extension_roof_construction": 8, + "extension_roof_insulation_location": 7, + } + + +def test_flatten_no_roof(): + # Single building part with no roof — roof keys must be absent entirely + + # arrange + details: SapPropertyDetails = parse_rdsap(NO_ROOF_XML) + + # act + result = flatten_sap_property(details) + + # assert + assert result == { + "reference": "5XY12AB", + "address": "5, Somewhere, XY1 2AB", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 50.0, + "main_dwelling_floor_1_height_m": 2.5, + "main_dwelling_floor_1_heat_loss_perimeter_m": 10.0, + "main_dwelling_floor_1_party_wall_length_m": 3.0, + } diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py index 0a744e53..cc2c908d 100644 --- a/backend/ecmk_fetcher/upload.py +++ b/backend/ecmk_fetcher/upload.py @@ -28,6 +28,19 @@ def upload_file_to_sharepoint( ) +def upload_excel_to_sharepoint( + client: DomnaSharepointClient, + file_path: str, + sharepoint_path: str, +) -> None: + client.upload_file( + file_path=file_path, + sharepoint_path=sharepoint_path, + file_name=os.path.basename(file_path), + ) + + +# TODO: this should be moved to somewhere common and called by pashub fetcher def upload_file_to_s3_and_update_db( bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum ) -> None: diff --git a/backend/ecmk_fetcher/xml_processor.py b/backend/ecmk_fetcher/xml_processor.py new file mode 100644 index 00000000..f993038b --- /dev/null +++ b/backend/ecmk_fetcher/xml_processor.py @@ -0,0 +1,226 @@ +import xml.etree.ElementTree as ET +from typing import Any, List, Optional, TypedDict + + +from backend.ecmk_fetcher.reports import build_property_id +from datatypes.epc.domain.field_mappings import PROPERTY_TYPE_LOOKUP + + +# This file should ultimately live somewhere different, probably +class Floor(TypedDict): + area_m2: float + height_m: float + heat_loss_perimeter_m: float + party_wall_length_m: float + + +class Roof(TypedDict, total=False): + construction: int # TODO: map to str + insulation_location: int | str # TODO: map to str + insulation_thickness_mm: float | str + + +class BuildingPart(TypedDict): + identifier: str # e.g. "Main Dwelling", "Extension" + floors: List[Floor] + roof: Optional[Roof] + + +class SapPropertyDetails(TypedDict): + reference: str + address: str + property_type: str + building_parts: List[BuildingPart] + + +def _get_namespace(tag: str) -> str: + return tag.split("}")[0].strip("{") + + +def _require_text(value: Optional[str], field: str) -> str: + if value is None: + raise ValueError(f"Missing required field: {field}") + return value + + +def _parse_float(value: Optional[str], field: str) -> float: + if value is None: + raise ValueError(f"Missing float field: {field}") + return float(value) + + +def _parse_int(value: Optional[str], field: str) -> int: + if value is None: + raise ValueError(f"Missing int field: {field}") + return int(value) + + +def _parse_thickness_mm(value: Optional[str]) -> Optional[float | str]: + if value is None: + return None + stripped = value.replace("mm", "").strip() + try: + return float(stripped) + except ValueError: + return stripped + + +def parse_rdsap(xml_string: str) -> SapPropertyDetails: + root = ET.fromstring(xml_string) + + ns_uri: str = _get_namespace(root.tag) + ns: dict[str, str] = {"r": ns_uri} + + # --- Address --- + addr_elem = root.find(".//r:Address", ns) + if addr_elem is None: + raise ValueError("Address element not found") + + address_line_1: str = addr_elem.findtext( + "r:Address-Line-1", default="", namespaces=ns + ) + postcode: str = addr_elem.findtext("r:Postcode", default="", namespaces=ns) + + address_parts: List[str] = [ + address_line_1, + addr_elem.findtext("r:Address-Line-2", default="", namespaces=ns), + addr_elem.findtext("r:Post-Town", default="", namespaces=ns), + postcode, + ] + + address: str = ", ".join(part for part in address_parts if part) + reference: str = build_property_id(address_line_1, postcode) + + # --- Property Type --- + prop_type_text = root.findtext(".//r:Property-Type", namespaces=ns) + prop_type_code: int = _parse_int(prop_type_text, "Property-Type") + property_type: str = PROPERTY_TYPE_LOOKUP[prop_type_code] + + # --- Building Parts --- + building_parts: List[BuildingPart] = [] + + for bp in root.findall(".//r:SAP-Building-Part", ns): + + identifier_text = bp.findtext("r:Identifier", namespaces=ns) + identifier: str = _require_text(identifier_text, "Identifier") + + # Floors + floors: List[Floor] = [] + + for f in bp.findall(".//r:SAP-Floor-Dimension", ns): + + area = _parse_float( + f.findtext("r:Total-Floor-Area", namespaces=ns), + "Total-Floor-Area", + ) + + height = _parse_float( + f.findtext("r:Room-Height", namespaces=ns), + "Room-Height", + ) + + heat_loss = _parse_float( + f.findtext("r:Heat-Loss-Perimeter", namespaces=ns), + "Heat-Loss-Perimeter", + ) + + party_wall = _parse_float( + f.findtext("r:Party-Wall-Length", namespaces=ns), + "Party-Wall-Length", + ) + + floor: Floor = { + "area_m2": area, + "height_m": height, + "heat_loss_perimeter_m": heat_loss, + "party_wall_length_m": party_wall, + } + + floors.append(floor) + + # Roof (optional) + roof: Optional[Roof] = None + + roof_construction_text = bp.findtext("r:Roof-Construction", namespaces=ns) + roof_ins_loc_text = bp.findtext("r:Roof-Insulation-Location", namespaces=ns) + roof_thickness_text = bp.findtext("r:Roof-Insulation-Thickness", namespaces=ns) + + if ( + roof_construction_text is not None + or roof_ins_loc_text is not None + or roof_thickness_text is not None + ): + roof_dict: Roof = {} + + if roof_construction_text is not None: + roof_dict["construction"] = _parse_int( + roof_construction_text, "Roof-Construction" + ) + + if roof_ins_loc_text is not None: + try: + roof_dict["insulation_location"] = _parse_int( + roof_ins_loc_text, "Roof-Insulation-Location" + ) + except ValueError: + roof_dict["insulation_location"] = roof_ins_loc_text + + thickness = _parse_thickness_mm(roof_thickness_text) + if thickness is not None: + roof_dict["insulation_thickness_mm"] = thickness + + roof = roof_dict + + building_part: BuildingPart = { + "identifier": identifier, + "floors": floors, + "roof": roof, + } + + building_parts.append(building_part) + + result: SapPropertyDetails = { + "reference": reference, + "address": address, + "property_type": property_type, + "building_parts": building_parts, + } + + return result + + +def _normalise_identifier(identifier: str) -> str: + return identifier.lower().replace(" ", "_").replace("-", "_") + + +def flatten_sap_property(details: SapPropertyDetails) -> dict[str, Any]: + row: dict[str, Any] = {} + + row["reference"] = details["reference"] + row["address"] = details["address"] + row["property_type"] = details["property_type"] + + for bp in details["building_parts"]: + prefix = _normalise_identifier(bp["identifier"]) + + for i, floor in enumerate(bp["floors"], start=1): + floor_prefix = f"{prefix}_floor_{i}" + row[f"{floor_prefix}_area_m2"] = floor["area_m2"] + row[f"{floor_prefix}_height_m"] = floor["height_m"] + row[f"{floor_prefix}_heat_loss_perimeter_m"] = floor[ + "heat_loss_perimeter_m" + ] + row[f"{floor_prefix}_party_wall_length_m"] = floor["party_wall_length_m"] + + roof = bp.get("roof") + if roof: + if "construction" in roof: + row[f"{prefix}_roof_construction"] = roof["construction"] + if "insulation_location" in roof: + row[f"{prefix}_roof_insulation_location"] = roof["insulation_location"] + if "insulation_thickness_mm" in roof: + row[f"{prefix}_roof_insulation_thickness_mm"] = roof[ + "insulation_thickness_mm" + ] + + return row diff --git a/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx b/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx index a6478e3b..beb679c1 100644 Binary files a/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx and b/backend/pashub_fetcher/The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx differ diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index d045becd..e450d340 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -22,5 +22,5 @@ ENTRYPOINT ["python", "-m", "awslambdaric"] # ----------------------------- # Lambda handler # ----------------------------- -CMD ["backend.pashub_fetcher.handler.test_handler.handler"] +CMD ["backend.pashub_fetcher.handler.handler"] # CMD ["backend.pashub_fetcher.handler.handler.handler"] \ No newline at end of file diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 3689efe9..60b946c1 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,8 +1,7 @@ from datetime import datetime, timezone -import json import os import re -from typing import Any, Dict, List, Mapping, Optional +from typing import Any, Dict, List, Optional from openpyxl import load_workbook from backend.app.config import get_settings @@ -104,10 +103,19 @@ def upload_job_to_sharepoint( ) -def upload_job_to_s3_and_update_db(job_files: List[str], uprn: str) -> None: +def upload_job_to_s3_and_update_db( + job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str] +) -> None: bucket = "retrofit-energy-assessments-dev" - base_path = f"documents/uprn/{uprn}" + if not uprn and not hubspot_deal_id: + return + + base_path = ( + f"documents/uprn/{uprn}" + if uprn + else f"documents/hubspot_deal_id/{hubspot_deal_id}" + ) uploaded_files: List[UploadedFile] = [] @@ -118,12 +126,14 @@ def upload_job_to_s3_and_update_db(job_files: List[str], uprn: str) -> None: upload_file_to_s3(file_path, bucket, file_key) # load row to db + # TODO: use same upload_file_to_s3_and_update_db method as ecmk fetcher does uploaded_files.append( UploadedFile( s3_file_bucket=bucket, s3_file_key=file_key, s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn), + uprn=int(uprn) if uprn else None, + hubspot_deal_id=hubspot_deal_id, file_source=FileSourceEnum.PAS_HUB.value, file_type=infer_file_type(filename), ) @@ -144,6 +154,7 @@ def process_job( job_id = job.pashub_job_id uprn: Optional[str] = job.uprn or pashub_client.get_uprn_by_job_id(job_id) + hubspot_deal_id: Optional[str] = job.hubspot_deal_id if uprn: logger.info(f"Got UPRN {uprn} for job {job_id}") @@ -152,9 +163,9 @@ def process_job( job_files: List[str] = pashub_client.get_core_evidence_files_by_job_id(job_id) - if uprn: + if uprn or hubspot_deal_id: logger.info("Uploading files to s3") - upload_job_to_s3_and_update_db(job_files, uprn) + upload_job_to_s3_and_update_db(job_files, uprn, hubspot_deal_id) # # Comment out sharepoint loading for now: # Seems like the sharepoint link in pas hub is inconsistent in terms @@ -167,9 +178,8 @@ def process_job( @task_handler() -def handler(event: Mapping[str, Any], context: Any) -> None: +def handler(body: Dict[str, Any], context: Any) -> List[str]: logger.info("Received message") - logger.info(f"Number of events: {len(event.get('Records', []))}") settings = get_settings() @@ -185,48 +195,34 @@ def handler(event: Mapping[str, Any], context: Any) -> None: sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3 ) - saved_file_paths: List[str] = [] + logger.debug("Validating request body") + payload = PashubToAraTriggerRequest.model_validate(body) + logger.debug("Successfully validated request body") - for record in event.get("Records", []): - try: - body_dict = json.loads(record["body"]) - logger.debug("Validating request body") + try: + files: List[str] = process_job( + payload, + pashub_client, + sharepoint_client, + ) + except UnauthorizedError: + logger.warning("Token expired - refreshing") - payload = PashubToAraTriggerRequest.model_validate(body_dict) + pashub_client = get_pashub_client( + pas_hub_email, + pas_hub_password, + ) - logger.debug("Successfully validated request body") + # retry once + files = process_job( + payload, + pashub_client, + sharepoint_client, + ) - try: - files: List[str] = process_job( - payload, - pashub_client, - sharepoint_client, - ) - saved_file_paths.extend(files) + logger.info(f"Saved {len(files)} files") - except UnauthorizedError: - logger.warning("Token expired - refreshing") - - pashub_client = get_pashub_client( - pas_hub_email, - pas_hub_password, - ) - - # retry once - files: List[str] = process_job( - payload, - pashub_client, - sharepoint_client, - ) - saved_file_paths.extend(files) - - except Exception as e: - logger.info("Handler exception") - logger.error(f"Failed to process record: {e}") - - logger.info("Successfully loaded jobs from spreadsheet") - - logger.info(f"Saved {len(saved_file_paths)} files") + return files if __name__ == "__main__": diff --git a/backend/pashub_fetcher/local_handler/invoke_local_lambda.py b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py index 463ef9d8..219446fd 100644 --- a/backend/pashub_fetcher/local_handler/invoke_local_lambda.py +++ b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py @@ -12,7 +12,8 @@ payload = { { "body": json.dumps( { - "uprn": 123456, + "pashub_link": "https://pashub.net/jobs/00000000-0000-0000-0000-000000000000/details", + "uprn": "123456", } ) } diff --git a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py index 2e4f8380..518a8dc3 100644 --- a/backend/pashub_fetcher/pashub_to_ara_trigger_request.py +++ b/backend/pashub_fetcher/pashub_to_ara_trigger_request.py @@ -12,6 +12,8 @@ class PashubToAraTriggerRequest(BaseModel): uprn: Optional[str] = None landlord_property_id: Optional[str] = None deal_stage: Optional[str] = None + hubspot_listing_id: Optional[int] = None + hubspot_deal_id: Optional[str] = None @property def pashub_job_id(self) -> str: diff --git a/backend/pashub_fetcher/trigger_lambda_from_file.py b/backend/pashub_fetcher/trigger_lambda_from_file.py new file mode 100644 index 00000000..fb9d1cbf --- /dev/null +++ b/backend/pashub_fetcher/trigger_lambda_from_file.py @@ -0,0 +1,63 @@ +import json +import os +import re +from typing import Any, Dict, List + +from openpyxl import load_workbook + +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) +from backend.pashub_fetcher.handler.handler import handler + + +if __name__ == "__main__": + BASE_DIR = os.path.dirname(os.path.dirname(__file__)) + filepath: str = os.path.join( + BASE_DIR, + "pashub_fetcher", + "The_Guinness_Partnership_AtkinsR_alis_Coordination_Design_Board_1774881298.xlsx", + ) + + wb = load_workbook(filepath, data_only=True) + ws = wb["filtered_2"] + + HEADER_ROW = 3 + + headers: Dict[str, int] = {} + for col in range(1, ws.max_column + 1): + value = str(ws.cell(row=HEADER_ROW, column=col).value) + if value: + headers[value.strip()] = col + + name_col = headers["Name"] + link_col = headers["PasHub Link"] + hubspot_deal_id_col = headers["HubSpot ID"] + + trigger_requests: List[PashubToAraTriggerRequest] = [] + + for row in range(HEADER_ROW + 1, ws.max_row + 1): + name = ws.cell(row=row, column=name_col).value + link = ws.cell(row=row, column=link_col).value + hubspot_deal_id = ws.cell(row=row, column=hubspot_deal_id_col).value + + if not name or not link or not hubspot_deal_id: + continue + + match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link)) + if not match: + continue + + trigger_requests.append( + PashubToAraTriggerRequest( + pashub_link=str(link), hubspot_deal_id=str(hubspot_deal_id) + ) + ) + + # ---- Build fake SQS event ---- + event: Dict[str, Any] = { + "Records": [{"body": json.dumps(req.model_dump())} for req in trigger_requests] + } + + context = None + handler(event, context) diff --git a/datatypes/epc/domain/field_mappings.py b/datatypes/epc/domain/field_mappings.py new file mode 100644 index 00000000..cc0f9067 --- /dev/null +++ b/datatypes/epc/domain/field_mappings.py @@ -0,0 +1,3 @@ +PROPERTY_TYPE_LOOKUP = {0: "House", 1: "Bungalow", 2: "Flat", 3: "Maisonette"} +ROOF_CONSTRUCTION_LOOKUP = {} +ROOF_INSULATION_LOCATION_LOOKUP = {} diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index b95b544c..74c8264d 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -6,9 +6,9 @@ from etl.hubspot.utils import parse_hs_date class HubspotDealDiffer: COORDINATION_COMPLETE: List[str] = [ - "v1 ioe/mtp complete", - "v2 ioe/mtp complete", - "v3 ioe/mtp complete", + "(v1) ioe/mtp complete", + "(v2) ioe/mtp complete", + "(v3) ioe/mtp complete", ] RETROFIT_DESIGN_COMPLETE = "uploaded" LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"] @@ -149,19 +149,19 @@ class HubspotDealDiffer: def _coordination_completed( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status: str = new_deal.get("coordination_status", "") + new_status: str = new_deal.get("coordination_status") or "" return ( new_status != "" - and new_status in HubspotDealDiffer.COORDINATION_COMPLETE + and new_status.lower() in HubspotDealDiffer.COORDINATION_COMPLETE and new_status != old_deal.coordination_status ) @staticmethod def _design_completed(new_deal: Dict[str, str], old_deal: HubspotDealData) -> bool: - new_status: str = new_deal.get("design_status", "") + new_status: str = new_deal.get("coordination_status") or "" return ( new_status != "" - and new_status == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE + and new_status.lower() == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE and new_status != old_deal.design_status ) @@ -169,9 +169,9 @@ class HubspotDealDiffer: def _lodgement_completed( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status: str = new_deal.get("lodgement_status", "") + new_status: str = new_deal.get("coordination_status") or "" return ( new_status != "" - and new_status in HubspotDealDiffer.LODGEMENT_COMPLETE + and new_status.lower() in HubspotDealDiffer.LODGEMENT_COMPLETE and new_status != old_deal.lodgement_status ) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index e6c788ea..f7dc1076 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -104,10 +104,10 @@ def _trigger_pashub_fetcher(sqs_client: Any, hubspot_deal: Dict[str, str]) -> No message_body: Dict[str, Optional[str]] = { "pashub_link": hubspot_deal["pashub_link"], "address": None, # potentially available from Listing, leave as None for now - "sharepoint_link": hubspot_deal["sharepoint_link"], - "uprn": hubspot_deal["national_uprn"], - "landlord_property_id": hubspot_deal["owner_property_id"], - "deal_stage": hubspot_deal["deal_stage"], + "sharepoint_link": hubspot_deal.get("sharepoint_link", None), + "uprn": hubspot_deal.get("national_uprn", None), + "landlord_property_id": hubspot_deal.get("owner_property_id", None), + "deal_stage": hubspot_deal.get("deal_stage", None), } response = sqs_client.send_message( @@ -121,5 +121,5 @@ def _trigger_pashub_fetcher(sqs_client: Any, hubspot_deal: Dict[str, str]) -> No if __name__ == "__main__": - handler({"hubspot_deal_id": "371470706915"}, "") + handler({"hubspot_deal_id": "498926855369"}, "") print("beep") diff --git a/pytest.ini b/pytest.ini index 6cb3b611..33231c61 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/documents_parser/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/documents_parser/tests markers = integration: mark a test as an integration test diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py index 67e079ed..5e0255ac 100644 --- a/utils/sharepoint/domna_sharepoint_client.py +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -90,6 +90,41 @@ class DomnaSharepointClient: file_name, get_file_stream(file_path), sharepoint_path ) + def download_file(self, sharepoint_path: str, local_path: str) -> bool: + """ + Download a file from SharePoint to a local path. + + Returns True if the file was downloaded, False if it does not exist yet. + Raises on any other error. + """ + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + try: + metadata: Dict[str, Any] = sharepoint_client.get_file_metadata(sharepoint_path) + except ValueError: + return False + + download_url: Optional[str] = metadata.get("@microsoft.graph.downloadUrl") + if not download_url: + return False + + content: BytesIO = SharePointClient.download_sharepoint_file(download_url) + + parent_dir = os.path.dirname(local_path) + if parent_dir: + os.makedirs(parent_dir, exist_ok=True) + + with open(local_path, "wb") as f: + f.write(content.getvalue()) + + self.logger.debug(f"Downloaded SharePoint file to: {local_path}") + return True + def create_temp_file(self, content: BytesIO, path: str): # Ensure the path is under /tmp/ new_path = os.path.join("/tmp/sharepoint", path) diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py index 71f82b68..5807c3bd 100644 --- a/utils/sharepoint/sharepoint_client.py +++ b/utils/sharepoint/sharepoint_client.py @@ -278,6 +278,17 @@ class SharePointClient: # logger.debug(f"Listing folder contents from URL: {url}") return "GET", url, None + @api_call_decorator + def get_file_metadata(self, file_path: str) -> Dict[str, Any]: + """ + GET /drives/{drive-id}/root:/{file_path} + + Returns file metadata, including '@microsoft.graph.downloadUrl'. + Raises ValueError if the file does not exist (404). + """ + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{file_path}" + return "GET", url, None + @api_call_decorator def create_folder(self, file_name: str, folder_path: str) -> Dict[str, Any]: """ @@ -325,7 +336,7 @@ class SharePointClient: return self.upload_file(file_name, sharepoint_parent_id, file_stream) @staticmethod - def download_sharepoint_file(download_url): + def download_sharepoint_file(download_url: str) -> BytesIO: """ Downloads a file from the given URL and returns its content.