import os
from typing import Any

from openpyxl import Workbook, load_workbook
from openpyxl.worksheet.worksheet import Worksheet


def write_row(file_path: str, row_data: dict[str, Any]) -> None:
    """Append one record to the active sheet of an ``.xlsx`` workbook.

    If ``file_path`` does not exist, a new workbook is created whose first
    row holds the keys of ``row_data`` and whose second row holds its values.

    If the workbook already exists, row 1 is treated as the header row. Keys
    of ``row_data`` that are not yet headers are added as new columns: each
    new key is inserted immediately before the first *later* key of
    ``row_data`` that is already a header (so related columns stay grouped),
    or appended after the last header when no such key exists. Existing rows
    are shifted together with their headers. The record is then appended as
    a new row, with ``None`` in every column it has no value for.

    Args:
        file_path: Path of the workbook to create or update.
        row_data: Mapping of column header to cell value; its key order
            determines where new columns are inserted.
    """
    new_keys = list(row_data)

    if not os.path.exists(file_path):
        wb = Workbook()
        ws: Worksheet = wb.active  # type: ignore[assignment]
        ws.append(new_keys)
        ws.append(list(row_data.values()))
        wb.save(file_path)
        return

    wb = load_workbook(file_path)
    ws = wb.active  # type: ignore[assignment]

    # Keep exactly one entry per physical column so that list positions map
    # 1:1 onto sheet columns. Dropping blank header cells from the middle of
    # the list would shift every later index and make both insert_cols() and
    # the appended row land under the wrong headers. Only trailing blanks are
    # trimmed, so that new columns are appended right after the last header.
    headers: list[Any] = [cell.value for cell in ws[1]]
    while headers and headers[-1] is None:
        headers.pop()

    for position, key in enumerate(new_keys):
        if key in headers:
            continue

        # First key that follows this one in row_data and already has a
        # column; the new column goes directly in front of it.
        insert_before = next(
            (later for later in new_keys[position + 1:] if later in headers),
            None,
        )

        if insert_before is None:
            headers.append(key)
            ws.cell(row=1, column=len(headers), value=key)
        else:
            col_idx = headers.index(insert_before) + 1  # 1-based
            ws.insert_cols(col_idx)
            ws.cell(row=1, column=col_idx, value=key)
            headers.insert(col_idx - 1, key)

    # Blank-header columns stay empty in the new row.
    ws.append([None if col is None else row_data.get(col) for col in headers])
    wb.save(file_path)
file_path=file_path, - base_path=sharepoint_base_path, - subpath=sharepoint_address, - ) - logger.info( - f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" - ) + if report_type == FileDownloadButtonType.RAW_XML.value: + with open(file_path, "r", encoding="utf-8") as f: + xml_string = f.read() + details = parse_rdsap(xml_string) + row_data = flatten_sap_property(details) + write_row(local_dimensions_path, row_data) + upload_excel_to_sharepoint( + client=sharepoint_client, + file_path=local_dimensions_path, + sharepoint_path=sharepoint_excel_path, + ) + logger.info( + f"Written dimensions row and uploaded Dimensions.xlsx for {address}" + ) + else: + upload_file_to_sharepoint( + client=sharepoint_client, + file_path=file_path, + base_path=sharepoint_base_path, + subpath=sharepoint_address, + ) + logger.info( + f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" + ) # Upload to s3 and update db upload_file_to_s3_and_update_db( diff --git a/backend/ecmk_fetcher/reports.py b/backend/ecmk_fetcher/reports.py index d8d11d50..d2f8ea52 100644 --- a/backend/ecmk_fetcher/reports.py +++ b/backend/ecmk_fetcher/reports.py @@ -14,6 +14,7 @@ class FileDownloadButtonType(Enum): REPORT_TYPES = [ FileDownloadButtonType.ASSESSOR_HUB_SITENOTE_REPORT.value, FileDownloadButtonType.SITENOTE_REPORT.value, + FileDownloadButtonType.RAW_XML.value, ] @@ -23,6 +24,8 @@ def map_report_type_to_db_file_type(report_type: int) -> FileTypeEnum: return FileTypeEnum.ECMK_SITE_NOTE case FileDownloadButtonType.SITENOTE_REPORT.value: return FileTypeEnum.ECMK_RD_SAP_SITE_NOTE + case FileDownloadButtonType.RAW_XML.value: + return FileTypeEnum.ECMK_SURVEY_XML case _: raise ValueError("Unknown report type") diff --git a/backend/ecmk_fetcher/tests/test_excel_writer.py b/backend/ecmk_fetcher/tests/test_excel_writer.py new file mode 100644 index 00000000..3f730951 --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_excel_writer.py @@ -0,0 +1,123 @@ 
+import os +import pathlib +import pytest +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet + +from backend.ecmk_fetcher.excel_writer import write_row + + +@pytest.fixture +def xlsx_path(tmp_path: pathlib.Path) -> str: + return str(tmp_path / "output.xlsx") + + +def _active_sheet(file_path: str) -> Worksheet: + ws = load_workbook(file_path).active + assert isinstance(ws, Worksheet) + return ws + + +def test_write_row_creates_file(xlsx_path: str): + # arrange + row = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + } + + # act + write_row(xlsx_path, row) + + # assert + assert os.path.exists(xlsx_path) + ws = _active_sheet(xlsx_path) + assert [c.value for c in ws[1]] == list(row.keys()) + assert [c.value for c in ws[2]] == list(row.values()) + + +def test_write_row_appends_to_existing(xlsx_path: str): + # arrange + row_a = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + } + row_b = { + "address": "2 Other Street, XY1 2AB", + "property_type": "Flat", + "main_dwelling_floor_1_area_m2": 30.0, + } + + # act + write_row(xlsx_path, row_a) + write_row(xlsx_path, row_b) + + # assert + ws = _active_sheet(xlsx_path) + assert ws.max_row == 3 # 1 header + 2 data rows + assert [c.value for c in ws[1]] == list(row_a.keys()) + assert [c.value for c in ws[2]] == list(row_a.values()) + assert [c.value for c in ws[3]] == list(row_b.values()) + + +def test_write_row_inserts_new_columns_at_logical_positions(xlsx_path: str): + # arrange + # First row: main_dwelling floor 1 + roof + # Second row: also has main_dwelling floor 2 — should be inserted between floor 1 and roof, + # not appended to the end + row_a = { + "address": "1 Fake Avenue, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + "main_dwelling_floor_1_height_m": 2.46, + "main_dwelling_roof_construction": 4, + } + row_b = { + 
"address": "2 Other Street, XY1 2AB", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 50.0, + "main_dwelling_floor_1_height_m": 2.5, + "main_dwelling_floor_2_area_m2": 48.0, + "main_dwelling_floor_2_height_m": 2.4, + "main_dwelling_roof_construction": 4, + } + + # act + write_row(xlsx_path, row_a) + write_row(xlsx_path, row_b) + + # assert + ws = _active_sheet(xlsx_path) + + assert [c.value for c in ws[1]] == [ + "address", + "property_type", + "main_dwelling_floor_1_area_m2", + "main_dwelling_floor_1_height_m", + "main_dwelling_floor_2_area_m2", # inserted before roof, not at end + "main_dwelling_floor_2_height_m", + "main_dwelling_roof_construction", + ] + + # row_a had no floor_2 data — those cells should be empty + assert [c.value for c in ws[2]] == [ + "1 Fake Avenue, AB24 5CD", + "House", + 43.61, + 2.46, + None, # main_dwelling_floor_2_area_m2 + None, # main_dwelling_floor_2_height_m + 4, + ] + + # row_b should be fully populated + assert [c.value for c in ws[3]] == [ + "2 Other Street, XY1 2AB", + "House", + 50.0, + 2.5, + 48.0, + 2.4, + 4, + ] diff --git a/backend/ecmk_fetcher/tests/test_xml_processor.py b/backend/ecmk_fetcher/tests/test_xml_processor.py new file mode 100644 index 00000000..3695b09d --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_xml_processor.py @@ -0,0 +1,329 @@ +from backend.ecmk_fetcher.xml_processor import ( + SapPropertyDetails, + flatten_sap_property, + parse_rdsap, +) + + +SAMPLE_XML = """ + + +
+ 1 + Fake Avenue + Random + AB24 5CD +
+
+
+ + + + 0 + + + + + 1 + Main Dwelling + C + 7 + + 4 + 2 + 100mm + + 4 + 4 + + + + 25.31 + 2.46 + 43.61 + 0 + 0 + + + + 26.16 + 2.44 + 42.33 + 1 + 0 + + + + + + + 2 + Extension + C + + 8 + 7 + AB + + 3 + 4 + + + + 6.85 + 2.24 + 4.46 + 0 + 0 + + + + + + + + + +
+""" + + +NO_ROOF_XML = """ + + +
+ 5 + Somewhere + XY1 2AB +
+
+
+ + + 0 + + + Main Dwelling + + + 10.0 + 2.5 + 50.0 + 0 + 3.0 + + + + + + +
+""" + + +def test_parse_rdsap_contract(): + # arrange + act + result: SapPropertyDetails = parse_rdsap(SAMPLE_XML) + + # assert + assert result == { + "reference": "1AB245CD", + "address": "1, Fake Avenue, Random, AB24 5CD", + "property_type": "House", + "building_parts": [ + { + "identifier": "Main Dwelling", + "floors": [ + { + "area_m2": 43.61, + "height_m": 2.46, + "heat_loss_perimeter_m": 25.31, + "party_wall_length_m": 0.0, + }, + { + "area_m2": 42.33, + "height_m": 2.44, + "heat_loss_perimeter_m": 26.16, + "party_wall_length_m": 0.0, + }, + ], + "roof": { + "construction": 4, + "insulation_location": 2, + "insulation_thickness_mm": 100.0, + }, + }, + { + "identifier": "Extension", + "floors": [ + { + "area_m2": 4.46, + "height_m": 2.24, + "heat_loss_perimeter_m": 6.85, + "party_wall_length_m": 0.0, + } + ], + "roof": { + "construction": 8, + "insulation_location": 7, + }, + }, + ], + } + + +ND_THICKNESS_XML = """ + + +
+ 1 + Somewhere + AB1 2CD +
+
+
+ + + 0 + + + Main Dwelling + 4 + 2 + ND + + + 10.0 + 2.5 + 50.0 + 0 + 0 + + + + + + +
+""" + +ND_INSULATION_LOCATION_XML = """ + + +
+ 1 + Somewhere + AB1 2CD +
+
+
+ + + 0 + + + Main Dwelling + 4 + ND + 250 + + + 10.0 + 2.5 + 50.0 + 0 + 0 + + + + + + +
+""" + + +def test_parse_rdsap_nd_thickness(): + # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Thickness + # — it should be retained as-is rather than raising + + # arrange + act + result: SapPropertyDetails = parse_rdsap(ND_THICKNESS_XML) + + # assert + assert result["building_parts"][0]["roof"] == { + "construction": 4, + "insulation_location": 2, + "insulation_thickness_mm": "ND", + } + + +def test_parse_rdsap_nd_location(): + # 'ND' (not determined) is a valid value in the wild for Roof-Insulation-Location + # — it should be retained as-is rather than raising + + # arrange + act + result: SapPropertyDetails = parse_rdsap(ND_INSULATION_LOCATION_XML) + + # assert + assert result["building_parts"][0]["roof"] == { + "construction": 4, + "insulation_location": "ND", + "insulation_thickness_mm": 250, + } + + +def test_flatten_full(): + # Two building parts; Main Dwelling has two floors + full roof, + # Extension has one floor + partial roof (no thickness) + + # arrange + details: SapPropertyDetails = parse_rdsap(SAMPLE_XML) + + # act + result = flatten_sap_property(details) + + # assert + assert result == { + "reference": "1AB245CD", + "address": "1, Fake Avenue, Random, AB24 5CD", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 43.61, + "main_dwelling_floor_1_height_m": 2.46, + "main_dwelling_floor_1_heat_loss_perimeter_m": 25.31, + "main_dwelling_floor_1_party_wall_length_m": 0.0, + "main_dwelling_floor_2_area_m2": 42.33, + "main_dwelling_floor_2_height_m": 2.44, + "main_dwelling_floor_2_heat_loss_perimeter_m": 26.16, + "main_dwelling_floor_2_party_wall_length_m": 0.0, + "main_dwelling_roof_construction": 4, + "main_dwelling_roof_insulation_location": 2, + "main_dwelling_roof_insulation_thickness_mm": 100.0, + "extension_floor_1_area_m2": 4.46, + "extension_floor_1_height_m": 2.24, + "extension_floor_1_heat_loss_perimeter_m": 6.85, + "extension_floor_1_party_wall_length_m": 0.0, + "extension_roof_construction": 8, + 
"extension_roof_insulation_location": 7, + } + + +def test_flatten_no_roof(): + # Single building part with no roof — roof keys must be absent entirely + + # arrange + details: SapPropertyDetails = parse_rdsap(NO_ROOF_XML) + + # act + result = flatten_sap_property(details) + + # assert + assert result == { + "reference": "5XY12AB", + "address": "5, Somewhere, XY1 2AB", + "property_type": "House", + "main_dwelling_floor_1_area_m2": 50.0, + "main_dwelling_floor_1_height_m": 2.5, + "main_dwelling_floor_1_heat_loss_perimeter_m": 10.0, + "main_dwelling_floor_1_party_wall_length_m": 3.0, + } diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py index 0a744e53..8cb451b0 100644 --- a/backend/ecmk_fetcher/upload.py +++ b/backend/ecmk_fetcher/upload.py @@ -28,6 +28,18 @@ def upload_file_to_sharepoint( ) +def upload_excel_to_sharepoint( + client: DomnaSharepointClient, + file_path: str, + sharepoint_path: str, +) -> None: + client.upload_file( + file_path=file_path, + sharepoint_path=sharepoint_path, + file_name=os.path.basename(file_path), + ) + + def upload_file_to_s3_and_update_db( bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum ) -> None: diff --git a/backend/ecmk_fetcher/xml_processor.py b/backend/ecmk_fetcher/xml_processor.py new file mode 100644 index 00000000..f993038b --- /dev/null +++ b/backend/ecmk_fetcher/xml_processor.py @@ -0,0 +1,226 @@ +import xml.etree.ElementTree as ET +from typing import Any, List, Optional, TypedDict + + +from backend.ecmk_fetcher.reports import build_property_id +from datatypes.epc.domain.field_mappings import PROPERTY_TYPE_LOOKUP + + +# This file should ultimately live somewhere different, probably +class Floor(TypedDict): + area_m2: float + height_m: float + heat_loss_perimeter_m: float + party_wall_length_m: float + + +class Roof(TypedDict, total=False): + construction: int # TODO: map to str + insulation_location: int | str # TODO: map to str + insulation_thickness_mm: float | str + 
+ +class BuildingPart(TypedDict): + identifier: str # e.g. "Main Dwelling", "Extension" + floors: List[Floor] + roof: Optional[Roof] + + +class SapPropertyDetails(TypedDict): + reference: str + address: str + property_type: str + building_parts: List[BuildingPart] + + +def _get_namespace(tag: str) -> str: + return tag.split("}")[0].strip("{") + + +def _require_text(value: Optional[str], field: str) -> str: + if value is None: + raise ValueError(f"Missing required field: {field}") + return value + + +def _parse_float(value: Optional[str], field: str) -> float: + if value is None: + raise ValueError(f"Missing float field: {field}") + return float(value) + + +def _parse_int(value: Optional[str], field: str) -> int: + if value is None: + raise ValueError(f"Missing int field: {field}") + return int(value) + + +def _parse_thickness_mm(value: Optional[str]) -> Optional[float | str]: + if value is None: + return None + stripped = value.replace("mm", "").strip() + try: + return float(stripped) + except ValueError: + return stripped + + +def parse_rdsap(xml_string: str) -> SapPropertyDetails: + root = ET.fromstring(xml_string) + + ns_uri: str = _get_namespace(root.tag) + ns: dict[str, str] = {"r": ns_uri} + + # --- Address --- + addr_elem = root.find(".//r:Address", ns) + if addr_elem is None: + raise ValueError("Address element not found") + + address_line_1: str = addr_elem.findtext( + "r:Address-Line-1", default="", namespaces=ns + ) + postcode: str = addr_elem.findtext("r:Postcode", default="", namespaces=ns) + + address_parts: List[str] = [ + address_line_1, + addr_elem.findtext("r:Address-Line-2", default="", namespaces=ns), + addr_elem.findtext("r:Post-Town", default="", namespaces=ns), + postcode, + ] + + address: str = ", ".join(part for part in address_parts if part) + reference: str = build_property_id(address_line_1, postcode) + + # --- Property Type --- + prop_type_text = root.findtext(".//r:Property-Type", namespaces=ns) + prop_type_code: int = 
_parse_int(prop_type_text, "Property-Type") + property_type: str = PROPERTY_TYPE_LOOKUP[prop_type_code] + + # --- Building Parts --- + building_parts: List[BuildingPart] = [] + + for bp in root.findall(".//r:SAP-Building-Part", ns): + + identifier_text = bp.findtext("r:Identifier", namespaces=ns) + identifier: str = _require_text(identifier_text, "Identifier") + + # Floors + floors: List[Floor] = [] + + for f in bp.findall(".//r:SAP-Floor-Dimension", ns): + + area = _parse_float( + f.findtext("r:Total-Floor-Area", namespaces=ns), + "Total-Floor-Area", + ) + + height = _parse_float( + f.findtext("r:Room-Height", namespaces=ns), + "Room-Height", + ) + + heat_loss = _parse_float( + f.findtext("r:Heat-Loss-Perimeter", namespaces=ns), + "Heat-Loss-Perimeter", + ) + + party_wall = _parse_float( + f.findtext("r:Party-Wall-Length", namespaces=ns), + "Party-Wall-Length", + ) + + floor: Floor = { + "area_m2": area, + "height_m": height, + "heat_loss_perimeter_m": heat_loss, + "party_wall_length_m": party_wall, + } + + floors.append(floor) + + # Roof (optional) + roof: Optional[Roof] = None + + roof_construction_text = bp.findtext("r:Roof-Construction", namespaces=ns) + roof_ins_loc_text = bp.findtext("r:Roof-Insulation-Location", namespaces=ns) + roof_thickness_text = bp.findtext("r:Roof-Insulation-Thickness", namespaces=ns) + + if ( + roof_construction_text is not None + or roof_ins_loc_text is not None + or roof_thickness_text is not None + ): + roof_dict: Roof = {} + + if roof_construction_text is not None: + roof_dict["construction"] = _parse_int( + roof_construction_text, "Roof-Construction" + ) + + if roof_ins_loc_text is not None: + try: + roof_dict["insulation_location"] = _parse_int( + roof_ins_loc_text, "Roof-Insulation-Location" + ) + except ValueError: + roof_dict["insulation_location"] = roof_ins_loc_text + + thickness = _parse_thickness_mm(roof_thickness_text) + if thickness is not None: + roof_dict["insulation_thickness_mm"] = thickness + + roof = roof_dict + 
+ building_part: BuildingPart = { + "identifier": identifier, + "floors": floors, + "roof": roof, + } + + building_parts.append(building_part) + + result: SapPropertyDetails = { + "reference": reference, + "address": address, + "property_type": property_type, + "building_parts": building_parts, + } + + return result + + +def _normalise_identifier(identifier: str) -> str: + return identifier.lower().replace(" ", "_").replace("-", "_") + + +def flatten_sap_property(details: SapPropertyDetails) -> dict[str, Any]: + row: dict[str, Any] = {} + + row["reference"] = details["reference"] + row["address"] = details["address"] + row["property_type"] = details["property_type"] + + for bp in details["building_parts"]: + prefix = _normalise_identifier(bp["identifier"]) + + for i, floor in enumerate(bp["floors"], start=1): + floor_prefix = f"{prefix}_floor_{i}" + row[f"{floor_prefix}_area_m2"] = floor["area_m2"] + row[f"{floor_prefix}_height_m"] = floor["height_m"] + row[f"{floor_prefix}_heat_loss_perimeter_m"] = floor[ + "heat_loss_perimeter_m" + ] + row[f"{floor_prefix}_party_wall_length_m"] = floor["party_wall_length_m"] + + roof = bp.get("roof") + if roof: + if "construction" in roof: + row[f"{prefix}_roof_construction"] = roof["construction"] + if "insulation_location" in roof: + row[f"{prefix}_roof_insulation_location"] = roof["insulation_location"] + if "insulation_thickness_mm" in roof: + row[f"{prefix}_roof_insulation_thickness_mm"] = roof[ + "insulation_thickness_mm" + ] + + return row diff --git a/datatypes/epc/domain/field_mappings.py b/datatypes/epc/domain/field_mappings.py new file mode 100644 index 00000000..cc0f9067 --- /dev/null +++ b/datatypes/epc/domain/field_mappings.py @@ -0,0 +1,3 @@ +PROPERTY_TYPE_LOOKUP = {0: "House", 1: "Bungalow", 2: "Flat", 3: "Maisonette"} +ROOF_CONSTRUCTION_LOOKUP = {} +ROOF_INSULATION_LOCATION_LOOKUP = {} diff --git a/pytest.ini b/pytest.ini index 55c2873a..8f8ceeef 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath 
= . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ markers = integration: mark a test as an integration test diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py index 67e079ed..5e0255ac 100644 --- a/utils/sharepoint/domna_sharepoint_client.py +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -90,6 +90,41 @@ class DomnaSharepointClient: file_name, get_file_stream(file_path), sharepoint_path ) + def download_file(self, sharepoint_path: str, local_path: str) -> bool: + """ + Download a file from SharePoint to a local path. + + Returns True if the file was downloaded, False if it does not exist yet. + Raises on any other error. 
+ """ + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + try: + metadata: Dict[str, Any] = sharepoint_client.get_file_metadata(sharepoint_path) + except ValueError: + return False + + download_url: Optional[str] = metadata.get("@microsoft.graph.downloadUrl") + if not download_url: + return False + + content: BytesIO = SharePointClient.download_sharepoint_file(download_url) + + parent_dir = os.path.dirname(local_path) + if parent_dir: + os.makedirs(parent_dir, exist_ok=True) + + with open(local_path, "wb") as f: + f.write(content.getvalue()) + + self.logger.debug(f"Downloaded SharePoint file to: {local_path}") + return True + def create_temp_file(self, content: BytesIO, path: str): # Ensure the path is under /tmp/ new_path = os.path.join("/tmp/sharepoint", path) diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py index 71f82b68..5807c3bd 100644 --- a/utils/sharepoint/sharepoint_client.py +++ b/utils/sharepoint/sharepoint_client.py @@ -278,6 +278,17 @@ class SharePointClient: # logger.debug(f"Listing folder contents from URL: {url}") return "GET", url, None + @api_call_decorator + def get_file_metadata(self, file_path: str) -> Dict[str, Any]: + """ + GET /drives/{drive-id}/root:/{file_path} + + Returns file metadata, including '@microsoft.graph.downloadUrl'. + Raises ValueError if the file does not exist (404). 
+ """ + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{file_path}" + return "GET", url, None + @api_call_decorator def create_folder(self, file_name: str, folder_path: str) -> Dict[str, Any]: """ @@ -325,7 +336,7 @@ class SharePointClient: return self.upload_file(file_name, sharepoint_parent_id, file_stream) @staticmethod - def download_sharepoint_file(download_url): + def download_sharepoint_file(download_url: str) -> BytesIO: """ Downloads a file from the given URL and returns its content.