From 5f9c3e9bea4efedbae82ecc11a7b372c3acaaa79 Mon Sep 17 00:00:00 2001
From: Daniel Roth
Date: Thu, 16 Apr 2026 14:05:11 +0000
Subject: [PATCH] =?UTF-8?q?Load=20BuildingConstruction=20from=20SiteNotes?=
 =?UTF-8?q?=20JSON=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review amendments (text in this notes area is ignored by `git am`):
 * _wall_thickness_in: values such as "300mm" or "Not known" no longer
   raise ValueError; the leading digits are used, otherwise 0.
 * extract_building_construction: a missing "Main Building" marker no
   longer raises ValueError; parsing falls back to the section start.
 * Fields shared by main building and extensions are read in one helper,
   _wall_construction_fields.
 * Hunk line counts and the diffstat were recomputed; the post-image blob
   id on the "index" line predates these amendments.

 backend/documents_parser/extractor.py | 160 ++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

diff --git a/backend/documents_parser/extractor.py b/backend/documents_parser/extractor.py
index 32572261..ad3c5220 100644
--- a/backend/documents_parser/extractor.py
+++ b/backend/documents_parser/extractor.py
@@ -1,7 +1,12 @@
 from datetime import datetime
+from typing import Any, Dict, List, Optional
 
 from datatypes.epc.surveys.pashub_rdsap_site_notes import (
+    BuildingConstruction,
+    ExtensionConstruction,
+    FloorConstruction,
     General,
+    MainBuildingConstruction,
     PasHubRdSapSiteNotes,
 )
 
@@ -10,7 +15,9 @@ class PasHubRdSapSiteNotesExtractor:
     def __init__(self, text_list: list[str]) -> None:
         self.text_list = text_list
 
-    def _get(self, key: str, offset: int = 1) -> str | None:
+    # --- generic helpers ---
+
+    def _get(self, key: str, offset: int = 1) -> Optional[str]:
         try:
             idx = self.text_list.index(key)
             return self.text_list[idx + offset].strip() or None
@@ -21,6 +28,57 @@ class PasHubRdSapSiteNotesExtractor:
         val = self._get(key, offset)
         return val is not None and val.lower() == "yes"
 
+    def _get_in(self, lst: List[str], key: str, offset: int = 1) -> Optional[str]:
+        """Return the stripped entry ``offset`` places after ``key`` in ``lst``.
+
+        Returns ``None`` when ``key`` is absent, the target index is out of
+        range, or the entry is blank.
+        """
+        try:
+            idx = lst.index(key)
+            return lst[idx + offset].strip() or None
+        except (ValueError, IndexError):
+            return None
+
+    def _bool_in(self, lst: List[str], key: str) -> bool:
+        """Return True only when the entry after ``key`` is "yes" (any case)."""
+        val = self._get_in(lst, key)
+        return val is not None and val.lower() == "yes"
+
+    def _is_known_in(self, lst: List[str], key: str) -> bool:
+        """Return True when ``key`` has a value other than "not known"."""
+        val = self._get_in(lst, key)
+        return val is not None and val.lower() != "not known"
+
+    def _wall_thickness_in(self, lst: List[str]) -> int:
+        """Return the leading integer of the "Wall thickness:" entry, else 0.
+
+        Accepts "300", "300 mm" and "300mm"; a missing or non-numeric entry
+        (for example "Not known") yields 0 instead of raising ValueError.
+        """
+        val = self._get_in(lst, "Wall thickness:")
+        if not val:
+            return 0
+        token = val.split()[0]
+        # Keep only the leading ASCII digits so a unit suffix cannot break int().
+        digits = token[: len(token) - len(token.lstrip("0123456789"))]
+        return int(digits) if digits else 0
+
+    def _section(self, start: str, end: str) -> List[str]:
+        """Return ``text_list`` from ``start`` up to, not including, ``end``.
+
+        ``end`` is searched for from ``start`` onwards. Returns an empty list
+        when either marker is missing.
+        """
+        try:
+            start_idx = self.text_list.index(start)
+            end_idx = self.text_list.index(end, start_idx)
+            return self.text_list[start_idx:end_idx]
+        except ValueError:
+            return []
+
+    # --- public extract methods ---
+
     def extract(self) -> PasHubRdSapSiteNotes:
         raise NotImplementedError
 
@@ -58,3 +116,103 @@ class PasHubRdSapSiteNotesExtractor:
             gas_meter_accessible=self._bool("Is the gas meter accessible?"),
             measurements_location=self._get("Select Measurements Location:") or "",
         )
+
+    def extract_building_construction(self) -> BuildingConstruction:
+        """Build ``BuildingConstruction`` from the "Building Construction" section.
+
+        The section ends before "Building Measurements". Entries from
+        "Main Building" up to the first "Extension N" marker feed the main
+        building and floor; each "Extension N" block (N = 1, 2, ... with no
+        gaps) becomes one extension. ``extensions`` is ``None`` when no
+        extension marker is present. A missing "Main Building" marker falls
+        back to the start of the section rather than raising.
+        """
+        bc_section = self._section("Building Construction", "Building Measurements")
+
+        # Positions of the consecutive "Extension N" markers followed by the
+        # section end, so neighbouring entries delimit each extension's slice.
+        boundaries = []
+        number = 1
+        while f"Extension {number}" in bc_section:
+            boundaries.append(bc_section.index(f"Extension {number}"))
+            number += 1
+        boundaries.append(len(bc_section))
+
+        # Stay best-effort like the lookup helpers: without the marker, parse
+        # from the start of the section instead of raising ValueError.
+        try:
+            main_start = bc_section.index("Main Building")
+        except ValueError:
+            main_start = 0
+        main_data = bc_section[main_start : boundaries[0]]
+
+        extensions = [
+            self._parse_extension_construction(ext_id, bc_section[begin:end])
+            for ext_id, (begin, end) in enumerate(
+                zip(boundaries, boundaries[1:]), start=1
+            )
+        ]
+
+        return BuildingConstruction(
+            main_building=self._parse_main_building_construction(main_data),
+            floor=self._parse_floor_construction(main_data),
+            extensions=extensions or None,
+        )
+
+    # --- private parsing helpers ---
+
+    def _wall_construction_fields(self, data: List[str]) -> Dict[str, Any]:
+        """Return the keyword arguments shared by main building and extensions."""
+        return {
+            "age_range": self._get_in(data, "Age Range:") or "",
+            "age_indicators": (
+                self._get_in(data, "Record indicators of property age:") or ""
+            ),
+            "walls_construction_type": (
+                self._get_in(data, "Walls - Construction Type:") or ""
+            ),
+            "cavity_construction_indicators": (
+                self._get_in(
+                    data, "Record external indicators of Cavity Construction:"
+                )
+                or ""
+            ),
+            "walls_insulation_type": (
+                self._get_in(data, "Walls - Insulation Type:") or ""
+            ),
+            # Unlike its neighbours this stays None when the entry is absent.
+            "filled_cavity_indicators": self._get_in(
+                data, "Record indicators of filled cavity:"
+            ),
+            "thermal_conductivity_of_wall_insulation": (
+                self._get_in(data, "Thermal conductivity of wall insulation:") or ""
+            ),
+            "wall_u_value_known": self._is_known_in(data, "Wall U-Value known?"),
+            "wall_thickness_mm": self._wall_thickness_in(data),
+            "party_wall_construction_type": (
+                self._get_in(data, "Party wall construction type:") or ""
+            ),
+        }
+
+    def _parse_main_building_construction(
+        self, data: List[str]
+    ) -> MainBuildingConstruction:
+        """Parse the main building's age and wall fields from ``data``."""
+        return MainBuildingConstruction(**self._wall_construction_fields(data))
+
+    def _parse_extension_construction(
+        self, ext_id: int, data: List[str]
+    ) -> ExtensionConstruction:
+        """Parse one extension's age and wall fields, tagged with ``ext_id``."""
+        return ExtensionConstruction(
+            id=ext_id, **self._wall_construction_fields(data)
+        )
+
+    def _parse_floor_construction(self, data: List[str]) -> FloorConstruction:
+        """Parse the floor fields from ``data``."""
+        return FloorConstruction(
+            floor_type=self._get_in(data, "Floor type:") or "",
+            floor_construction=self._get_in(data, "Floor Construction:") or "",
+            floor_insulation_type=self._get_in(data, "Floor Insulation Type:") or "",
+            floor_u_value_known=self._is_known_in(data, "Floor U-Value known?"),
+        )