#!/usr/bin/env python3
"""
Parse a local site-notes PDF and load the result into the database.

Usage:
    python local_runner.py [path/to/site_notes.pdf]

When no path is given, the bundled Elmhurst fixture PDF is used.
"""

import sys
from typing import List, Optional, Tuple

from backend.app.db.connection import db_session
from backend.app.db.models.epc_property import (
    EpcBuildingPartModel,
    EpcEnergyElementModel,
    EpcFlatDetailsModel,
    EpcFloorDimensionModel,
    EpcMainHeatingDetailModel,
    EpcPropertyEnergyPerformanceModel,
    EpcPropertyModel,
    EpcWindowModel,
)
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
from domain.epc.epc_property_data import EnergyElement, EpcPropertyData
from domain.epc.mapper import EpcPropertyDataMapper

# PDF loaded when the script is started without a command-line argument.
_DEFAULT_PDF_PATH = "backend/documents_parser/tests/fixtures/ElmhurstSiteNotes.pdf"


def _parse_pdf(pdf_path: str) -> EpcPropertyData:
    """Read the PDF at *pdf_path* and map it to an ``EpcPropertyData``.

    The extractor is chosen from the document text: if the text contains
    the marker "Elmhurst Energy Systems" the Elmhurst extractor is used on
    the per-page text, otherwise the PasHub RdSAP extractor is used on the
    token list produced by ``pdf_to_text_list``.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes: bytes = f.read()

    pages: List[str] = pdf_to_pages(pdf_bytes)
    full_text: str = "\n".join(pages)

    if "Elmhurst Energy Systems" in full_text:
        site_notes = ElmhurstSiteNotesExtractor(pages).extract()
        return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    tokens: List[str] = pdf_to_text_list(pdf_bytes)
    pashub_notes = PasHubRdSapSiteNotesExtractor(tokens).extract()
    return EpcPropertyDataMapper.from_site_notes(pashub_notes)


def _insert_energy_elements(
    session,
    elements: List[EnergyElement],
    element_type: str,
    epc_property_id: int,
) -> None:
    """Add one ``EpcEnergyElementModel`` to *session* per item in *elements*."""
    for el in elements:
        session.add(
            EpcEnergyElementModel.from_domain(el, element_type, epc_property_id)
        )


def _insert_optional_energy_element(
    session,
    el: Optional[EnergyElement],
    element_type: str,
    epc_property_id: int,
) -> None:
    """Add *el* to *session* as an ``EpcEnergyElementModel``; no-op if ``None``."""
    if el is not None:
        session.add(
            EpcEnergyElementModel.from_domain(el, element_type, epc_property_id)
        )


def run(pdf_path: str) -> None:
    """Parse *pdf_path* and add the property and its child rows to the database.

    Prints a progress line after parsing, and the new property id and
    address once the session context has exited.

    Raises:
        RuntimeError: if a parent row still has no ``id`` after
            ``session.flush()``; child rows need that id as their key.
    """
    data: EpcPropertyData = _parse_pdf(pdf_path)
    print("successfully mapped pdf")

    with db_session() as session:
        epc_prop: EpcPropertyModel = EpcPropertyModel.from_epc_property_data(data)
        session.add(epc_prop)
        # Flush before reading the id: every child row below is keyed on it.
        session.flush()
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if epc_prop.id is None:
            raise RuntimeError("EpcPropertyModel has no id after flush")
        epc_property_id: int = epc_prop.id

        session.add(
            EpcPropertyEnergyPerformanceModel.from_epc_property_data(
                data, epc_property_id=epc_property_id
            )
        )

        for detail in data.sap_heating.main_heating_details:
            session.add(EpcMainHeatingDetailModel.from_domain(detail, epc_property_id))

        for part in data.sap_building_parts:
            bp: EpcBuildingPartModel = EpcBuildingPartModel.from_domain(
                part, epc_property_id
            )
            session.add(bp)
            # Each building part is flushed on its own because its floor
            # dimensions are keyed on the building part's id.
            session.flush()
            if bp.id is None:
                raise RuntimeError("EpcBuildingPartModel has no id after flush")
            for dim in part.sap_floor_dimensions:
                session.add(EpcFloorDimensionModel.from_domain(dim, bp.id))

        for window in data.sap_windows:
            session.add(EpcWindowModel.from_domain(window, epc_property_id))

        # Element groups that are lists, paired with their element_type label.
        list_elements: List[Tuple[List[EnergyElement], str]] = [
            (data.roofs, "roof"),
            (data.walls, "wall"),
            (data.floors, "floor"),
            (data.main_heating, "main_heating"),
        ]
        for elements, etype in list_elements:
            _insert_energy_elements(session, elements, etype, epc_property_id)

        # Single elements that may be absent (None), with their label.
        optional_elements: List[Tuple[Optional[EnergyElement], str]] = [
            (data.window, "window"),
            (data.lighting, "lighting"),
            (data.hot_water, "hot_water"),
            (data.secondary_heating, "secondary_heating"),
            (data.main_heating_controls, "main_heating_controls"),
        ]
        for el, etype in optional_elements:
            _insert_optional_energy_element(session, el, etype, epc_property_id)

        if data.sap_flat_details is not None:
            session.add(
                EpcFlatDetailsModel.from_domain(data.sap_flat_details, epc_property_id)
            )

    print(f"epc_property_id={epc_property_id}")
    print(f"address: {data.address_line_1}, {data.post_town}, {data.postcode}")


if __name__ == "__main__":
    # Optional first argument selects the PDF; default keeps the old behaviour.
    run(sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_PDF_PATH)